diff --git a/c_cxx/README.md b/c_cxx/README.md
index 9c1cc9b57..7267fc179 100644
--- a/c_cxx/README.md
+++ b/c_cxx/README.md
@@ -1,4 +1,12 @@
-This directory contains a few C/C++ sample applications for demonstrating onnxruntime usage:
+# ORT Tutorials - C/C++ Samples
+
+There is a suite of C/C++ samples in the [ort_tutorial directory](./ort_tutorial).
+All of these samples aim to rely solely on cross-vendor ONNX Runtime APIs and should run on any platform.
+Each sample showcases a specific feature of the ONNX Runtime API and illustrates how and why you should use it. The minimum ONNX Runtime version for these samples is 1.23.0.
+
+## Other samples
+
+This directory contains a few other C/C++ sample applications to demonstrate more specific onnxruntime usage:
 
 1. (Windows and Linux) fns_candy_style_transfer: A C application that uses the FNS-Candy style transfer model to re-style images. It is written purely in C, no C++.
 2. (Windows only) MNIST: A windows GUI application for doing handwriting recognition
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/CMakeLists.txt b/c_cxx/ort_tutorial/10_ep-device-selection/CMakeLists.txt
new file mode 100644
index 000000000..665914603
--- /dev/null
+++ b/c_cxx/ort_tutorial/10_ep-device-selection/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.20)
+project(winai-samples)
+
+# NOTE(review): onnxruntimesetup presumably provides onnxruntime_interface,
+# copy_file_to_bin_dir() and RUNTIME_DIRECTORY used below - confirm in ../cmake.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
+include(onnxruntimesetup)
+
+add_executable(ep-device-selection main.cpp utils.cpp lodepng/lodepng.cpp)
+
+set_target_properties(ep-device-selection PROPERTIES
+    CXX_STANDARD 20
+    CXX_STANDARD_REQUIRED ON
+    CXX_EXTENSIONS OFF
+)
+target_link_libraries(ep-device-selection PRIVATE onnxruntime_interface)
+# The vendored PNG codec is built from lodepng/lodepng.cpp, so expose that folder.
+target_include_directories(ep-device-selection PRIVATE lodepng)
+
+set(DEPTH_ANYTHING_FP16_ONNX "depth_anything_v2_torch.float16_240.onnx")
+
+# Presumably stages the model and the default input image next to the binary.
+copy_file_to_bin_dir(${DEPTH_ANYTHING_FP16_ONNX})
+copy_file_to_bin_dir(Input.png)
+
+# Expose the model file name to the sources as the MODEL_FILE string literal.
+target_compile_definitions(ep-device-selection
+    PRIVATE MODEL_FILE="${DEPTH_ANYTHING_FP16_ONNX}")
+
+# Emit every build artifact of this target into RUNTIME_DIRECTORY.
+set_target_properties(ep-device-selection
+    PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+    LIBRARY_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+    RUNTIME_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+)
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/Input.png b/c_cxx/ort_tutorial/10_ep-device-selection/Input.png
new file mode 100644
index 000000000..3d413cfc2
Binary files /dev/null and b/c_cxx/ort_tutorial/10_ep-device-selection/Input.png differ
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/README.md b/c_cxx/ort_tutorial/10_ep-device-selection/README.md
new file mode 100644
index 000000000..463957541
--- /dev/null
+++ b/c_cxx/ort_tutorial/10_ep-device-selection/README.md
@@ -0,0 +1,55 @@
+# EP Device Selection
+
+Since version 1.23.0, ONNX Runtime provides an execution-provider-independent way of querying and selecting
+inference devices. This typically involves 3 steps.
+
+- 1. Registration of execution provider libraries
+```cpp
+  auto env = Ort::Env(ORT_LOGGING_LEVEL_WARNING);
+  env.RegisterExecutionProviderLibrary("openvino", ORT_TSTR("onnxruntime_providers_openvino.dll"));
+  env.RegisterExecutionProviderLibrary("qnn", ORT_TSTR("onnxruntime_providers_qnn.dll"));
+  env.RegisterExecutionProviderLibrary("nv_tensorrt_rtx", ORT_TSTR("onnxruntime_providers_nv_tensorrt_rtx.dll"));
+```
+
+- 2. Querying and selecting Execution Provider (EP) Devices
+
+```cpp
+  auto ep_devices = env.GetEpDevices();
+  auto selected_devices = my_ep_selection_function(ep_devices);
+
+  Ort::SessionOptions session_options;
+  session_options.AppendExecutionProvider_V2(env, selected_devices, ep_options);
+  // Optionally, set device policy. E.g. OrtExecutionProviderDevicePolicy_PREFER_GPU, OrtExecutionProviderDevicePolicy_PREFER_NPU, OrtExecutionProviderDevicePolicy_MAX_PERFORMANCE
+  session_options.SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy_PREFER_GPU);
+```
+- 3. Use the session options to create an inference session
+
+```cpp
+  Ort::Session session(env, ORT_TSTR("path/to/model.onnx"), session_options);
+```
+
+
+## Building the sample
+
+`cmake -B build -S . -DONNX_RUNTIME_PATH=<path/to/onnxruntime> -DTRTRTX_RUNTIME_PATH=<path/to/TRTRTX/libs> && cmake --build build --config Release`
+
+Then run
+```
+./build/Release/ep-device-selection -i ./Input.png -o ./output.png
+```
+
+Run 
+
+```
+./build/Release/ep-device-selection -h
+```
+to see the other available command line options that influence device selection.
+
+## Model
+
+The ONNX file in this folder was generated using code from https://github.com/DepthAnything/Depth-Anything-V2 (Apache 2.0)
+with weights from https://huggingface.co/depth-anything/Depth-Anything-V2-Small/ (Apache 2.0).
+
+## Dependencies
+
+This sample vendors a copy of https://github.com/lvandeve/lodepng (Zlib license)
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/argparsing.h b/c_cxx/ort_tutorial/10_ep-device-selection/argparsing.h
new file mode 100644
index 000000000..f6ca268c0
--- /dev/null
+++ b/c_cxx/ort_tutorial/10_ep-device-selection/argparsing.h
@@ -0,0 +1,160 @@
+#pragma once
+#include "utils.h"
+#include <onnxruntime/core/graph/constants.h>
+#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <functional>
+#include <string>
+#include <string_view>
+
+/// Options collected from the command line by parse_args().
+struct Opts {
+  std::string input_image;   ///< Path of the PNG to read.
+  std::string output_image;  ///< Path of the PNG to write.
+  std::string select_vendor; ///< Vendor to select a device of; empty if not given.
+  std::string select_ep;     ///< Execution provider name to select devices by.
+  bool enableEpContext = true;
+  OrtExecutionProviderDevicePolicy ep_device_policy{OrtExecutionProviderDevicePolicy_PREFER_GPU};
+};
+
+struct ArgumentSpec { // Describes one command-line option handled by parse_args().
+  const char *name;       // Long flag, compared verbatim against argv (e.g. "--input").
+  const char *short_name; // Short flag; a null pointer is treated as "no short flag".
+  const char *help;       // Text shown in the usage listing.
+  int num_args;           // Number of argv entries that must follow the flag.
+  std::function<bool(int)> lambda; // Called with the flag's argv index; false aborts parsing.
+};
+
+/// Parses the command line into an Opts value. Invalid or incomplete arguments,
+/// a missing input image or a missing output directory are reported via LOG and
+/// end the process with EXIT_FAILURE; "--help"/"-h" prints the usage and exits
+/// with EXIT_SUCCESS. The input/output paths default to Input.png/output.png in
+/// the directory of the executable.
+static Opts parse_args(int argc, char **argv) {
+  using namespace std::string_view_literals;
+  Opts opts;
+  // Stores the value that follows the flag at argv[i]; a value that looks like
+  // another flag (leading '-') is rejected.
+  auto read_value = [&](int i, std::string &dst, const char *what) {
+    dst = argv[i + 1];
+    if (dst.starts_with("-")) {
+      LOG("{} can't start with -: \"{}\"", what, dst.c_str());
+      return false;
+    }
+    return true;
+  };
+  auto arg_specs = std::array{
+      // clang-format off
+    ArgumentSpec{
+      "--input", "-i", "Path to input image (*.png)", 1, [&](int i) {
+        return read_value(i, opts.input_image, "Path to input image");
+      }},
+    ArgumentSpec{
+      "--output", "-o", "Path where to save output image (*.png)", 1, [&](int i) {
+        return read_value(i, opts.output_image, "Path to output image");
+      }},
+    ArgumentSpec{
+      "--select-vendor", "-f", "Select device of provided vendor.", 1, [&](int i) {
+        return read_value(i, opts.select_vendor, "Vendor");
+      }},
+    ArgumentSpec{
+      // "-e": the short flag must differ from --select-vendor's "-f", because the
+      // first matching spec wins and "-f" would never reach this option.
+      "--select-ep", "-e", "Select devices that support a specific execution provider. "
+        "See https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/graph/constants.h for EP names."
+        , 1, [&](int i) {
+        return read_value(i, opts.select_ep, "Execution provider");
+      }},
+    ArgumentSpec{
+      "--ep-device-policy", "-p", "Set a EP device policy: e.g. prefer-cpu, prefer-gpu, prefer-npu, max-performance, max-efficiency, min-overall-power", 1, [&](int i) {
+        if(argv[i+1] == "prefer-cpu"sv) {
+          opts.ep_device_policy = OrtExecutionProviderDevicePolicy_PREFER_CPU;
+        } else if(argv[i+1] == "prefer-gpu"sv) {
+          opts.ep_device_policy = OrtExecutionProviderDevicePolicy_PREFER_GPU;
+        } else if(argv[i+1] == "prefer-npu"sv) {
+          opts.ep_device_policy = OrtExecutionProviderDevicePolicy_PREFER_NPU;
+        } else if(argv[i+1] == "max-performance"sv) {
+          opts.ep_device_policy = OrtExecutionProviderDevicePolicy_MAX_PERFORMANCE;
+        } else if(argv[i+1] == "max-efficiency"sv) {
+          opts.ep_device_policy = OrtExecutionProviderDevicePolicy_MAX_EFFICIENCY;
+        } else if(argv[i+1] == "min-overall-power"sv) {
+          opts.ep_device_policy = OrtExecutionProviderDevicePolicy_MIN_OVERALL_POWER;
+        } else {
+          LOG("Invalid execution provider policy: \"{}\"! Choose among prefer-cpu, prefer-gpu, prefer-npu, max-performance, max-efficiency, min-overall-power", argv[i+1]);
+          return false;
+        }
+        return true;
+      }}
+      // clang-format on
+  };
+  auto print_usage = [&] {
+    LOG("");
+    LOG("Usage:");
+    LOG("{} <options>", argv[0]);
+    LOG("\t--help -h    Print this help text and exit.");
+    for (auto &spec : arg_specs) {
+      if (spec.short_name) {
+        LOG("\t{} {}    {}", spec.name, spec.short_name, spec.help);
+      } else {
+        LOG("\t{}    {}", spec.name, spec.help);
+      }
+    }
+  };
+  for (int i = 1; i < argc; i++) {
+    // Asking for help is not an error: print the usage and exit successfully.
+    if (argv[i] == "--help"sv || argv[i] == "-h"sv) {
+      print_usage();
+      exit(EXIT_SUCCESS);
+    }
+    bool arg_found = false;
+    for (auto &spec : arg_specs) {
+      if (std::strcmp(spec.name, argv[i]) == 0 ||
+          (spec.short_name && std::strcmp(spec.short_name, argv[i]) == 0)) {
+        if (i + spec.num_args < argc) {
+          bool ok = spec.lambda(i);
+          if (!ok) {
+            LOG("Failed to parse arguments for {}!", spec.name);
+            exit(EXIT_FAILURE);
+          }
+          arg_found = true;
+          i += spec.num_args;
+          break;
+        } else {
+          LOG("Not enough arguments for {} specified!", spec.name);
+          exit(EXIT_FAILURE);
+        }
+      }
+    }
+    if (!arg_found) {
+      auto arg = argv[i];
+      LOG("Unknown argument: {}", arg);
+      print_usage();
+      exit(EXIT_FAILURE);
+    }
+  }
+  if (opts.input_image.empty()) {
+    opts.input_image = (get_executable_path().parent_path() / "Input.png").string();
+  }
+  if (opts.output_image.empty()) {
+    opts.output_image = (get_executable_path().parent_path() / "output.png").string();
+  }
+  if (!std::filesystem::is_regular_file(opts.input_image)) {
+    LOG("Please make sure that provided input image path exists: \"{}\"!", opts.input_image.c_str());
+    print_usage();
+    exit(EXIT_FAILURE);
+  }
+  // A bare file name such as "out.png" has an empty parent path; that means the
+  // current working directory, not a missing directory.
+  auto output_dir = std::filesystem::path(opts.output_image).parent_path();
+  if (output_dir.empty()) output_dir = ".";
+  if (!std::filesystem::is_directory(output_dir)) {
+    LOG("Please make sure that the parent directory of the provided output "
+        "path exists: \"{}\"!", opts.output_image.c_str());
+    print_usage();
+    exit(EXIT_FAILURE);
+  }
+  return opts;
+}
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/lodepng/lodepng.cpp b/c_cxx/ort_tutorial/10_ep-device-selection/lodepng/lodepng.cpp
new file mode 100644
index 000000000..43af8e73c
--- /dev/null
+++ b/c_cxx/ort_tutorial/10_ep-device-selection/lodepng/lodepng.cpp
@@ -0,0 +1,6234 @@
+/*
+LodePNG version 20170917
+
+Copyright (c) 2005-2017 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+
+/*
+The manual and changelog are in the header file "lodepng.h"
+Rename this file to lodepng.cpp to use it for C++, or to lodepng.c to use it for C.
+*/
+
+#include "lodepng.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1310) /*Visual Studio: A few warning types are not desired here.*/
+#pragma warning( disable : 4244 ) /*implicit conversions: not warned by gcc -Wall -Wextra and requires too much casts*/
+#pragma warning( disable : 4996 ) /*VS does not like fopen, but fopen_s is not standard C so unusable here*/
+#endif /*_MSC_VER */
+
+const char* LODEPNG_VERSION_STRING = "20170917";
+
+/*
+This source file is built up in the following large parts. The code sections
+with the "LODEPNG_COMPILE_" #defines divide this up further in an intermixed way.
+-Tools for C and common code for PNG and Zlib
+-C Code for Zlib (huffman, deflate, ...)
+-C Code for PNG (file format chunks, adam7, PNG filters, color conversions, ...)
+-The C++ wrapper around all of the above
+*/
+
+/*The malloc, realloc and free functions defined here with "lodepng_" in front
+of the name, so that you can easily change them to others related to your
+platform if needed. Everything else in the code calls these. Pass
+-DLODEPNG_NO_COMPILE_ALLOCATORS to the compiler, or comment out
+#define LODEPNG_COMPILE_ALLOCATORS in the header, to disable the ones here and
+define them in your own project's source files without needing to change
+lodepng source code. Don't forget to remove "static" if you copypaste them
+from here.*/
+
+#ifdef LODEPNG_COMPILE_ALLOCATORS
+static void* lodepng_malloc(size_t size) /*default allocation hook: forwards to malloc*/
+{
+    return malloc(size);
+}
+
+static void* lodepng_realloc(void* ptr, size_t new_size) /*default reallocation hook: forwards to realloc*/
+{
+    return realloc(ptr, new_size);
+}
+
+static void lodepng_free(void* ptr) /*default deallocation hook: forwards to free*/
+{
+    free(ptr);
+}
+#else /*LODEPNG_COMPILE_ALLOCATORS*/
+void* lodepng_malloc(size_t size);
+void* lodepng_realloc(void* ptr, size_t new_size);
+void lodepng_free(void* ptr);
+#endif /*LODEPNG_COMPILE_ALLOCATORS*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // Tools for C, and common code for PNG and Zlib.                       // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Often in case of an error a value is assigned to a variable and then it breaks
+out of a loop (to go to the cleanup phase of a function). This macro does that.
+It makes the error handling code shorter and more readable. Like the macros below
+it expands to a bare {...} block, so it must not be followed by an else branch.
+Example: if(!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83);
+*/
+#define CERROR_BREAK(errorvar, code)\
+{\
+  errorvar = code;\
+  break;\
+}
+
+/*version of CERROR_BREAK that assumes the common case where the error variable is named "error"*/
+#define ERROR_BREAK(code) CERROR_BREAK(error, code)
+
+/*Set error var to the error code, and return it. Note that code is expanded (evaluated) twice.*/
+#define CERROR_RETURN_ERROR(errorvar, code)\
+{\
+  errorvar = code;\
+  return code;\
+}
+
+/*Evaluate call once; if it yields a nonzero error, return it from the enclosing function.*/
+#define CERROR_TRY_RETURN(call)\
+{\
+  unsigned error = call;\
+  if(error) return error;\
+}
+
+/*Set error var to the error code, and return from the void function.*/
+#define CERROR_RETURN(errorvar, code)\
+{\
+  errorvar = code;\
+  return;\
+}
+
+/*
+About uivector, ucvector and string:
+-All of them wrap dynamic arrays or text strings in a similar way.
+-LodePNG was originally written in C++. The vectors replace the std::vectors that were used in the C++ version.
+-The string tools are made to avoid problems with compilers that declare things like strncat as deprecated.
+-They're not used in the interface, only internally in this file as static functions.
+-As with many other structs in this file, the init and cleanup functions serve as ctor and dtor.
+*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*dynamic vector of unsigned ints*/
+typedef struct uivector
+{
+    unsigned* data;
+    size_t size; /*size in number of unsigned ints (elements, not bytes)*/
+    size_t allocsize; /*allocated size in bytes*/
+} uivector;
+
+static void uivector_cleanup(void* p) /*p must point to a uivector: frees its data and resets it to empty*/
+{
+    ((uivector*)p)->size = ((uivector*)p)->allocsize = 0;
+    lodepng_free(((uivector*)p)->data);
+    ((uivector*)p)->data = NULL;
+}
+
+/*Ensures a capacity of at least allocsize BYTES (not elements). Returns 1 on
+success, 0 if the reallocation failed ==> nothing done.*/
+static unsigned uivector_reserve(uivector* p, size_t allocsize)
+{
+    size_t grown;
+    void* block;
+
+    if (allocsize <= p->allocsize) return 1; /*already large enough*/
+    /*over-allocate by 50% unless the request more than doubles the capacity*/
+    grown = (allocsize > p->allocsize * 2) ? allocsize : (allocsize * 3 / 2);
+    block = lodepng_realloc(p->data, grown);
+    if (!block) return 0; /*error: not enough memory*/
+    p->data = (unsigned*)block;
+    p->allocsize = grown;
+    return 1;
+}
+
+/*sets the size to size elements; returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_resize(uivector* p, size_t size)
+{
+    const unsigned reserved = uivector_reserve(p, size * sizeof(unsigned));
+    if (reserved) p->size = size;
+    return reserved;
+}
+
+/*resizes to size elements and gives all new elements the value; returns 1 if success, 0 if failure*/
+static unsigned uivector_resizev(uivector* p, size_t size, unsigned value)
+{
+    size_t pos = p->size; /*index of the first element that did not exist before*/
+    if (!uivector_resize(p, size)) return 0;
+    while (pos < size) p->data[pos++] = value;
+    return 1;
+}
+
+static void uivector_init(uivector* p) /*makes p an empty vector that holds no memory*/
+{
+    p->size = p->allocsize = 0;
+    p->data = NULL;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*appends c as the new last element; returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_push_back(uivector* p, unsigned c)
+{
+    const unsigned grown = uivector_resize(p, p->size + 1);
+    if (grown) p->data[p->size - 1] = c;
+    return grown;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*dynamic vector of unsigned chars*/
+typedef struct ucvector
+{
+    unsigned char* data; /*the bytes; may be a buffer taken over through ucvector_init_buffer*/
+    size_t size; /*used size in bytes*/
+    size_t allocsize; /*allocated size in bytes*/
+} ucvector;
+
+/*Ensures a capacity of at least allocsize bytes. Returns 1 on success, 0 if
+the reallocation failed ==> nothing done.*/
+static unsigned ucvector_reserve(ucvector* p, size_t allocsize)
+{
+    size_t grown;
+    void* block;
+
+    if (allocsize <= p->allocsize) return 1; /*already large enough*/
+    /*over-allocate by 50% unless the request more than doubles the capacity*/
+    grown = (allocsize > p->allocsize * 2) ? allocsize : (allocsize * 3 / 2);
+    block = lodepng_realloc(p->data, grown);
+    if (!block) return 0; /*error: not enough memory*/
+    p->data = (unsigned char*)block;
+    p->allocsize = grown;
+    return 1;
+}
+
+/*sets the used size to size bytes; returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_resize(ucvector* p, size_t size)
+{
+    const unsigned reserved = ucvector_reserve(p, size * sizeof(unsigned char));
+    if (reserved) p->size = size;
+    return reserved;
+}
+
+#ifdef LODEPNG_COMPILE_PNG
+
+static void ucvector_cleanup(void* p) /*p must point to a ucvector: frees its data and resets it to empty*/
+{
+    ((ucvector*)p)->size = ((ucvector*)p)->allocsize = 0;
+    lodepng_free(((ucvector*)p)->data);
+    ((ucvector*)p)->data = NULL;
+}
+
+static void ucvector_init(ucvector* p) /*makes p an empty vector that holds no memory*/
+{
+    p->size = p->allocsize = 0;
+    p->data = NULL;
+}
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*Makes p take over an existing buffer of size bytes without copying it. You can
+convert back to buffer&size afterwards; cleanup is then not needed.*/
+static void ucvector_init_buffer(ucvector* p, unsigned char* buffer, size_t size)
+{
+    p->size = p->allocsize = size;
+    p->data = buffer;
+}
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#if (defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)
+/*appends c as the new last byte; returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_push_back(ucvector* p, unsigned char c)
+{
+    const unsigned grown = ucvector_resize(p, p->size + 1);
+    if (grown) p->data[p->size - 1] = c;
+    return grown;
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*Reallocates *out to hold size characters plus a terminating 0. Returns 1 on
+success; on allocation failure returns 0 and leaves *out untouched.*/
+static unsigned string_resize(char** out, size_t size)
+{
+    char* grown = (char*)lodepng_realloc(*out, size + 1);
+    if (!grown) return 0;
+
+    grown[size] = 0; /*null termination char at the new length*/
+    *out = grown;
+    return 1;
+}
+
+/*init a char* for use as string: allocates the empty string (*out stays NULL if that allocation fails)*/
+static void string_init(char** out)
+{
+    *out = NULL;
+    string_resize(out, 0);
+}
+
+/*free the string again and set *out to NULL*/
+static void string_cleanup(char** out)
+{
+    lodepng_free(*out);
+    *out = NULL;
+}
+
+/*Replaces the contents of *out with a copy of the 0-terminated string in. If
+the reallocation fails, nothing is copied and *out is left as it was.*/
+static void string_set(char** out, const char* in)
+{
+    const size_t length = strlen(in);
+    size_t pos;
+
+    if (!string_resize(out, length)) return;
+    /*the terminating 0 was already written by string_resize*/
+    for (pos = 0; pos < length; ++pos) (*out)[pos] = in[pos];
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+unsigned lodepng_read32bitInt(const unsigned char* buffer) /*reads buffer[0..3] as a big endian 32-bit value*/
+{ /*each byte is cast before shifting: a promoted (signed) int << 24 overflows for bytes >= 0x80*/
+    return ((unsigned)buffer[0] << 24) | ((unsigned)buffer[1] << 16) | ((unsigned)buffer[2] << 8) | (unsigned)buffer[3];
+}
+
+#if defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)
+/*stores value as 4 big endian bytes; buffer must have at least 4 allocated bytes available*/
+static void lodepng_set32bitInt(unsigned char* buffer, unsigned value)
+{
+    int byte;
+
+    /*walk from the least significant byte (buffer[3]) up to the most significant one (buffer[0])*/
+    for (byte = 3; byte >= 0; --byte, value >>= 8) buffer[byte] = (unsigned char)(value & 0xff);
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+static void lodepng_add32bitInt(ucvector* buffer, unsigned value) /*appends value as 4 big endian bytes*/
+{
+    /*if growing fails the value is dropped: writing anyway would overwrite bytes before the unchanged end*/
+    if (!ucvector_resize(buffer, buffer->size + 4)) return; /*todo: give error if resize failed*/
+    lodepng_set32bitInt(&buffer->data[buffer->size - 4], value);
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / File IO                                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DISK
+
+/* returns negative value on error. This should be pure C compatible, so no fstat. */
+static long lodepng_filesize(const char* filename)
+{
+    FILE* file;
+    long size;
+    file = fopen(filename, "rb");
+    if (!file) return -1;
+
+    if (fseek(file, 0, SEEK_END) != 0)
+    {
+        fclose(file);
+        return -1;
+    }
+
+    size = ftell(file);
+    /* It may give LONG_MAX as directory size, this is invalid for us. */
+    if (size == LONG_MAX) size = -1;
+
+    fclose(file);
+    return size;
+}
+
+/*Loads the file into out, which must already have the correct allocated size.
+Returns 0 on success, 78 if opening fails or fewer than size bytes were read.*/
+static unsigned lodepng_buffer_file(unsigned char* out, size_t size, const char* filename)
+{
+    size_t got;
+    FILE* file = fopen(filename, "rb");
+
+    if (!file) return 78;
+
+    got = fread(out, 1, size, file);
+    fclose(file);
+
+    return (got == size) ? 0 : 78;
+}
+
+/*Loads the whole file into a buffer allocated with lodepng_malloc: *out gets the
+buffer, *outsize its size. Returns 0, 78 (cannot read file) or 83 (alloc fail).*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename)
+{
+    const long length = lodepng_filesize(filename);
+    if (length < 0) return 78;
+    *outsize = (size_t)length;
+    *out = (unsigned char*)lodepng_malloc((size_t)length);
+    if (length > 0 && !(*out)) return 83; /*the above malloc failed*/
+    return lodepng_buffer_file(*out, (size_t)length, filename);
+}
+
+/*write given buffer to the file, overwriting the file, it doesn't append to it.
+Returns 0 on success, 79 if the file cannot be opened or not written completely.*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename)
+{
+    size_t written;
+    FILE* file = fopen(filename, "wb");
+    if (!file) return 79;
+    written = fwrite(buffer, 1, buffersize, file);
+    return (fclose(file) != 0 || written != buffersize) ? 79 : 0; /*fclose flushes, so it can fail too*/
+}
+
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of common code and tools. Begin of Zlib related code.            // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends one bit at bit position *bitpointer and advances it. TODO: this ignores potential out of memory errors*/
+#define addBitToStream(/*size_t**/ bitpointer, /*ucvector**/ bitstream, /*unsigned char*/ bit)\
+{\
+  /*add a new byte at the end*/\
+  if(((*bitpointer) & 7) == 0) ucvector_push_back(bitstream, (unsigned char)0);\
+  /*earlier bit of huffman code is in a lesser significant bit of an earlier byte*/\
+  (bitstream->data[bitstream->size - 1]) |= (bit << ((*bitpointer) & 0x7));\
+  ++(*bitpointer);\
+}
+
+static void addBitsToStream(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits) /*adds the nbits lowest bits of value, least significant first*/
+{
+    size_t shift;
+    for (shift = 0; shift < nbits; ++shift) addBitToStream(bitpointer, bitstream, (unsigned char)((value >> shift) & 1));
+}
+
+static void addBitsToStreamReversed(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits) /*adds the nbits lowest bits of value, most significant first*/
+{
+    size_t left; /*number of bits still to write; the next one is bit (left - 1)*/
+    for (left = nbits; left != 0; --left) addBitToStream(bitpointer, bitstream, (unsigned char)((value >> (left - 1)) & 1));
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+#define READBIT(bitpointer, bitstream) ((bitstream[bitpointer >> 3] >> (bitpointer & 0x7)) & (unsigned char)1) /*bit number bitpointer, counting from the least significant bit of each byte*/
+
+static unsigned char readBitFromStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+    /*fetch the bit at the current position, then step to the next one*/
+    const size_t pos = (*bitpointer)++;
+    return (unsigned char)(READBIT(pos, bitstream));
+}
+
+/*Reads nbits bits, least significant bit first, and advances *bitpointer by
+nbits. No bounds check on bitstream is done here.*/
+static unsigned readBitsFromStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+    unsigned value = 0, shift = 0;
+    size_t pos = *bitpointer;
+    for (; shift != nbits; ++shift, ++pos) value += ((unsigned)READBIT(pos, bitstream)) << shift;
+    *bitpointer = pos;
+    return value;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflate - Huffman                                                      / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#define FIRST_LENGTH_CODE_INDEX 257 /*symbol value of the first length code*/
+#define LAST_LENGTH_CODE_INDEX 285 /*symbol value of the last length code*/
+/*256 literals, the end code, some length codes, and 2 unused codes*/
+#define NUM_DEFLATE_CODE_SYMBOLS 288
+/*the distance codes have their own symbols, 30 used, 2 unused*/
+#define NUM_DISTANCE_SYMBOLS 32
+/*the code length codes. 0-15: code lengths, 16: copy previous 3-6 times, 17: 3-10 zeros, 18: 11-138 zeros*/
+#define NUM_CODE_LENGTH_CODES 19
+
+/*the base lengths represented by codes 257-285 (one entry per code, in order)*/
+static const unsigned LENGTHBASE[29]
+= { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59,
+67, 83, 99, 115, 131, 163, 195, 227, 258 };
+
+/*the number of extra bits used by codes 257-285 (their value is added to the base length)*/
+static const unsigned LENGTHEXTRA[29]
+= { 0, 0, 0, 0, 0, 0, 0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+4,  4,  4,   4,   5,   5,   5,   5,   0 };
+
+/*the base backwards distances (the bits of distance codes appear after length codes and use their own huffman tree)*/
+static const unsigned DISTANCEBASE[30]
+= { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
+769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577 };
+
+/*the number of extra bits of backwards distances (their value is added to the base distance)*/
+static const unsigned DISTANCEEXTRA[30]
+= { 0, 0, 0, 0, 1, 1, 2,  2,  3,  3,  4,  4,  5,  5,   6,   6,   7,   7,   8,
+8,    9,    9,   10,   10,   11,   11,   12,    12,    13,    13 };
+
+/*the order in which "code length alphabet code lengths" are stored, out of this
+the huffman tree of the dynamic huffman tree lengths is generated*/
+static const unsigned CLCL_ORDER[NUM_CODE_LENGTH_CODES]
+= { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Huffman tree struct, containing multiple representations of the tree
+*/
+typedef struct HuffmanTree
+{
+    unsigned* tree2d; /*decoder representation with 2 entries per node, built by HuffmanTree_make2DTree*/
+    unsigned* tree1d; /*the code value of each symbol*/
+    unsigned* lengths; /*the lengths of the codes of the 1d-tree*/
+    unsigned maxbitlen; /*maximum number of bits a single code can get*/
+    unsigned numcodes; /*number of symbols in the alphabet = number of codes*/
+} HuffmanTree;
+
+/*function used for debug purposes to draw the tree in ascii art with C++*/
+/*
+static void HuffmanTree_draw(HuffmanTree* tree)
+{
+std::cout << "tree. length: " << tree->numcodes << " maxbitlen: " << tree->maxbitlen << std::endl;
+for(size_t i = 0; i != tree->tree1d.size; ++i)
+{
+if(tree->lengths.data[i])
+std::cout << i << " " << tree->tree1d.data[i] << " " << tree->lengths.data[i] << std::endl;
+}
+std::cout << std::endl;
+}*/
+
+/*Sets the three array pointers to 0, so that HuffmanTree_cleanup only ever frees
+null or allocated pointers. maxbitlen and numcodes are not touched here.*/
+static void HuffmanTree_init(HuffmanTree* tree)
+{
+    tree->tree2d = tree->tree1d = tree->lengths = 0;
+}
+
+static void HuffmanTree_cleanup(HuffmanTree* tree) /*frees the three arrays; the pointers are not reset*/
+{
+    lodepng_free(tree->lengths);
+    lodepng_free(tree->tree1d);
+    lodepng_free(tree->tree2d);
+}
+
+/*builds tree2d, the tree representation used by the decoder, from tree1d and lengths. return value is error: 0, 83 or 55*/
+static unsigned HuffmanTree_make2DTree(HuffmanTree* tree)
+{
+    unsigned nodefilled = 0; /*up to which node it is filled*/
+    unsigned treepos = 0; /*position in the tree (1 of the numcodes columns)*/
+    unsigned n, i;
+
+    tree->tree2d = (unsigned*)lodepng_malloc(tree->numcodes * 2 * sizeof(unsigned));
+    if (!tree->tree2d) return 83; /*alloc fail*/
+
+    /*
+    convert tree1d[] to tree2d[][]. In the 2D array, a value of 32767 means
+    uninited, a value >= numcodes is an address to another bit, a value < numcodes
+    is a code. The 2 rows are the 2 possible bit values (0 or 1), there are as
+    many columns as codes - 1.
+    A good huffman tree has N * 2 - 1 nodes, of which N - 1 are internal nodes.
+    Here, the internal nodes are stored (what their 0 and 1 option point to).
+    There is only memory for such good tree currently, if there are more nodes
+    (due to too long length codes), error 55 will happen
+    */
+    for (n = 0; n < tree->numcodes * 2; ++n)
+    {
+        tree->tree2d[n] = 32767; /*32767 here means the tree2d isn't filled there yet*/
+    }
+
+    for (n = 0; n < tree->numcodes; ++n) /*the codes*/
+    {
+        for (i = 0; i != tree->lengths[n]; ++i) /*the bits for this code, most significant bit first*/
+        {
+            unsigned char bit = (unsigned char)((tree->tree1d[n] >> (tree->lengths[n] - i - 1)) & 1);
+            /*oversubscribed, see comment in lodepng_error_text*/
+            if (treepos > 2147483647 || treepos + 2 > tree->numcodes) return 55;
+            if (tree->tree2d[2 * treepos + bit] == 32767) /*not yet filled in*/
+            {
+                if (i + 1 == tree->lengths[n]) /*last bit*/
+                {
+                    tree->tree2d[2 * treepos + bit] = n; /*put the current code in it*/
+                    treepos = 0; /*start at the root again for the next code*/
+                }
+                else
+                {
+                    /*put address of the next step in here, first that address has to be found of course
+                    (it's just nodefilled + 1)...*/
+                    ++nodefilled;
+                    /*addresses encoded with numcodes added to it*/
+                    tree->tree2d[2 * treepos + bit] = nodefilled + tree->numcodes;
+                    treepos = nodefilled;
+                }
+            }
+            else treepos = tree->tree2d[2 * treepos + bit] - tree->numcodes; /*follow the existing address*/
+        }
+    }
+
+    for (n = 0; n < tree->numcodes * 2; ++n)
+    {
+        if (tree->tree2d[n] == 32767) tree->tree2d[n] = 0; /*remove possible remaining 32767's*/
+    }
+
+    return 0;
+}
+
+/*
+Shared back end of the ...makeFromLengths and ...makeFromFrequencies functions.
+The caller must already have filled in tree->numcodes, tree->lengths and
+tree->maxbitlen. This step allocates tree->tree1d, gives every symbol with a
+non-zero length its code value, and then hands over to HuffmanTree_make2DTree.
+Returns 0 on success, 83 on allocation failure, otherwise the result of
+HuffmanTree_make2DTree.
+*/
+static unsigned HuffmanTree_makeFromLengths2(HuffmanTree* tree)
+{
+    uivector lencount;  /*lencount.data[L]: how many symbols have code length L*/
+    uivector firstcode; /*firstcode.data[L]: next code value to hand out for length L*/
+    unsigned status = 0;
+    unsigned sym, len;
+
+    uivector_init(&lencount);
+    uivector_init(&firstcode);
+
+    tree->tree1d = (unsigned*)lodepng_malloc(tree->numcodes * sizeof(unsigned));
+    if (tree->tree1d == 0) status = 83; /*alloc fail*/
+
+    /*both tables are resized to maxbitlen + 1 entries, with 0 passed as the value argument*/
+    if (!uivector_resizev(&lencount, tree->maxbitlen + 1, 0)
+        || !uivector_resizev(&firstcode, tree->maxbitlen + 1, 0))
+    {
+        status = 83; /*alloc fail*/
+    }
+
+    if (status == 0)
+    {
+        /*pass 1: histogram of the code lengths*/
+        for (sym = 0; sym != tree->numcodes; ++sym)
+        {
+            lencount.data[tree->lengths[sym]] += 1;
+        }
+
+        /*pass 2: derive the starting code value of every length from the previous length*/
+        for (len = 1; len <= tree->maxbitlen; ++len)
+        {
+            firstcode.data[len] = (firstcode.data[len - 1] + lencount.data[len - 1]) << 1;
+        }
+
+        /*pass 3: hand out consecutive code values; symbols of length 0 get none*/
+        for (sym = 0; sym != tree->numcodes; ++sym)
+        {
+            unsigned bitcount = tree->lengths[sym];
+            if (bitcount == 0) continue;
+            tree->tree1d[sym] = firstcode.data[bitcount];
+            firstcode.data[bitcount] += 1;
+        }
+    }
+
+    uivector_cleanup(&lencount);
+    uivector_cleanup(&firstcode);
+
+    if (status != 0) return status;
+    return HuffmanTree_make2DTree(tree);
+}
+
+/*
+Builds the tree from a caller-supplied array of code lengths (as stored in the
+PNG file), following the Deflate rules. The lengths are copied into a freshly
+allocated tree->lengths, so bitlen stays owned by the caller.
+maxbitlen is the largest number of bits a code in this tree may have.
+Returns 0 on success, 83 on allocation failure, otherwise the error from
+HuffmanTree_makeFromLengths2.
+*/
+static unsigned HuffmanTree_makeFromLengths(HuffmanTree* tree, const unsigned* bitlen,
+    size_t numcodes, unsigned maxbitlen)
+{
+    unsigned* copy = (unsigned*)lodepng_malloc(numcodes * sizeof(unsigned));
+    unsigned k = 0;
+
+    tree->lengths = copy;
+    if (copy == 0) return 83; /*alloc fail*/
+
+    while (k != numcodes)
+    {
+        copy[k] = bitlen[k];
+        ++k;
+    }
+
+    tree->maxbitlen = maxbitlen;
+    tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+    return HuffmanTree_makeFromLengths2(tree);
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*BPM: Boundary Package Merge, see "A Fast and Space-Economical Algorithm for Length-Limited Coding",
+Jyrki Katajainen, Alistair Moffat, Andrew Turpin, 1995.*/
+
+/*
+chain node for boundary package merge.
+The same struct is also used for the array of leaves, where weight holds the
+symbol frequency and index holds the symbol number.
+*/
+typedef struct BPMNode
+{
+    int weight; /*the sum of all weights in this chain*/
+    unsigned index; /*index of this leaf node (called "count" in the paper)*/
+    struct BPMNode* tail; /*the next nodes in this chain (null if last)*/
+    int in_use; /*mark flag: cleared and set again by the garbage collection pass in bpmnode_create*/
+} BPMNode;
+
+/*lists of chains, together with the node pool all chains are allocated from*/
+typedef struct BPMLists
+{
+    /*memory pool*/
+    unsigned memsize; /*number of nodes in memory (and slots in freelist)*/
+    BPMNode* memory; /*the pool itself*/
+    unsigned numfree; /*number of valid entries currently in freelist*/
+    unsigned nextfree; /*index into freelist of the next node to hand out*/
+    BPMNode** freelist; /*pointers to the pool nodes that may be handed out*/
+    /*two heads of lookahead chains per list*/
+    unsigned listsize; /*number of lists, i.e. entries in chains0 and chains1*/
+    BPMNode** chains0; /*per list: the older of the two lookahead chain heads*/
+    BPMNode** chains1; /*per list: the most recently created chain head*/
+} BPMLists;
+
+/*
+Hands out one chain node from the pool owned by lists and fills in its weight,
+index and tail. When the free list is exhausted, all pool nodes that cannot be
+reached from any chains0/chains1 head are reclaimed first.
+*/
+static BPMNode* bpmnode_create(BPMLists* lists, int weight, unsigned index, BPMNode* tail)
+{
+    BPMNode* fresh;
+
+    if (lists->nextfree >= lists->numfree)
+    {
+        unsigned slot;
+        unsigned list;
+
+        /*phase 1: clear every mark*/
+        for (slot = 0; slot != lists->memsize; ++slot)
+        {
+            lists->memory[slot].in_use = 0;
+        }
+
+        /*phase 2: mark everything reachable from the two heads of each list*/
+        for (list = 0; list != lists->listsize; ++list)
+        {
+            BPMNode* walk = lists->chains0[list];
+            while (walk != 0)
+            {
+                walk->in_use = 1;
+                walk = walk->tail;
+            }
+            walk = lists->chains1[list];
+            while (walk != 0)
+            {
+                walk->in_use = 1;
+                walk = walk->tail;
+            }
+        }
+
+        /*phase 3: rebuild the free list from the unmarked nodes*/
+        lists->numfree = 0;
+        for (slot = 0; slot != lists->memsize; ++slot)
+        {
+            BPMNode* candidate = &lists->memory[slot];
+            if (candidate->in_use) continue;
+            lists->freelist[lists->numfree] = candidate;
+            lists->numfree += 1;
+        }
+        lists->nextfree = 0;
+    }
+
+    fresh = lists->freelist[lists->nextfree];
+    lists->nextfree += 1;
+
+    fresh->tail = tail;
+    fresh->index = index;
+    fresh->weight = weight;
+    return fresh;
+}
+
+/*
+Sorts leaves[0..num) by ascending weight. The sort is stable: leaves with equal
+weight keep their relative order.
+
+The normal path is a bottom-up merge sort that ping-pongs between leaves and a
+scratch buffer of the same size. This function returns void and therefore
+cannot report an allocation failure, so when the scratch buffer cannot be
+obtained it falls back to an in-place stable insertion sort instead of
+dereferencing a null pointer.
+*/
+static void bpmnode_sort(BPMNode* leaves, size_t num)
+{
+    BPMNode* mem;
+    size_t width, counter = 0;
+
+    if (num < 2) return; /*already sorted; also avoids a zero-sized allocation*/
+
+    mem = (BPMNode*)lodepng_malloc(sizeof(*leaves) * num);
+    if (!mem)
+    {
+        /*no scratch memory: stable insertion sort, slower but needs no allocation*/
+        size_t i;
+        for (i = 1; i < num; ++i)
+        {
+            BPMNode key = leaves[i];
+            size_t j = i;
+            /*strict '>' keeps equal weights in their original order*/
+            while (j > 0 && leaves[j - 1].weight > key.weight)
+            {
+                leaves[j] = leaves[j - 1];
+                --j;
+            }
+            leaves[j] = key;
+        }
+        return;
+    }
+
+    for (width = 1; width < num; width *= 2)
+    {
+        /*a is the source of this pass and b the destination; they swap every pass*/
+        BPMNode* a = (counter & 1) ? mem : leaves;
+        BPMNode* b = (counter & 1) ? leaves : mem;
+        size_t p;
+        for (p = 0; p < num; p += 2 * width)
+        {
+            /*merge the runs [p, q) and [q, r), both clamped to num*/
+            size_t q = (p + width > num) ? num : (p + width);
+            size_t r = (p + 2 * width > num) ? num : (p + 2 * width);
+            size_t i = p, j = q, k;
+            for (k = p; k < r; k++)
+            {
+                /*'<=' prefers the left run on ties, which is what makes the sort stable*/
+                if (i < q && (j >= r || a[i].weight <= a[j].weight)) b[k] = a[i++];
+                else b[k] = a[j++];
+            }
+        }
+        counter++;
+    }
+    /*after an odd number of passes the sorted data lives in the scratch buffer*/
+    if (counter & 1) memcpy(leaves, mem, sizeof(*leaves) * num);
+    lodepng_free(mem);
+}
+
+/*
+Boundary Package Merge step: adds one chain to list c.
+lists: the chain heads of every list plus the node pool new chains come from.
+leaves: the present symbols; the caller sorts them by weight before the first call.
+numpresent: the amount of leaves.
+c: index of the list that receives the new chain.
+num: the caller's iteration counter; it is only used to detect the final
+iteration, where the recursion into the lower lists is skipped.
+*/
+static void boundaryPM(BPMLists* lists, BPMNode* leaves, size_t numpresent, int c, int num)
+{
+    /*index of the current head = number of leaves already consumed by this list*/
+    unsigned lastindex = lists->chains1[c]->index;
+
+    if (c == 0)
+    {
+        /*the lowest list holds leaves only: take the next leaf, or do nothing when all are used*/
+        if (lastindex >= numpresent) return;
+        lists->chains0[c] = lists->chains1[c];
+        lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, 0);
+    }
+    else
+    {
+        /*sum of the weights of the head nodes of the previous lookahead chains.*/
+        int sum = lists->chains0[c - 1]->weight + lists->chains1[c - 1]->weight;
+        lists->chains0[c] = lists->chains1[c];
+        if (lastindex < numpresent && sum > leaves[lastindex].weight)
+        {
+            /*the next leaf is lighter than the package: add the leaf and keep the old tail*/
+            lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, lists->chains1[c]->tail);
+            return;
+        }
+        /*otherwise add the package; its tail is the current head of the list below*/
+        lists->chains1[c] = bpmnode_create(lists, sum, lastindex, lists->chains1[c - 1]);
+        /*in the end we are only interested in the chain of the last list, so no
+        need to recurse if we're at the last one (this gives measurable speedup)*/
+        if (num + 1 < (int)(2 * numpresent - 2))
+        {
+            /*the package used both lookahead chains of list c - 1, so create two new ones there*/
+            boundaryPM(lists, leaves, numpresent, c - 1, num);
+            boundaryPM(lists, leaves, numpresent, c - 1, num);
+        }
+    }
+}
+
+/*
+Computes length-limited Huffman code lengths from symbol frequencies using
+boundary package merge.
+lengths: output array of numcodes entries; every entry is written.
+frequencies: input array of numcodes entries; symbols with frequency 0 get length 0.
+numcodes: number of symbols.
+maxbitlen: upper bound on the produced code lengths.
+Returns 0 on success, 80 if numcodes is 0 or (1u << maxbitlen) is smaller than
+numcodes, and 83 on allocation failure.
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+    size_t numcodes, unsigned maxbitlen)
+{
+    unsigned error = 0;
+    unsigned i;
+    size_t numpresent = 0; /*number of symbols with non-zero frequency*/
+    BPMNode* leaves; /*the symbols, only those with > 0 frequency*/
+
+    if (numcodes == 0) return 80; /*error: a tree of 0 symbols is not supposed to be made*/
+    if ((1u << maxbitlen) < numcodes) return 80; /*error: represent all symbols*/
+
+    leaves = (BPMNode*)lodepng_malloc(numcodes * sizeof(*leaves));
+    if (!leaves) return 83; /*alloc fail*/
+
+    /*collect the present symbols: weight is the frequency, index the symbol number*/
+    for (i = 0; i != numcodes; ++i)
+    {
+        if (frequencies[i] > 0)
+        {
+            leaves[numpresent].weight = (int)frequencies[i];
+            leaves[numpresent].index = i;
+            ++numpresent;
+        }
+    }
+
+    for (i = 0; i != numcodes; ++i) lengths[i] = 0;
+
+    /*ensure at least two present symbols. There should be at least one symbol
+    according to RFC 1951 section 3.2.7. Some decoders incorrectly require two. To
+    make these work as well ensure there are at least two symbols. The
+    Package-Merge code below also doesn't work correctly if there's only one
+    symbol, it'd give it the theoretical 0 bits but in practice zlib wants 1 bit.
+    The second symbol is only added when the lengths array has room for it:
+    with numcodes == 1 there is no lengths[1] to write to.*/
+    if (numpresent == 0)
+    {
+        lengths[0] = 1; /*note that for RFC 1951 section 3.2.7, only lengths[0] = 1 is needed*/
+        if (numcodes > 1) lengths[1] = 1;
+    }
+    else if (numpresent == 1)
+    {
+        lengths[leaves[0].index] = 1;
+        if (numcodes > 1) lengths[leaves[0].index == 0 ? 1 : 0] = 1;
+    }
+    else
+    {
+        BPMLists lists;
+        BPMNode* node;
+
+        bpmnode_sort(leaves, numpresent);
+
+        lists.listsize = maxbitlen;
+        lists.memsize = 2 * maxbitlen * (maxbitlen + 1);
+        lists.nextfree = 0;
+        lists.numfree = lists.memsize;
+        lists.memory = (BPMNode*)lodepng_malloc(lists.memsize * sizeof(*lists.memory));
+        lists.freelist = (BPMNode**)lodepng_malloc(lists.memsize * sizeof(BPMNode*));
+        lists.chains0 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+        lists.chains1 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+        if (!lists.memory || !lists.freelist || !lists.chains0 || !lists.chains1) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            for (i = 0; i != lists.memsize; ++i) lists.freelist[i] = &lists.memory[i];
+
+            /*the first two pool nodes become the two lightest leaves...*/
+            bpmnode_create(&lists, leaves[0].weight, 1, 0);
+            bpmnode_create(&lists, leaves[1].weight, 2, 0);
+
+            /*...and every list starts out with those two as its lookahead chains*/
+            for (i = 0; i != lists.listsize; ++i)
+            {
+                lists.chains0[i] = &lists.memory[0];
+                lists.chains1[i] = &lists.memory[1];
+            }
+
+            /*each boundaryPM call adds one chain to the last list, and we need 2 * numpresent - 2 chains.*/
+            for (i = 2; i != 2 * numpresent - 2; ++i) boundaryPM(&lists, leaves, numpresent, (int)maxbitlen - 1, (int)i);
+
+            /*every node on the final chain adds one bit to the first node->index leaves*/
+            for (node = lists.chains1[maxbitlen - 1]; node; node = node->tail)
+            {
+                for (i = 0; i != node->index; ++i) ++lengths[leaves[i].index];
+            }
+        }
+
+        /*some of these may be null after a failed allocation; they are passed to lodepng_free regardless*/
+        lodepng_free(lists.memory);
+        lodepng_free(lists.freelist);
+        lodepng_free(lists.chains0);
+        lodepng_free(lists.chains1);
+    }
+
+    lodepng_free(leaves);
+    return error;
+}
+
+/*
+Create the Huffman tree given the symbol frequencies.
+frequencies: array of numcodes symbol frequencies.
+mincodes: trailing zero-frequency symbols are trimmed, but never below this count.
+numcodes: number of entries in frequencies.
+maxbitlen: maximum code length the tree may use.
+Returns 0 on success, 80 when no symbols remain, 83 on allocation failure, or
+the error reported by lodepng_huffman_code_lengths / HuffmanTree_makeFromLengths2.
+On allocation failure the tree is left unmodified, so tree->lengths still
+points at its previous buffer instead of that buffer being leaked.
+*/
+static unsigned HuffmanTree_makeFromFrequencies(HuffmanTree* tree, const unsigned* frequencies,
+    size_t mincodes, size_t numcodes, unsigned maxbitlen)
+{
+    unsigned error = 0;
+    unsigned* resized;
+
+    /*trim zeroes; the bound is tested first so frequencies[numcodes - 1] is never read when numcodes is 0*/
+    while (numcodes > mincodes && !frequencies[numcodes - 1]) --numcodes;
+    if (numcodes == 0) return 80; /*error: a tree of 0 symbols is not supposed to be made*/
+
+    /*realloc into a temporary: assigning straight to tree->lengths would lose the old buffer on failure*/
+    resized = (unsigned*)lodepng_realloc(tree->lengths, numcodes * sizeof(unsigned));
+    if (!resized) return 83; /*alloc fail*/
+    tree->lengths = resized;
+    tree->maxbitlen = maxbitlen;
+    tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+    /*initialize all lengths to 0*/
+    memset(tree->lengths, 0, numcodes * sizeof(unsigned));
+
+    error = lodepng_huffman_code_lengths(tree->lengths, frequencies, numcodes, maxbitlen);
+    if (!error) error = HuffmanTree_makeFromLengths2(tree);
+    return error;
+}
+
+/*Returns the code value stored in tree->tree1d for the symbol at index.*/
+static unsigned HuffmanTree_getCode(const HuffmanTree* tree, unsigned index)
+{
+    const unsigned* codes = tree->tree1d;
+    return codes[index];
+}
+
+/*Returns the code length stored in tree->lengths for the symbol at index.*/
+static unsigned HuffmanTree_getLength(const HuffmanTree* tree, unsigned index)
+{
+    const unsigned* bitcounts = tree->lengths;
+    return bitcounts[index];
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*
+Builds the literal/length tree of a deflate block that uses the fixed tree, as
+per the deflate specification.
+Returns 0 on success, 83 on allocation failure, otherwise the error from
+HuffmanTree_makeFromLengths.
+*/
+static unsigned generateFixedLitLenTree(HuffmanTree* tree)
+{
+    unsigned status;
+    unsigned sym = 0;
+    unsigned* lens = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+    if (lens == 0) return 83; /*alloc fail*/
+
+    /*288 possible codes: 0-255=literals, 256=endcode, 257-285=lengthcodes, 286-287=unused*/
+    while (sym <= 143) lens[sym++] = 8;
+    while (sym <= 255) lens[sym++] = 9;
+    while (sym <= 279) lens[sym++] = 7;
+    while (sym <= 287) lens[sym++] = 8;
+
+    status = HuffmanTree_makeFromLengths(tree, lens, NUM_DEFLATE_CODE_SYMBOLS, 15);
+
+    lodepng_free(lens);
+    return status;
+}
+
+/*
+Builds the distance tree of a deflate block that uses the fixed tree, as
+specified in the deflate specification: every distance symbol gets 5 bits.
+Returns 0 on success, 83 on allocation failure, otherwise the error from
+HuffmanTree_makeFromLengths.
+*/
+static unsigned generateFixedDistanceTree(HuffmanTree* tree)
+{
+    unsigned status;
+    unsigned sym = 0;
+    unsigned* lens = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+    if (lens == 0) return 83; /*alloc fail*/
+
+    /*there are 32 distance codes, but 30-31 are unused*/
+    while (sym != NUM_DISTANCE_SYMBOLS)
+    {
+        lens[sym] = 5;
+        ++sym;
+    }
+    status = HuffmanTree_makeFromLengths(tree, lens, NUM_DISTANCE_SYMBOLS, 15);
+
+    lodepng_free(lens);
+    return status;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Decodes one symbol by walking codetree->tree2d bit by bit, advancing *bp for
+every bit consumed.
+Returns the symbol, or (unsigned)(-1) when the input runs out before a symbol
+is complete or when the walk leaves the tree.
+inbitlength is the length of the complete buffer, in bits (so its byte length times 8)
+*/
+static unsigned huffmanDecodeSymbol(const unsigned char* in, size_t* bp,
+    const HuffmanTree* codetree, size_t inbitlength)
+{
+    unsigned node = 0;
+
+    /*stop as soon as the bit pointer reaches the end of the input*/
+    while (*bp < inbitlength)
+    {
+        /*the bit read is done directly with READBIT because this is the biggest
+        bottleneck while decoding*/
+        unsigned entry = codetree->tree2d[(node << 1) + READBIT(*bp, in)];
+        ++(*bp);
+
+        /*entries below numcodes are finished symbols*/
+        if (entry < codetree->numcodes) return entry;
+
+        /*anything else is the address of the next node, stored with numcodes added*/
+        node = entry - codetree->numcodes;
+        if (node >= codetree->numcodes) return (unsigned)(-1); /*error: it appeared outside the codetree*/
+    }
+
+    return (unsigned)(-1); /*error: end of input memory reached without endcode*/
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Inflator (Decompressor)                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+get the tree of a deflated block with fixed tree, as specified in the deflate specification.
+tree_ll receives the literal/length tree, tree_d the distance tree.
+Returns 0 on success, otherwise the first error reported by
+generateFixedLitLenTree or generateFixedDistanceTree (for example 83 when out
+of memory). The distance tree is not attempted once the literal/length tree
+has failed. Callers that ignore the return value keep compiling unchanged.
+*/
+static unsigned getTreeInflateFixed(HuffmanTree* tree_ll, HuffmanTree* tree_d)
+{
+    unsigned error = generateFixedLitLenTree(tree_ll);
+    if (!error) error = generateFixedDistanceTree(tree_d);
+    return error;
+}
+
+/*
+get the tree of a deflated block with dynamic tree, the tree itself is also Huffman compressed with a known tree.
+tree_ll, tree_d: receive the literal/length tree and the distance tree.
+in, inlength: the compressed input and its size in bytes.
+bp: bit pointer into in; advanced past everything that is read here.
+Returns 0 on success, otherwise an error code: 49/50 when the header or an
+extra-bits field would run past the input, 83 on allocation failure, 54 for a
+"repeat previous" with nothing before it, 13/14/15 when a repeat runs past
+HLIT + HDIST, 10/11/16 when decoding a code length symbol fails, 64 when the
+end code has length 0, or an error from HuffmanTree_makeFromLengths.
+*/
+static unsigned getTreeInflateDynamic(HuffmanTree* tree_ll, HuffmanTree* tree_d,
+    const unsigned char* in, size_t* bp, size_t inlength)
+{
+    /*make sure that length values that aren't filled in will be 0, or a wrong tree will be generated*/
+    unsigned error = 0;
+    unsigned n, HLIT, HDIST, HCLEN, i;
+    size_t inbitlength = inlength * 8;
+
+    /*see comments in deflateDynamic for explanation of the context and these variables, it is analogous*/
+    unsigned* bitlen_ll = 0; /*lit,len code lengths*/
+    unsigned* bitlen_d = 0; /*dist code lengths*/
+    /*code length code lengths ("clcl"), the bit lengths of the huffman tree used to compress bitlen_ll and bitlen_d*/
+    unsigned* bitlen_cl = 0;
+    HuffmanTree tree_cl; /*the code tree for code length codes (the huffman tree for compressed huffman trees)*/
+
+    /*the three header fields below take 5 + 5 + 4 = 14 bits*/
+    if ((*bp) + 14 > (inlength << 3)) return 49; /*error: the bit pointer is or will go past the memory*/
+
+    /*number of literal/length codes + 257. Unlike the spec, the value 257 is added to it here already*/
+    HLIT = readBitsFromStream(bp, in, 5) + 257;
+    /*number of distance codes. Unlike the spec, the value 1 is added to it here already*/
+    HDIST = readBitsFromStream(bp, in, 5) + 1;
+    /*number of code length codes. Unlike the spec, the value 4 is added to it here already*/
+    HCLEN = readBitsFromStream(bp, in, 4) + 4;
+
+    if ((*bp) + HCLEN * 3 > (inlength << 3)) return 50; /*error: the bit pointer is or will go past the memory*/
+
+    HuffmanTree_init(&tree_cl);
+
+    /*this loop runs at most once: it only exists so that "break" can jump to the shared cleanup below*/
+    while (!error)
+    {
+        /*read the code length codes out of 3 * (amount of code length codes) bits*/
+
+        bitlen_cl = (unsigned*)lodepng_malloc(NUM_CODE_LENGTH_CODES * sizeof(unsigned));
+        if (!bitlen_cl) ERROR_BREAK(83 /*alloc fail*/);
+
+        /*the lengths arrive in the order given by CLCL_ORDER, not in symbol order*/
+        for (i = 0; i != NUM_CODE_LENGTH_CODES; ++i)
+        {
+            if (i < HCLEN) bitlen_cl[CLCL_ORDER[i]] = readBitsFromStream(bp, in, 3);
+            else bitlen_cl[CLCL_ORDER[i]] = 0; /*if not, it must stay 0*/
+        }
+
+        error = HuffmanTree_makeFromLengths(&tree_cl, bitlen_cl, NUM_CODE_LENGTH_CODES, 7);
+        if (error) break;
+
+        /*now we can use this tree to read the lengths for the tree that this function will return*/
+        bitlen_ll = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+        bitlen_d = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+        if (!bitlen_ll || !bitlen_d) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != NUM_DEFLATE_CODE_SYMBOLS; ++i) bitlen_ll[i] = 0;
+        for (i = 0; i != NUM_DISTANCE_SYMBOLS; ++i) bitlen_d[i] = 0;
+
+        /*i is the current symbol we're reading in the part that contains the code lengths of lit/len and dist codes*/
+        i = 0;
+        while (i < HLIT + HDIST)
+        {
+            unsigned code = huffmanDecodeSymbol(in, bp, &tree_cl, inbitlength);
+            if (code <= 15) /*a length code*/
+            {
+                /*the first HLIT entries belong to the lit/len tree, the rest to the distance tree*/
+                if (i < HLIT) bitlen_ll[i] = code;
+                else bitlen_d[i - HLIT] = code;
+                ++i;
+            }
+            else if (code == 16) /*repeat previous*/
+            {
+                unsigned replength = 3; /*read in the 2 bits that indicate repeat length (3-6)*/
+                unsigned value; /*set value to the previous code*/
+
+                if (i == 0) ERROR_BREAK(54); /*can't repeat previous if i is 0*/
+
+                if ((*bp + 2) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 2);
+
+                /*the previous entry may be the last lit/len length even when i is the first distance entry*/
+                if (i < HLIT + 1) value = bitlen_ll[i - 1];
+                else value = bitlen_d[i - HLIT - 1];
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(13); /*error: i is larger than the amount of codes*/
+                    if (i < HLIT) bitlen_ll[i] = value;
+                    else bitlen_d[i - HLIT] = value;
+                    ++i;
+                }
+            }
+            else if (code == 17) /*repeat "0" 3-10 times*/
+            {
+                unsigned replength = 3; /*read in the bits that indicate repeat length*/
+                if ((*bp + 3) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 3);
+
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(14); /*error: i is larger than the amount of codes*/
+
+                    if (i < HLIT) bitlen_ll[i] = 0;
+                    else bitlen_d[i - HLIT] = 0;
+                    ++i;
+                }
+            }
+            else if (code == 18) /*repeat "0" 11-138 times*/
+            {
+                unsigned replength = 11; /*read in the bits that indicate repeat length*/
+                if ((*bp + 7) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 7);
+
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(15); /*error: i is larger than the amount of codes*/
+
+                    if (i < HLIT) bitlen_ll[i] = 0;
+                    else bitlen_d[i - HLIT] = 0;
+                    ++i;
+                }
+            }
+            else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+            {
+                if (code == (unsigned)(-1))
+                {
+                    /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+                    (10=no endcode, 11=wrong jump outside of tree)*/
+                    /*NOTE(review): huffmanDecodeSymbol gives up once *bp equals inbitlength, so it is
+                    unclear whether the '>' comparison below can ever select 10 - confirm*/
+                    error = (*bp) > inbitlength ? 10 : 11;
+                }
+                else error = 16; /*unexisting code, this can never happen*/
+                break;
+            }
+        }
+        if (error) break;
+
+        if (bitlen_ll[256] == 0) ERROR_BREAK(64); /*the length of the end code 256 must be larger than 0*/
+
+        /*now we've finally got HLIT and HDIST, so generate the code trees, and the function is done*/
+        error = HuffmanTree_makeFromLengths(tree_ll, bitlen_ll, NUM_DEFLATE_CODE_SYMBOLS, 15);
+        if (error) break;
+        error = HuffmanTree_makeFromLengths(tree_d, bitlen_d, NUM_DISTANCE_SYMBOLS, 15);
+
+        break; /*end of error-while*/
+    }
+
+    /*shared cleanup; any of the three arrays may still be null if an earlier step failed*/
+    lodepng_free(bitlen_cl);
+    lodepng_free(bitlen_ll);
+    lodepng_free(bitlen_d);
+    HuffmanTree_cleanup(&tree_cl);
+
+    return error;
+}
+
+/*
+inflate a block with dynamic or fixed Huffman tree.
+out: output vector; decoded bytes are written starting at *pos.
+in, inlength: the compressed input and its size in bytes.
+bp: bit pointer into in, advanced as bits are consumed.
+pos: byte position in out, advanced as bytes are produced.
+btype: 1 for the fixed tree, 2 for a dynamic tree read from the stream.
+Returns 0 on success, otherwise an error code: 83 on allocation failure, 51
+when extra bits would run past the input, 52 for a distance reaching before
+the start of the output, 18 for an invalid distance code, 10/11 when symbol
+decoding fails, or an error from getTreeInflateDynamic.
+*/
+static unsigned inflateHuffmanBlock(ucvector* out, const unsigned char* in, size_t* bp,
+    size_t* pos, size_t inlength, unsigned btype)
+{
+    unsigned error = 0;
+    HuffmanTree tree_ll; /*the huffman tree for literal and length codes*/
+    HuffmanTree tree_d; /*the huffman tree for distance codes*/
+    size_t inbitlength = inlength * 8;
+
+    HuffmanTree_init(&tree_ll);
+    HuffmanTree_init(&tree_d);
+
+    if (btype == 1)
+    {
+        getTreeInflateFixed(&tree_ll, &tree_d);
+        /*huffmanDecodeSymbol indexes tree2d without checking it, so a fixed tree that
+        could not be built (out of memory) has to be caught here.
+        NOTE(review): this relies on HuffmanTree_init leaving tree2d null - confirm*/
+        if (!tree_ll.tree2d || !tree_d.tree2d) error = 83; /*alloc fail*/
+    }
+    else if (btype == 2) error = getTreeInflateDynamic(&tree_ll, &tree_d, in, bp, inlength);
+
+    while (!error) /*decode all symbols until end reached, breaks at end code*/
+    {
+        /*code_ll is literal, length or end code*/
+        unsigned code_ll = huffmanDecodeSymbol(in, bp, &tree_ll, inbitlength);
+        if (code_ll <= 255) /*literal symbol*/
+        {
+            /*ucvector_push_back would do the same, but for some reason the two lines below run 10% faster*/
+            if (!ucvector_resize(out, (*pos) + 1)) ERROR_BREAK(83 /*alloc fail*/);
+            out->data[*pos] = (unsigned char)code_ll;
+            ++(*pos);
+        }
+        else if (code_ll >= FIRST_LENGTH_CODE_INDEX && code_ll <= LAST_LENGTH_CODE_INDEX) /*length code*/
+        {
+            unsigned code_d, distance;
+            unsigned numextrabits_l, numextrabits_d; /*extra bits for length and distance*/
+            size_t start, forward, backward, length;
+
+            /*part 1: get length base*/
+            length = LENGTHBASE[code_ll - FIRST_LENGTH_CODE_INDEX];
+
+            /*part 2: get extra bits and add the value of that to length*/
+            numextrabits_l = LENGTHEXTRA[code_ll - FIRST_LENGTH_CODE_INDEX];
+            if ((*bp + numextrabits_l) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+            length += readBitsFromStream(bp, in, numextrabits_l);
+
+            /*part 3: get distance code*/
+            code_d = huffmanDecodeSymbol(in, bp, &tree_d, inbitlength);
+            if (code_d > 29)
+            {
+                if (code_d == (unsigned)(-1)) /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+                {
+                    /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+                    (10=no endcode, 11=wrong jump outside of tree)*/
+                    error = (*bp) > inlength * 8 ? 10 : 11;
+                }
+                else error = 18; /*error: invalid distance code (30-31 are never used)*/
+                break;
+            }
+            distance = DISTANCEBASE[code_d];
+
+            /*part 4: get extra bits from distance*/
+            numextrabits_d = DISTANCEEXTRA[code_d];
+            if ((*bp + numextrabits_d) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+            distance += readBitsFromStream(bp, in, numextrabits_d);
+
+            /*part 5: fill in all the out[n] values based on the length and dist*/
+            start = (*pos);
+            if (distance > start) ERROR_BREAK(52); /*too long backward distance*/
+            backward = start - distance;
+
+            if (!ucvector_resize(out, (*pos) + length)) ERROR_BREAK(83 /*alloc fail*/);
+            if (distance < length) {
+                /*source and destination overlap: copy byte by byte so that bytes written
+                earlier in this loop are read again later in it*/
+                for (forward = 0; forward < length; ++forward)
+                {
+                    out->data[(*pos)++] = out->data[backward++];
+                }
+            }
+            else {
+                /*no overlap, so a plain memcpy is enough*/
+                memcpy(out->data + *pos, out->data + backward, length);
+                *pos += length;
+            }
+        }
+        else if (code_ll == 256)
+        {
+            break; /*end code, break the loop*/
+        }
+        else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+        {
+            /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+            (10=no endcode, 11=wrong jump outside of tree)*/
+            error = ((*bp) > inlength * 8) ? 10 : 11;
+            break;
+        }
+    }
+
+    HuffmanTree_cleanup(&tree_ll);
+    HuffmanTree_cleanup(&tree_d);
+
+    return error;
+}
+
+/*
+Copies one stored (uncompressed) deflate block to the output.
+out: output vector; the block's bytes are written starting at *pos.
+in, inlength: the compressed input and its size in bytes.
+bp: bit pointer into in; rounded up to a byte boundary first and left just
+past the block's data on success.
+pos: byte position in out, advanced by the number of bytes copied.
+Returns 0 on success, 52 when the 4 header bytes do not fit in the input, 21
+when NLEN is not the one's complement of LEN, 23 when the block's data would
+run past the input, and 83 on allocation failure.
+*/
+static unsigned inflateNoCompression(ucvector* out, const unsigned char* in, size_t* bp, size_t* pos, size_t inlength)
+{
+    size_t p;
+    unsigned LEN, NLEN, n, error = 0;
+
+    /*go to first boundary of byte*/
+    while (((*bp) & 0x7) != 0) ++(*bp);
+    p = (*bp) / 8; /*byte position*/
+
+    /*read LEN (2 bytes) and NLEN (2 bytes). The four bytes p .. p + 3 must exist, so
+    p + 4 == inlength is still fine: that is an empty stored block ending exactly at
+    the end of the input*/
+    if (p + 4 > inlength) return 52; /*error, bit pointer will jump past memory*/
+    LEN = in[p] + 256u * in[p + 1]; p += 2;
+    NLEN = in[p] + 256u * in[p + 1]; p += 2;
+
+    /*check if 16-bit NLEN is really the one's complement of LEN*/
+    if (LEN + NLEN != 65535) return 21; /*error: NLEN is not one's complement of LEN*/
+
+    /*validate the input range before growing the output, so a truncated block does
+    not leave the output enlarged*/
+    if (p + LEN > inlength) return 23; /*error: reading outside of in buffer*/
+
+    if (!ucvector_resize(out, (*pos) + LEN)) return 83; /*alloc fail*/
+
+    /*read the literal data: LEN bytes are now stored in the out buffer*/
+    for (n = 0; n < LEN; ++n) out->data[(*pos)++] = in[p++];
+
+    (*bp) = p * 8;
+
+    return error;
+}
+
+/*
+Inflates a complete deflate stream into out, one block at a time, until a block
+with its final-block bit set has been processed.
+settings is accepted for interface symmetry and not used here.
+Returns 0 on success, 52 when a block header would run past the input, 20 for
+the invalid block type 3, otherwise the error of the block decoder.
+*/
+static unsigned lodepng_inflatev(ucvector* out,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    /*bit position in "in": the current byte is bitpos >> 3, the current bit is
+    bitpos & 0x7 (from lsb to msb of the byte)*/
+    size_t bitpos = 0;
+    size_t outpos = 0; /*byte position in the out buffer*/
+    unsigned is_final;
+
+    (void)settings;
+
+    do
+    {
+        unsigned block_type;
+        unsigned status;
+
+        /*a block header is 3 bits: 1 final-block bit followed by a 2-bit type*/
+        if (bitpos + 2 >= insize * 8) return 52; /*error, bit pointer will jump past memory*/
+        is_final = readBitFromStream(&bitpos, in);
+        block_type = readBitFromStream(&bitpos, in);
+        block_type += 2u * readBitFromStream(&bitpos, in);
+
+        switch (block_type)
+        {
+        case 3:
+            return 20; /*error: invalid BTYPE*/
+        case 0:
+            status = inflateNoCompression(out, in, &bitpos, &outpos, insize); /*no compression*/
+            break;
+        default:
+            status = inflateHuffmanBlock(out, in, &bitpos, &outpos, insize, block_type); /*BTYPE 01 or 10*/
+            break;
+        }
+
+        if (status) return status;
+    } while (!is_final);
+
+    return 0;
+}
+
+/*
+Inflates in[0..insize) with the built-in decoder.
+*out and *outsize describe the buffer handed to the decoder on entry and are
+overwritten with the resulting buffer and size, on failure as well.
+Returns the error code of lodepng_inflatev (0 on success).
+*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    ucvector buffer;
+    unsigned status;
+
+    ucvector_init_buffer(&buffer, *out, *outsize);
+    status = lodepng_inflatev(&buffer, in, insize, settings);
+
+    /*hand the (possibly reallocated) storage back to the caller*/
+    *out = buffer.data;
+    *outsize = buffer.size;
+    return status;
+}
+
+/*
+Dispatches to settings->custom_inflate when one is set, and to the built-in
+lodepng_inflate otherwise. Both receive the same arguments.
+*/
+static unsigned inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    if (!settings->custom_inflate)
+    {
+        return lodepng_inflate(out, outsize, in, insize, settings);
+    }
+    return settings->custom_inflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflator (Compressor)                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static const size_t MAX_SUPPORTED_DEFLATE_LENGTH = 258;
+
+/*
+Appends one Huffman code to the compressed stream by forwarding to
+addBitsToStreamReversed.
+bp: bit pointer into the stream, passed through to addBitsToStreamReversed.
+code: the code value; bitlen is the size in bits of the code
+*/
+static void addHuffmanSymbol(size_t* bp, ucvector* compressed, unsigned code, unsigned bitlen)
+{
+    addBitsToStreamReversed(bp, compressed, code, bitlen);
+}
+
+/*
+Binary search in a sorted array for the last index whose element is smaller
+than or equal to value. When value is below every element from index 1 on,
+index 0 is returned.
+array_size must be at least 1.
+*/
+static size_t searchCodeIndex(const unsigned* array, size_t array_size, size_t value)
+{
+    /*binary search (only small gain over linear). TODO: use CPU log2 instruction for getting symbols instead*/
+    size_t lo = 1;
+    size_t hi = array_size - 1;
+    size_t probe;
+
+    /*narrow [lo, hi] until lo is the first index >= 1 holding an element >= value*/
+    for (; lo <= hi; )
+    {
+        probe = (lo + hi) / 2;
+        if (array[probe] < value) lo = probe + 1;
+        else hi = probe - 1;
+    }
+
+    /*an exact hit stays where it is, anything else belongs to the slot before it*/
+    if (lo < array_size && array[lo] <= value) return lo;
+    return lo - 1;
+}
+
+/*
+Appends one length/distance pair to values as four consecutive entries: the
+length code, the extra length bits, the distance code and the extra distance
+bits.
+*/
+static void addLengthDistance(uivector* values, size_t length, size_t distance)
+{
+    /*values in encoded vector are those used by deflate:
+    0-255: literal bytes
+    256: end
+    257-285: length/distance pair (length code, followed by extra length bits, distance code, extra distance bits)
+    286-287: invalid*/
+
+    /*slot of the largest base value that does not exceed the requested amount*/
+    unsigned len_slot = (unsigned)searchCodeIndex(LENGTHBASE, 29, length);
+    unsigned dist_slot = (unsigned)searchCodeIndex(DISTANCEBASE, 30, distance);
+
+    /*whatever remains above the base is carried in the extra bits*/
+    unsigned len_rest = (unsigned)(length - LENGTHBASE[len_slot]);
+    unsigned dist_rest = (unsigned)(distance - DISTANCEBASE[dist_slot]);
+
+    uivector_push_back(values, len_slot + FIRST_LENGTH_CODE_INDEX);
+    uivector_push_back(values, len_rest);
+    uivector_push_back(values, dist_slot);
+    uivector_push_back(values, dist_rest);
+}
+
+/*3 bytes of data get encoded into two bytes. The hash cannot use more than 3
+bytes as input because 3 is the minimum match length for deflate*/
+static const unsigned HASH_NUM_VALUES = 65536;
+static const unsigned HASH_BIT_MASK = 65535; /*HASH_NUM_VALUES - 1, but C90 does not like that as initializer*/
+
+/*
+Hash chain tables used by the encoder. hash_init allocates head with
+HASH_NUM_VALUES entries, headz with MAX_SUPPORTED_DEFLATE_LENGTH + 1 entries
+and every other table with windowsize entries.
+*/
+typedef struct Hash
+{
+    int* head; /*hash value to head circular pos - can be outdated if went around window*/
+    /*circular pos to prev circular pos*/
+    unsigned short* chain;
+    int* val; /*circular pos to hash value*/
+
+    /*TODO: do this not only for zeros but for any repeated byte. However for PNG
+    it's always going to be the zeros that dominate, so not important for PNG*/
+    int* headz; /*similar to head, but for chainz*/
+    unsigned short* chainz; /*those with same amount of zeros*/
+    unsigned short* zeros; /*length of zeros streak, used as a second hash chain*/
+} Hash;
+
+/*
+Allocates and initializes all lookup tables of the hash for a sliding window of
+windowsize positions.
+Returns 0 on success or 83 when an allocation failed. On failure every table that
+did get allocated is released again and all table pointers are reset to null, so
+nothing leaks when the caller bails out without calling hash_cleanup.
+*/
+static unsigned hash_init(Hash* hash, unsigned windowsize)
+{
+    unsigned i;
+    hash->head = (int*)lodepng_malloc(sizeof(int) * HASH_NUM_VALUES);
+    hash->val = (int*)lodepng_malloc(sizeof(int) * windowsize);
+    hash->chain = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+    hash->zeros = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+    hash->headz = (int*)lodepng_malloc(sizeof(int) * (MAX_SUPPORTED_DEFLATE_LENGTH + 1));
+    hash->chainz = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+    if (!hash->head || !hash->chain || !hash->val || !hash->headz || !hash->chainz || !hash->zeros)
+    {
+        /*some allocations may have succeeded: give those back before reporting the failure*/
+        if (hash->head) lodepng_free(hash->head);
+        if (hash->val) lodepng_free(hash->val);
+        if (hash->chain) lodepng_free(hash->chain);
+        if (hash->zeros) lodepng_free(hash->zeros);
+        if (hash->headz) lodepng_free(hash->headz);
+        if (hash->chainz) lodepng_free(hash->chainz);
+        hash->head = 0;
+        hash->val = 0;
+        hash->chain = 0;
+        hash->zeros = 0;
+        hash->headz = 0;
+        hash->chainz = 0;
+        return 83; /*alloc fail*/
+    }
+
+    /*initialize hash table*/
+    for (i = 0; i != HASH_NUM_VALUES; ++i) hash->head[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->val[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->chain[i] = (unsigned short)i; /*same value as index indicates uninitialized*/
+
+    for (i = 0; i <= MAX_SUPPORTED_DEFLATE_LENGTH; ++i) hash->headz[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->chainz[i] = (unsigned short)i; /*same value as index indicates uninitialized*/
+
+    return 0;
+}
+
+/*Releases the six lookup tables owned by the hash. The Hash struct itself is not freed.*/
+static void hash_cleanup(Hash* hash)
+{
+    /*tables of the zero-run chain*/
+    lodepng_free(hash->chainz);
+    lodepng_free(hash->headz);
+    lodepng_free(hash->zeros);
+
+    /*tables of the regular hash chain*/
+    lodepng_free(hash->chain);
+    lodepng_free(hash->val);
+    lodepng_free(hash->head);
+}
+
+
+
+/*
+Returns the hash value, already reduced with HASH_BIT_MASK, of the bytes starting at
+data[pos]. Three bytes are used when available; near the end of the buffer only the
+remaining bytes are used, and 0 is returned when pos is at or beyond size.
+*/
+static unsigned getHash(const unsigned char* data, size_t size, size_t pos)
+{
+    unsigned h = 0;
+
+    if (pos + 2 >= size)
+    {
+        /*fewer than three bytes left: hash whatever remains*/
+        size_t k, remaining;
+        if (pos >= size) return 0;
+        remaining = size - pos;
+        for (k = 0; k < remaining; ++k) h ^= (unsigned)(data[pos + k] << (k * 8u));
+        return h & HASH_BIT_MASK;
+    }
+
+    /*A simple shift and xor hash is used. Since the data of PNGs is dominated
+    by zeroes due to the filters, a better hash does not have a significant
+    effect on speed in traversing the chain, and causes more time spent on
+    calculating the hash.*/
+    h = (unsigned)(data[pos + 0] << 0u)
+      ^ (unsigned)(data[pos + 1] << 4u)
+      ^ (unsigned)(data[pos + 2] << 8u);
+    return h & HASH_BIT_MASK;
+}
+
+/*
+Counts the zero bytes that follow each other starting at data[pos]. The scan stops at
+the first non-zero byte, at the end of the buffer, or after
+MAX_SUPPORTED_DEFLATE_LENGTH bytes, whichever comes first.
+*/
+static unsigned countZeros(const unsigned char* data, size_t size, size_t pos)
+{
+    const unsigned char* const first = data + pos;
+    const unsigned char* const bufend = data + size;
+    const unsigned char* stop = first + MAX_SUPPORTED_DEFLATE_LENGTH;
+    const unsigned char* cur;
+
+    if (stop > bufend) stop = bufend;
+    for (cur = first; cur != stop; ++cur)
+    {
+        if (*cur != 0) break;
+    }
+    /*the pointer difference fits in 32 bits (max value is MAX_SUPPORTED_DEFLATE_LENGTH)*/
+    return (unsigned)(cur - first);
+}
+
+/*
+Records the circular position wpos (wpos = pos & (windowsize - 1)) in both chains of
+the hash: the one keyed by hashval and the one keyed by the zero streak length
+numzeros. The previous head of each chain, if any, becomes the predecessor of wpos.
+*/
+static void updateHashChain(Hash* hash, size_t wpos, unsigned hashval, unsigned short numzeros)
+{
+    const int prev_head = hash->head[hashval];
+    const int prev_headz = hash->headz[numzeros];
+
+    /*chain of positions with the same streak of zeros*/
+    hash->zeros[wpos] = numzeros;
+    if (prev_headz != -1) hash->chainz[wpos] = (unsigned short)prev_headz;
+    hash->headz[numzeros] = (int)wpos;
+
+    /*chain of positions with the same hash value*/
+    hash->val[wpos] = (int)hashval;
+    if (prev_head != -1) hash->chain[wpos] = (unsigned short)prev_head;
+    hash->head[hashval] = (int)wpos;
+}
+
+/*
+LZ77-encode the data. Return value is error code. The input are raw bytes, the output
+is in the form of unsigned integers with codes representing for example literal bytes, or
+length/distance pairs.
+It uses a hash table technique to let it encode faster. When doing LZ77 encoding, a
+sliding window (of windowsize) is used, and all past bytes in that window can be used as
+the "dictionary". A brute force search through all possible distances would be slow, and
+this hash technique is one out of several ways to speed this up.
+
+out: receives the literals and, per match, the four values written by addLengthDistance.
+hash: lookup tables, updated for every position that is processed.
+in, inpos, insize: the positions inpos up to (not including) insize of in are encoded.
+windowsize: must be a power of two in the range 1..32768.
+minmatch: matches shorter than this are written as a literal instead.
+nicematch: the search for a position stops once a match of at least this length is
+found; values above MAX_SUPPORTED_DEFLATE_LENGTH are clamped.
+lazymatching: if non-zero, a match is held back for one byte to see whether the next
+position gives a longer one.
+
+Returns 0 on success, 60 or 90 for an invalid windowsize, 81 if a held back match is
+resolved at position 0, 83 on allocation failure, 86 if a match offset exceeds windowsize.
+*/
+static unsigned encodeLZ77(uivector* out, Hash* hash,
+    const unsigned char* in, size_t inpos, size_t insize, unsigned windowsize,
+    unsigned minmatch, unsigned nicematch, unsigned lazymatching)
+{
+    size_t pos;
+    unsigned i, error = 0;
+    /*for large window lengths, assume the user wants no compression loss. Otherwise, max hash chain length speedup.*/
+    unsigned maxchainlength = windowsize >= 8192 ? windowsize : windowsize / 8;
+    unsigned maxlazymatch = windowsize >= 8192 ? MAX_SUPPORTED_DEFLATE_LENGTH : 64;
+
+    unsigned usezeros = 1; /*not sure if setting it to false for windowsize < 8192 is better or worse*/
+    unsigned numzeros = 0; /*length of the streak of zeros at the current position, 0 if not tracked*/
+
+    unsigned offset; /*the offset represents the distance in LZ77 terminology*/
+    unsigned length;
+    unsigned lazy = 0; /*1 while a match of the previous position is being held back*/
+    unsigned lazylength = 0, lazyoffset = 0; /*the held back match*/
+    unsigned hashval;
+    unsigned current_offset, current_length;
+    unsigned prev_offset;
+    const unsigned char *lastptr, *foreptr, *backptr;
+    unsigned hashpos;
+
+    if (windowsize == 0 || windowsize > 32768) return 60; /*error: windowsize smaller/larger than allowed*/
+    if ((windowsize & (windowsize - 1)) != 0) return 90; /*error: must be power of two*/
+
+    if (nicematch > MAX_SUPPORTED_DEFLATE_LENGTH) nicematch = MAX_SUPPORTED_DEFLATE_LENGTH;
+
+    for (pos = inpos; pos < insize; ++pos)
+    {
+        size_t wpos = pos & (windowsize - 1); /*position in the 'circular' hash buffers*/
+        unsigned chainlength = 0;
+
+        hashval = getHash(in, insize, pos);
+
+        if (usezeros && hashval == 0)
+        {
+            if (numzeros == 0) numzeros = countZeros(in, insize, pos); /*start of a streak: count it*/
+            else if (pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros; /*streak carried over from the previous position shrinks by one unless it is continued by another zero*/
+        }
+        else
+        {
+            numzeros = 0;
+        }
+
+        updateHashChain(hash, wpos, hashval, numzeros);
+
+        /*the length and offset found for the current position*/
+        length = 0;
+        offset = 0;
+
+        hashpos = hash->chain[wpos]; /*first candidate: the chain entry of the current position*/
+
+        lastptr = &in[insize < pos + MAX_SUPPORTED_DEFLATE_LENGTH ? insize : pos + MAX_SUPPORTED_DEFLATE_LENGTH]; /*a match ends at the input end or at the maximum deflate length*/
+
+        /*search for the longest string*/
+        prev_offset = 0;
+        for (;;)
+        {
+            if (chainlength++ >= maxchainlength) break; /*bound the work spent per position*/
+            current_offset = hashpos <= wpos ? wpos - hashpos : wpos - hashpos + windowsize; /*distance from the candidate to the current position within the circular window*/
+
+            if (current_offset < prev_offset) break; /*stop when went completely around the circular buffer*/
+            prev_offset = current_offset;
+            if (current_offset > 0)
+            {
+                /*test the next characters*/
+                foreptr = &in[pos];
+                backptr = &in[pos - current_offset];
+
+                /*common case in PNGs is lots of zeros. Quickly skip over them as a speedup*/
+                if (numzeros >= 3)
+                {
+                    unsigned skip = hash->zeros[hashpos];
+                    if (skip > numzeros) skip = numzeros;
+                    backptr += skip;
+                    foreptr += skip;
+                }
+
+                while (foreptr != lastptr && *backptr == *foreptr) /*maximum supported length by deflate is max length*/
+                {
+                    ++backptr;
+                    ++foreptr;
+                }
+                current_length = (unsigned)(foreptr - &in[pos]);
+
+                if (current_length > length)
+                {
+                    length = current_length; /*the longest length*/
+                    offset = current_offset; /*the offset that is related to this longest length*/
+                                             /*jump out once a length of max length is found (speed gain). This also jumps
+                                             out if length is MAX_SUPPORTED_DEFLATE_LENGTH*/
+                    if (current_length >= nicematch) break;
+                }
+            }
+
+            if (hashpos == hash->chain[hashpos]) break; /*entry that refers to itself: uninitialized, so the chain ends here (see hash_init)*/
+
+            if (numzeros >= 3 && length > numzeros) /*already matched beyond the streak of zeros: follow the chain of equal streaks instead*/
+            {
+                hashpos = hash->chainz[hashpos];
+                if (hash->zeros[hashpos] != numzeros) break;
+            }
+            else
+            {
+                hashpos = hash->chain[hashpos];
+                /*outdated hash value, happens if particular value was not encountered in whole last window*/
+                if (hash->val[hashpos] != (int)hashval) break;
+            }
+        }
+
+        if (lazymatching)
+        {
+            if (!lazy && length >= 3 && length <= maxlazymatch && length < MAX_SUPPORTED_DEFLATE_LENGTH)
+            {
+                lazy = 1; /*hold this match back and look at the next position first*/
+                lazylength = length;
+                lazyoffset = offset;
+                continue; /*try the next byte*/
+            }
+            if (lazy)
+            {
+                lazy = 0;
+                if (pos == 0) ERROR_BREAK(81);
+                if (length > lazylength + 1) /*the match found here beats the one that was held back*/
+                {
+                    /*push the previous character as literal*/
+                    if (!uivector_push_back(out, in[pos - 1])) ERROR_BREAK(83 /*alloc fail*/);
+                }
+                else
+                {
+                    length = lazylength;
+                    offset = lazyoffset;
+                    hash->head[hashval] = -1; /*the same hashchain update will be done, this ensures no wrong alteration*/
+                    hash->headz[numzeros] = -1; /*idem*/
+                    --pos; /*go back to the position where the held back match starts*/
+                }
+            }
+        }
+        if (length >= 3 && offset > windowsize) ERROR_BREAK(86 /*too big (or overflown negative) offset*/);
+
+        /*encode it as length/distance pair or literal value*/
+        if (length < 3) /*only lengths of 3 or higher are supported as length/distance pair*/
+        {
+            if (!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        else if (length < minmatch || (length == 3 && offset > 4096))
+        {
+            /*compensate for the fact that longer offsets have more extra bits, a
+            length of only 3 may be not worth it then*/
+            if (!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        else
+        {
+            addLengthDistance(out, length, offset);
+            for (i = 1; i < length; ++i) /*advance over the matched bytes, registering each position in the hash*/
+            {
+                ++pos;
+                wpos = pos & (windowsize - 1);
+                hashval = getHash(in, insize, pos);
+                if (usezeros && hashval == 0)
+                {
+                    if (numzeros == 0) numzeros = countZeros(in, insize, pos);
+                    else if (pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+                }
+                else
+                {
+                    numzeros = 0;
+                }
+                updateHashChain(hash, wpos, hashval, numzeros);
+            }
+        }
+    } /*end of the loop through each character of input*/
+
+    return error;
+}
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*
+Writes data as a sequence of "stored" (BTYPE 0) deflate blocks of at most 65535 bytes
+each; the last block carries the BFINAL flag.
+Empty input still produces one (final, zero length) block, because a deflate stream
+without any block is not valid.
+Returns 0 on success or 83 when the output vector could not grow.
+*/
+static unsigned deflateNoCompression(ucvector* out, const unsigned char* data, size_t datasize)
+{
+    /*non compressed deflate block data: 1 bit BFINAL,2 bits BTYPE,(5 bits): it jumps to start of next byte,
+    2 bytes LEN, 2 bytes NLEN, LEN bytes literal DATA*/
+
+    size_t i, j, numdeflateblocks = (datasize + 65534) / 65535;
+    size_t datapos = 0; /*size_t, so inputs larger than what fits in unsigned are not truncated*/
+
+    if (numdeflateblocks == 0) numdeflateblocks = 1; /*same rule as lodepng_deflatev uses*/
+
+    for (i = 0; i != numdeflateblocks; ++i)
+    {
+        unsigned BFINAL, BTYPE, LEN, NLEN;
+        unsigned char firstbyte;
+
+        BFINAL = (i == numdeflateblocks - 1);
+        BTYPE = 0;
+
+        firstbyte = (unsigned char)(BFINAL + ((BTYPE & 1) << 1) + ((BTYPE & 2) << 1));
+        if (!ucvector_push_back(out, firstbyte)) return 83; /*alloc fail*/
+
+        LEN = 65535;
+        if (datasize - datapos < 65535) LEN = (unsigned)(datasize - datapos);
+        NLEN = 65535 - LEN; /*one's complement of LEN in 16 bits*/
+
+        if (!ucvector_push_back(out, (unsigned char)(LEN & 255))) return 83; /*alloc fail*/
+        if (!ucvector_push_back(out, (unsigned char)(LEN >> 8))) return 83; /*alloc fail*/
+        if (!ucvector_push_back(out, (unsigned char)(NLEN & 255))) return 83; /*alloc fail*/
+        if (!ucvector_push_back(out, (unsigned char)(NLEN >> 8))) return 83; /*alloc fail*/
+
+        /*Decompressed data*/
+        for (j = 0; j != LEN; ++j)
+        {
+            if (!ucvector_push_back(out, data[datapos++])) return 83; /*alloc fail*/
+        }
+    }
+
+    return 0;
+}
+
+/*
+Writes the LZ77 encoded values to the compressed stream using the given huffman trees.
+Values up to 256 are written as a single symbol of tree_ll. A larger value is a length
+code and is followed in lz77_encoded by three more values (length extra bits, distance
+code, distance extra bits), which are written right after it.
+bp, out: bit pointer and byte vector of the output stream.
+tree_ll: the tree for lit and len codes.
+tree_d: the tree for distance codes.
+*/
+static void writeLZ77data(size_t* bp, ucvector* out, const uivector* lz77_encoded,
+    const HuffmanTree* tree_ll, const HuffmanTree* tree_d)
+{
+    size_t idx = 0;
+    while (idx != lz77_encoded->size)
+    {
+        const unsigned symbol = lz77_encoded->data[idx++];
+        addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_ll, symbol), HuffmanTree_getLength(tree_ll, symbol));
+
+        if (symbol > 256) /*a length code: the rest of the pair follows*/
+        {
+            const unsigned len_extra_value = lz77_encoded->data[idx++];
+            const unsigned dist_code = lz77_encoded->data[idx++];
+            const unsigned dist_extra_value = lz77_encoded->data[idx++];
+            const unsigned len_extra_count = LENGTHEXTRA[symbol - FIRST_LENGTH_CODE_INDEX];
+            const unsigned dist_extra_count = DISTANCEEXTRA[dist_code];
+
+            addBitsToStream(bp, out, len_extra_value, len_extra_count);
+            addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_d, dist_code),
+                HuffmanTree_getLength(tree_d, dist_code));
+            addBitsToStream(bp, out, dist_extra_value, dist_extra_count);
+        }
+    }
+}
+
+/*Deflate for a block of type "dynamic", that is, with freely, optimally, created huffman trees.
+out, bp: output byte vector and its bit pointer; the block is appended there.
+hash: lookup tables handed on to encodeLZ77.
+data, datapos, dataend: the positions datapos up to (not including) dataend of data form this block.
+settings: use_lz77, windowsize, minmatch, nicematch and lazymatching are read from it.
+final: written as the BFINAL bit of the block.
+Returns 0 on success, 83 on allocation failure, 64 if the end code got no huffman code,
+or the error reported by encodeLZ77 or HuffmanTree_makeFromFrequencies.*/
+static unsigned deflateDynamic(ucvector* out, size_t* bp, Hash* hash,
+    const unsigned char* data, size_t datapos, size_t dataend,
+    const LodePNGCompressSettings* settings, unsigned final)
+{
+    unsigned error = 0;
+
+    /*
+    A block is compressed as follows: The PNG data is lz77 encoded, resulting in
+    literal bytes and length/distance pairs. This is then huffman compressed with
+    two huffman trees. One huffman tree is used for the lit and len values ("ll"),
+    another huffman tree is used for the dist values ("d"). These two trees are
+    stored using their code lengths, and to compress even more these code lengths
+    are also run-length encoded and huffman compressed. This gives a huffman tree
+    of code lengths "cl". The code lengths used to describe this third tree are
+    the code length code lengths ("clcl").
+    */
+
+    /*The lz77 encoded data, represented with integers since there will also be length and distance codes in it*/
+    uivector lz77_encoded;
+    HuffmanTree tree_ll; /*tree for lit,len values*/
+    HuffmanTree tree_d; /*tree for distance codes*/
+    HuffmanTree tree_cl; /*tree for encoding the code lengths representing tree_ll and tree_d*/
+    uivector frequencies_ll; /*frequency of lit,len codes*/
+    uivector frequencies_d; /*frequency of dist codes*/
+    uivector frequencies_cl; /*frequency of code length codes*/
+    uivector bitlen_lld; /*lit,len,dist code lengths (in bits), literally (without repeat codes).*/
+    uivector bitlen_lld_e; /*bitlen_lld encoded with repeat codes (this is a rudimentary run length compression)*/
+                           /*bitlen_cl is the code length code lengths ("clcl"). The bit lengths of codes to represent tree_cl
+                           (these are written as is in the file, it would be crazy to compress these using yet another huffman
+                           tree that needs to be represented by yet another set of code lengths)*/
+    uivector bitlen_cl;
+    size_t datasize = dataend - datapos;
+
+    /*
+    Due to the huffman compression of huffman tree representations ("two levels"), there are some analogies:
+    bitlen_lld is to tree_cl what data is to tree_ll and tree_d.
+    bitlen_lld_e is to bitlen_lld what lz77_encoded is to data.
+    bitlen_cl is to bitlen_lld_e what bitlen_lld is to lz77_encoded.
+    */
+
+    unsigned BFINAL = final;
+    size_t numcodes_ll, numcodes_d, i;
+    unsigned HLIT, HDIST, HCLEN;
+
+    uivector_init(&lz77_encoded);
+    HuffmanTree_init(&tree_ll);
+    HuffmanTree_init(&tree_d);
+    HuffmanTree_init(&tree_cl);
+    uivector_init(&frequencies_ll);
+    uivector_init(&frequencies_d);
+    uivector_init(&frequencies_cl);
+    uivector_init(&bitlen_lld);
+    uivector_init(&bitlen_lld_e);
+    uivector_init(&bitlen_cl);
+
+    /*This while loop never loops due to a break at the end, it is here to
+    allow breaking out of it to the cleanup phase on error conditions.*/
+    while (!error)
+    {
+        if (settings->use_lz77)
+        {
+            error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+                settings->minmatch, settings->nicematch, settings->lazymatching);
+            if (error) break;
+        }
+        else
+        {
+            if (!uivector_resize(&lz77_encoded, datasize)) ERROR_BREAK(83 /*alloc fail*/);
+            for (i = datapos; i < dataend; ++i) lz77_encoded.data[i - datapos] = data[i]; /*no LZ77, but still will be Huffman compressed*/
+        }
+
+        if (!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83 /*alloc fail*/);
+        if (!uivector_resizev(&frequencies_d, 30, 0)) ERROR_BREAK(83 /*alloc fail*/);
+
+        /*Count the frequencies of lit, len and dist codes*/
+        for (i = 0; i != lz77_encoded.size; ++i)
+        {
+            unsigned symbol = lz77_encoded.data[i];
+            ++frequencies_ll.data[symbol];
+            if (symbol > 256)
+            {
+                unsigned dist = lz77_encoded.data[i + 2]; /*a length code is followed by: length extra bits, distance code, distance extra bits*/
+                ++frequencies_d.data[dist];
+                i += 3; /*skip the three values that belong to this length code*/
+            }
+        }
+        frequencies_ll.data[256] = 1; /*there will be exactly 1 end code, at the end of the block*/
+
+                                      /*Make both huffman trees, one for the lit and len codes, one for the dist codes*/
+        error = HuffmanTree_makeFromFrequencies(&tree_ll, frequencies_ll.data, 257, frequencies_ll.size, 15);
+        if (error) break;
+        /*2, not 1, is chosen for mincodes: some buggy PNG decoders require at least 2 symbols in the dist tree*/
+        error = HuffmanTree_makeFromFrequencies(&tree_d, frequencies_d.data, 2, frequencies_d.size, 15);
+        if (error) break;
+
+        numcodes_ll = tree_ll.numcodes; if (numcodes_ll > 286) numcodes_ll = 286;
+        numcodes_d = tree_d.numcodes; if (numcodes_d > 30) numcodes_d = 30;
+        /*store the code lengths of both generated trees in bitlen_lld*/
+        for (i = 0; i != numcodes_ll; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_ll, (unsigned)i));
+        for (i = 0; i != numcodes_d; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_d, (unsigned)i));
+
+        /*run-length compress bitlen_lld into bitlen_lld_e by using repeat codes 16 (copy length 3-6 times),
+        17 (3-10 zeroes), 18 (11-138 zeroes)*/
+        for (i = 0; i != (unsigned)bitlen_lld.size; ++i)
+        {
+            unsigned j = 0; /*amount of repetitions*/
+            while (i + j + 1 < (unsigned)bitlen_lld.size && bitlen_lld.data[i + j + 1] == bitlen_lld.data[i]) ++j;
+
+            if (bitlen_lld.data[i] == 0 && j >= 2) /*repeat code for zeroes*/
+            {
+                ++j; /*include the first zero*/
+                if (j <= 10) /*repeat code 17 supports max 10 zeroes*/
+                {
+                    uivector_push_back(&bitlen_lld_e, 17);
+                    uivector_push_back(&bitlen_lld_e, j - 3);
+                }
+                else /*repeat code 18 supports max 138 zeroes*/
+                {
+                    if (j > 138) j = 138;
+                    uivector_push_back(&bitlen_lld_e, 18);
+                    uivector_push_back(&bitlen_lld_e, j - 11);
+                }
+                i += (j - 1);
+            }
+            else if (j >= 3) /*repeat code for value other than zero*/
+            {
+                size_t k;
+                unsigned num = j / 6, rest = j % 6; /*code 16 repeats at most 6 times: num full repeats plus a remainder*/
+                uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+                for (k = 0; k < num; ++k)
+                {
+                    uivector_push_back(&bitlen_lld_e, 16);
+                    uivector_push_back(&bitlen_lld_e, 6 - 3);
+                }
+                if (rest >= 3)
+                {
+                    uivector_push_back(&bitlen_lld_e, 16);
+                    uivector_push_back(&bitlen_lld_e, rest - 3);
+                }
+                else j -= rest; /*remainder too short for a repeat code: leave it to the next iterations*/
+                i += j;
+            }
+            else /*too short to benefit from repeat code*/
+            {
+                uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+            }
+        }
+
+        /*generate tree_cl, the huffmantree of huffmantrees*/
+
+        if (!uivector_resizev(&frequencies_cl, NUM_CODE_LENGTH_CODES, 0)) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != bitlen_lld_e.size; ++i)
+        {
+            ++frequencies_cl.data[bitlen_lld_e.data[i]];
+            /*after a repeat code come the bits that specify the number of repetitions,
+            those don't need to be in the frequencies_cl calculation*/
+            if (bitlen_lld_e.data[i] >= 16) ++i;
+        }
+
+        error = HuffmanTree_makeFromFrequencies(&tree_cl, frequencies_cl.data,
+            frequencies_cl.size, frequencies_cl.size, 7);
+        if (error) break;
+
+        if (!uivector_resize(&bitlen_cl, tree_cl.numcodes)) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != tree_cl.numcodes; ++i)
+        {
+            /*lengths of code length tree is in the order as specified by deflate*/
+            bitlen_cl.data[i] = HuffmanTree_getLength(&tree_cl, CLCL_ORDER[i]);
+        }
+        while (bitlen_cl.data[bitlen_cl.size - 1] == 0 && bitlen_cl.size > 4)
+        {
+            /*remove zeros at the end, but minimum size must be 4*/
+            if (!uivector_resize(&bitlen_cl, bitlen_cl.size - 1)) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        if (error) break; /*an ERROR_BREAK above only left the inner while loop*/
+
+        /*
+        Write everything into the output
+
+        After the BFINAL and BTYPE, the dynamic block consists out of the following:
+        - 5 bits HLIT, 5 bits HDIST, 4 bits HCLEN
+        - (HCLEN+4)*3 bits code lengths of code length alphabet
+        - HLIT + 257 code lengths of lit/length alphabet (encoded using the code length
+        alphabet, + possible repetition codes 16, 17, 18)
+        - HDIST + 1 code lengths of distance alphabet (encoded using the code length
+        alphabet, + possible repetition codes 16, 17, 18)
+        - compressed data
+        - 256 (end code)
+        */
+
+        /*Write block type*/
+        addBitToStream(bp, out, BFINAL);
+        addBitToStream(bp, out, 0); /*first bit of BTYPE "dynamic"*/
+        addBitToStream(bp, out, 1); /*second bit of BTYPE "dynamic"*/
+
+                                    /*write the HLIT, HDIST and HCLEN values*/
+        HLIT = (unsigned)(numcodes_ll - 257);
+        HDIST = (unsigned)(numcodes_d - 1);
+        HCLEN = (unsigned)bitlen_cl.size - 4;
+        /*trim zeroes for HCLEN. HLIT and HDIST were already trimmed at tree creation*/
+        while (!bitlen_cl.data[HCLEN + 4 - 1] && HCLEN > 0) --HCLEN;
+        addBitsToStream(bp, out, HLIT, 5);
+        addBitsToStream(bp, out, HDIST, 5);
+        addBitsToStream(bp, out, HCLEN, 4);
+
+        /*write the code lengths of the code length alphabet*/
+        for (i = 0; i != HCLEN + 4; ++i) addBitsToStream(bp, out, bitlen_cl.data[i], 3);
+
+        /*write the lengths of the lit/len AND the dist alphabet*/
+        for (i = 0; i != bitlen_lld_e.size; ++i)
+        {
+            addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_cl, bitlen_lld_e.data[i]),
+                HuffmanTree_getLength(&tree_cl, bitlen_lld_e.data[i]));
+            /*extra bits of repeat codes*/
+            if (bitlen_lld_e.data[i] == 16) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 2);
+            else if (bitlen_lld_e.data[i] == 17) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 3);
+            else if (bitlen_lld_e.data[i] == 18) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 7);
+        }
+
+        /*write the compressed data symbols*/
+        writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+        /*error: the length of the end code 256 must be larger than 0*/
+        if (HuffmanTree_getLength(&tree_ll, 256) == 0) ERROR_BREAK(64);
+
+        /*write the end code*/
+        addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+
+        break; /*end of error-while*/
+    }
+
+    /*cleanup*/
+    uivector_cleanup(&lz77_encoded);
+    HuffmanTree_cleanup(&tree_ll);
+    HuffmanTree_cleanup(&tree_d);
+    HuffmanTree_cleanup(&tree_cl);
+    uivector_cleanup(&frequencies_ll);
+    uivector_cleanup(&frequencies_d);
+    uivector_cleanup(&frequencies_cl);
+    uivector_cleanup(&bitlen_lld_e);
+    uivector_cleanup(&bitlen_lld);
+    uivector_cleanup(&bitlen_cl);
+
+    return error;
+}
+
+/*
+Deflate for a block of type "fixed", that is, using the predefined huffman trees.
+The positions datapos up to (not including) dataend of data are written to out/bp,
+either LZ77 encoded first (settings->use_lz77) or as plain literals, followed by the
+end code. final is written as the BFINAL bit.
+Returns 0 on success or the error reported by encodeLZ77; in the error case neither
+the data nor the end code is written.
+*/
+static unsigned deflateFixed(ucvector* out, size_t* bp, Hash* hash,
+    const unsigned char* data,
+    size_t datapos, size_t dataend,
+    const LodePNGCompressSettings* settings, unsigned final)
+{
+    HuffmanTree tree_ll; /*tree for literal values and length codes*/
+    HuffmanTree tree_d; /*tree for distance codes*/
+    unsigned error = 0;
+
+    HuffmanTree_init(&tree_ll);
+    HuffmanTree_init(&tree_d);
+    generateFixedLitLenTree(&tree_ll);
+    generateFixedDistanceTree(&tree_d);
+
+    /*block header: BFINAL, then the two bits of BTYPE*/
+    addBitToStream(bp, out, final);
+    addBitToStream(bp, out, 1); /*first bit of BTYPE*/
+    addBitToStream(bp, out, 0); /*second bit of BTYPE*/
+
+    if (!settings->use_lz77)
+    {
+        /*no LZ77, but still will be Huffman compressed: every byte is a literal*/
+        size_t pos = datapos;
+        while (pos < dataend)
+        {
+            const unsigned char literal = data[pos];
+            addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, literal), HuffmanTree_getLength(&tree_ll, literal));
+            ++pos;
+        }
+    }
+    else
+    {
+        /*LZ77 encoded*/
+        uivector encoded;
+        uivector_init(&encoded);
+        error = encodeLZ77(&encoded, hash, data, datapos, dataend, settings->windowsize,
+            settings->minmatch, settings->nicematch, settings->lazymatching);
+        if (error == 0) writeLZ77data(bp, out, &encoded, &tree_ll, &tree_d);
+        uivector_cleanup(&encoded);
+    }
+
+    /*add END code*/
+    if (error == 0)
+    {
+        addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+    }
+
+    /*cleanup*/
+    HuffmanTree_cleanup(&tree_d);
+    HuffmanTree_cleanup(&tree_ll);
+
+    return error;
+}
+
+/*
+Deflates in[0..insize-1] into out, using the block type given by settings->btype:
+0 = stored, 1 = fixed huffman trees (a single block), 2 = dynamic huffman trees
+(the input is split into several blocks).
+Empty input yields exactly one, empty, block.
+Returns 0 on success, 61 for an unknown btype, or the error of hash_init or of the
+block encoder.
+*/
+static unsigned lodepng_deflatev(ucvector* out, const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings)
+{
+    unsigned error = 0;
+    size_t i, blocksize, numdeflateblocks;
+    size_t bp = 0; /*the bit pointer*/
+    Hash hash;
+
+    if (settings->btype > 2) return 61;
+    else if (settings->btype == 0) return deflateNoCompression(out, in, insize);
+    else if (settings->btype == 1) blocksize = insize;
+    else /*if(settings->btype == 2)*/
+    {
+        /*on PNGs, deflate blocks of 65-262k seem to give most dense encoding*/
+        blocksize = insize / 8 + 8;
+        if (blocksize < 65536) blocksize = 65536;
+        if (blocksize > 262144) blocksize = 262144;
+    }
+
+    if (blocksize == 0)
+    {
+        /*btype 1 with empty input: dividing by blocksize would divide by zero*/
+        numdeflateblocks = 1;
+    }
+    else
+    {
+        numdeflateblocks = (insize + blocksize - 1) / blocksize;
+        if (numdeflateblocks == 0) numdeflateblocks = 1;
+    }
+
+    error = hash_init(&hash, settings->windowsize);
+    if (error) return error;
+
+    for (i = 0; i != numdeflateblocks && !error; ++i)
+    {
+        unsigned final = (i == numdeflateblocks - 1);
+        size_t start = i * blocksize;
+        size_t end = start + blocksize;
+        if (end > insize) end = insize;
+
+        if (settings->btype == 1) error = deflateFixed(out, &bp, &hash, in, start, end, settings, final);
+        else if (settings->btype == 2) error = deflateDynamic(out, &bp, &hash, in, start, end, settings, final);
+    }
+
+    hash_cleanup(&hash);
+
+    return error;
+}
+
+/*
+Deflates in[0..insize-1]. *out and *outsize describe the output buffer on entry and
+are updated to the (possibly reallocated) buffer and its new size on return, also
+when an error is returned.
+Returns the error code of lodepng_deflatev.
+*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings)
+{
+    ucvector buffer;
+    unsigned status;
+
+    /*let a ucvector manage the caller's buffer while compressing*/
+    ucvector_init_buffer(&buffer, *out, *outsize);
+    status = lodepng_deflatev(&buffer, in, insize, settings);
+
+    /*hand the result back to the caller*/
+    *outsize = buffer.size;
+    *out = buffer.data;
+    return status;
+}
+
+/*Deflates with the user supplied settings->custom_deflate if one is set, otherwise
+with lodepng_deflate. Arguments and return value are passed through unchanged.*/
+static unsigned deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings)
+{
+    if (!settings->custom_deflate)
+    {
+        return lodepng_deflate(out, outsize, in, insize, settings);
+    }
+    return settings->custom_deflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Adler32                                                                  */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Continues an Adler-32 checksum: adler is the checksum so far (1 for a fresh start),
+data[0..len-1] are the bytes to add. Returns the updated checksum, with the second
+sum in the upper and the first sum in the lower 16 bits.
+*/
+static unsigned update_adler32(unsigned adler, const unsigned char* data, unsigned len)
+{
+    unsigned lo = adler & 0xffff;
+    unsigned hi = (adler >> 16) & 0xffff;
+    const unsigned char* cursor = data;
+    unsigned remaining = len;
+
+    while (remaining != 0)
+    {
+        /*at least 5550 sums can be done before the sums overflow, saving a lot of modulo divisions*/
+        unsigned chunk = remaining;
+        const unsigned char* chunk_end;
+        if (chunk > 5550) chunk = 5550;
+        chunk_end = cursor + chunk;
+        remaining -= chunk;
+
+        for (; cursor != chunk_end; ++cursor)
+        {
+            lo += *cursor;
+            hi += lo;
+        }
+        lo %= 65521;
+        hi %= 65521;
+    }
+
+    return (hi << 16) | lo;
+}
+
+/*Return the adler32 of the bytes data[0..len-1], that is, a checksum started from the
+initial value 1.*/
+static unsigned adler32(const unsigned char* data, unsigned len)
+{
+    const unsigned initial = 1L;
+    return update_adler32(initial, data, len);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Zlib                                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Decompresses a zlib stream: 2 header bytes, deflate data, 4 bytes ADLER32 of the
+decompressed data.
+out, outsize: output buffer and size, handed to inflate and updated by it.
+in, insize: the complete zlib stream.
+settings: ignore_adler32 skips the checksum verification.
+Returns 0 on success, 53 if the input is shorter than the header, 24/25/26 for an
+invalid or unsupported header, the error of inflate, or 58 if the checksum is missing
+or does not match.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    unsigned error = 0;
+    unsigned CM, CINFO, FDICT;
+
+    if (insize < 2) return 53; /*error, size of zlib data too small*/
+                               /*read information from zlib header*/
+    if ((in[0] * 256 + in[1]) % 31 != 0)
+    {
+        /*error: 256 * in[0] + in[1] must be a multiple of 31, the FCHECK value is supposed to be made that way*/
+        return 24;
+    }
+
+    CM = in[0] & 15;
+    CINFO = (in[0] >> 4) & 15;
+    /*FCHECK = in[1] & 31;*/ /*FCHECK is already tested above*/
+    FDICT = (in[1] >> 5) & 1;
+    /*FLEVEL = (in[1] >> 6) & 3;*/ /*FLEVEL is not used here*/
+
+    if (CM != 8 || CINFO > 7)
+    {
+        /*error: only compression method 8: inflate with sliding window of 32k is supported by the PNG spec*/
+        return 25;
+    }
+    if (FDICT != 0)
+    {
+        /*error: the specification of PNG says about the zlib stream:
+        "The additional flags shall not specify a preset dictionary."*/
+        return 26;
+    }
+
+    error = inflate(out, outsize, in + 2, insize - 2, settings);
+    if (error) return error;
+
+    if (!settings->ignore_adler32)
+    {
+        unsigned ADLER32, checksum;
+        /*the 4 checksum bytes come after the 2 header bytes: with fewer than 6 bytes
+        in[insize - 4] would lie inside the header or before the buffer*/
+        if (insize < 6) return 58; /*error, no room for an adler checksum*/
+        ADLER32 = lodepng_read32bitInt(&in[insize - 4]);
+        checksum = adler32(*out, (unsigned)(*outsize));
+        if (checksum != ADLER32) return 58; /*error, adler checksum not correct, data must be corrupted*/
+    }
+
+    return 0; /*no error*/
+}
+
+/*Decompress a zlib stream through settings->custom_zlib when one is set, otherwise through the built-in decoder.*/
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    if (!settings->custom_zlib)
+    {
+        return lodepng_zlib_decompress(out, outsize, in, insize, settings);
+    }
+    return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*
+Compress in[0..insize-1] into a zlib stream: 1 byte CMF (CM+CINFO), 1 byte FLG, deflate data,
+4 byte ADLER32 checksum of the uncompressed data.
+The output is written through a ucvector wrapped around *out / *outsize; both are updated on return.
+Initially, *out must be NULL and *outsize 0; if *out points to a non-allocated buffer, this will crash.
+Returns 0 on success, otherwise the error code reported by deflate (the two header bytes have
+already been pushed to the output in that case).
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    ucvector outv;
+    size_t i;
+    unsigned error;
+    unsigned char* deflatedata = 0;
+    size_t deflatesize = 0;
+
+    unsigned CMF = 120; /*0b01111000: CM 8, CINFO 7. With CINFO 7, any window size up to 32768 can be used.*/
+    unsigned FLEVEL = 0;
+    unsigned FDICT = 0;
+    unsigned CMFFLG = 256 * CMF + FDICT * 32 + FLEVEL * 64;
+    unsigned FCHECK = 31 - CMFFLG % 31; /*makes the 16-bit header value a multiple of 31*/
+    CMFFLG += FCHECK;
+
+    /*ucvector-controlled version of the output buffer, for dynamic array*/
+    ucvector_init_buffer(&outv, *out, *outsize);
+
+    ucvector_push_back(&outv, (unsigned char)(CMFFLG >> 8));
+    ucvector_push_back(&outv, (unsigned char)(CMFFLG & 255));
+
+    error = deflate(&deflatedata, &deflatesize, in, insize, settings);
+
+    if (!error)
+    {
+        unsigned ADLER32 = adler32(in, (unsigned)insize);
+        for (i = 0; i != deflatesize; ++i) ucvector_push_back(&outv, deflatedata[i]);
+        lodepng_add32bitInt(&outv, ADLER32);
+    }
+
+    /*release the temporary buffer on the error path too: deflate may have allocated part of its
+    output before failing; deflatedata is still 0 if it never allocated*/
+    lodepng_free(deflatedata);
+
+    *out = outv.data;
+    *outsize = outv.size;
+
+    return error;
+}
+
+/*Compress to a zlib stream through settings->custom_zlib when one is set, otherwise through the built-in encoder.*/
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    if (!settings->custom_zlib)
+    {
+        return lodepng_zlib_compress(out, outsize, in, insize, settings);
+    }
+    return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#else /*no LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Build without the built-in zlib: decompression is only possible through settings->custom_zlib.*/
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    if (settings->custom_zlib)
+    {
+        return settings->custom_zlib(out, outsize, in, insize, settings);
+    }
+    return 87; /*no custom zlib function provided */
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Build without the built-in zlib: compression is only possible through settings->custom_zlib.*/
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    if (settings->custom_zlib)
+    {
+        return settings->custom_zlib(out, outsize, in, insize, settings);
+    }
+    return 87; /*no custom zlib function provided */
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*default value for LodePNGCompressSettings::windowsize; this is a good tradeoff between speed and compression ratio*/
+#define DEFAULT_WINDOWSIZE 2048
+
+/*Reset every field of *settings to the library's default compression configuration.*/
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings)
+{
+    /*no user-supplied hooks or context by default*/
+    settings->custom_context = 0;
+    settings->custom_deflate = 0;
+    settings->custom_zlib = 0;
+
+    /*matching parameters*/
+    settings->lazymatching = 1;
+    settings->nicematch = 128;
+    settings->minmatch = 3;
+    settings->windowsize = DEFAULT_WINDOWSIZE;
+    settings->use_lz77 = 1;
+
+    /*compress with dynamic huffman tree (not in the mathematical sense, just not the predefined one)*/
+    settings->btype = 2;
+}
+
+/*Constant instance holding the same values lodepng_compress_settings_init assigns. The initializer is
+positional; it assumes the struct declares its fields in the order that function sets them - confirm
+against the header.*/
+const LodePNGCompressSettings lodepng_default_compress_settings = {
+    2, 1, DEFAULT_WINDOWSIZE, 3, 128, 1,
+    0, 0, 0
+};
+
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*Reset every field of *settings to the default decompression configuration: the ADLER32 check is
+enabled and no custom hooks are installed.*/
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings)
+{
+    settings->custom_context = 0;
+    settings->custom_inflate = 0;
+    settings->custom_zlib = 0;
+
+    settings->ignore_adler32 = 0;
+}
+
+/*Constant instance holding the same values lodepng_decompress_settings_init assigns (all four fields zero).*/
+const LodePNGDecompressSettings lodepng_default_decompress_settings = {
+    0,
+    0, 0, 0
+};
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of Zlib related code. Begin of PNG related code.                 // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / CRC32                                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+
+#ifndef LODEPNG_NO_COMPILE_CRC
+/* CRC polynomial: 0xedb88320
+   Precomputed 256-entry lookup table; lodepng_crc32 indexes it with the low byte of
+   (running CRC xor next data byte). */
+static unsigned lodepng_crc32_table[256] = {
+    0u, 1996959894u, 3993919788u, 2567524794u,  124634137u, 1886057615u, 3915621685u, 2657392035u,
+    249268274u, 2044508324u, 3772115230u, 2547177864u,  162941995u, 2125561021u, 3887607047u, 2428444049u,
+    498536548u, 1789927666u, 4089016648u, 2227061214u,  450548861u, 1843258603u, 4107580753u, 2211677639u,
+    325883990u, 1684777152u, 4251122042u, 2321926636u,  335633487u, 1661365465u, 4195302755u, 2366115317u,
+    997073096u, 1281953886u, 3579855332u, 2724688242u, 1006888145u, 1258607687u, 3524101629u, 2768942443u,
+    901097722u, 1119000684u, 3686517206u, 2898065728u,  853044451u, 1172266101u, 3705015759u, 2882616665u,
+    651767980u, 1373503546u, 3369554304u, 3218104598u,  565507253u, 1454621731u, 3485111705u, 3099436303u,
+    671266974u, 1594198024u, 3322730930u, 2970347812u,  795835527u, 1483230225u, 3244367275u, 3060149565u,
+    1994146192u,   31158534u, 2563907772u, 4023717930u, 1907459465u,  112637215u, 2680153253u, 3904427059u,
+    2013776290u,  251722036u, 2517215374u, 3775830040u, 2137656763u,  141376813u, 2439277719u, 3865271297u,
+    1802195444u,  476864866u, 2238001368u, 4066508878u, 1812370925u,  453092731u, 2181625025u, 4111451223u,
+    1706088902u,  314042704u, 2344532202u, 4240017532u, 1658658271u,  366619977u, 2362670323u, 4224994405u,
+    1303535960u,  984961486u, 2747007092u, 3569037538u, 1256170817u, 1037604311u, 2765210733u, 3554079995u,
+    1131014506u,  879679996u, 2909243462u, 3663771856u, 1141124467u,  855842277u, 2852801631u, 3708648649u,
+    1342533948u,  654459306u, 3188396048u, 3373015174u, 1466479909u,  544179635u, 3110523913u, 3462522015u,
+    1591671054u,  702138776u, 2966460450u, 3352799412u, 1504918807u,  783551873u, 3082640443u, 3233442989u,
+    3988292384u, 2596254646u,   62317068u, 1957810842u, 3939845945u, 2647816111u,   81470997u, 1943803523u,
+    3814918930u, 2489596804u,  225274430u, 2053790376u, 3826175755u, 2466906013u,  167816743u, 2097651377u,
+    4027552580u, 2265490386u,  503444072u, 1762050814u, 4150417245u, 2154129355u,  426522225u, 1852507879u,
+    4275313526u, 2312317920u,  282753626u, 1742555852u, 4189708143u, 2394877945u,  397917763u, 1622183637u,
+    3604390888u, 2714866558u,  953729732u, 1340076626u, 3518719985u, 2797360999u, 1068828381u, 1219638859u,
+    3624741850u, 2936675148u,  906185462u, 1090812512u, 3747672003u, 2825379669u,  829329135u, 1181335161u,
+    3412177804u, 3160834842u,  628085408u, 1382605366u, 3423369109u, 3138078467u,  570562233u, 1426400815u,
+    3317316542u, 2998733608u,  733239954u, 1555261956u, 3268935591u, 3050360625u,  752459403u, 1541320221u,
+    2607071920u, 3965973030u, 1969922972u,   40735498u, 2617837225u, 3943577151u, 1913087877u,   83908371u,
+    2512341634u, 3803740692u, 2075208622u,  213261112u, 2463272603u, 3855990285u, 2094854071u,  198958881u,
+    2262029012u, 4057260610u, 1759359992u,  534414190u, 2176718541u, 4139329115u, 1873836001u,  414664567u,
+    2282248934u, 4279200368u, 1711684554u,  285281116u, 2405801727u, 4167216745u, 1634467795u,  376229701u,
+    2685067896u, 3608007406u, 1308918612u,  956543938u, 2808555105u, 3495958263u, 1231636301u, 1047427035u,
+    2932959818u, 3654703836u, 1088359270u,  936918000u, 2847714899u, 3736837829u, 1202900863u,  817233897u,
+    3183342108u, 3401237130u, 1404277552u,  615818150u, 3134207493u, 3453421203u, 1423857449u,  601450431u,
+    3009837614u, 3294710456u, 1567103746u,  711928724u, 3020668471u, 3272380065u, 1510334235u,  755167117u
+};
+
+/*Compute the CRC of data[0..length-1] one byte at a time using lodepng_crc32_table.
+The running value starts at all ones and is inverted again before it is returned.*/
+unsigned lodepng_crc32(const unsigned char* data, size_t length)
+{
+    unsigned crc = 0xffffffffu;
+    size_t remaining;
+    for (remaining = length; remaining != 0; --remaining)
+    {
+        const unsigned slot = (crc ^ *data++) & 0xff;
+        crc = lodepng_crc32_table[slot] ^ (crc >> 8);
+    }
+    return crc ^ 0xffffffffu;
+}
+#else /* !LODEPNG_NO_COMPILE_CRC */
+unsigned lodepng_crc32(const unsigned char* data, size_t length);
+#endif /* !LODEPNG_NO_COMPILE_CRC */
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Reading and writing single bits and bytes from/to stream for LodePNG   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Read the bit at position *bitpointer (bit 0 is the most significant bit of bitstream[0]) and
+advance *bitpointer by one. Returns 0 or 1.*/
+static unsigned char readBitFromReversedStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+    const size_t pos = *bitpointer;
+    const unsigned char byte = bitstream[pos >> 3];
+    const size_t shift = 7 - (pos & 0x7);
+    *bitpointer = pos + 1;
+    return (unsigned char)((byte >> shift) & 1);
+}
+
+/*Read nbits consecutive bits starting at *bitpointer, first bit read ending up most significant,
+and advance *bitpointer by nbits.*/
+static unsigned readBitsFromReversedStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+    unsigned value = 0;
+    size_t remaining = nbits;
+    while (remaining != 0)
+    {
+        value = (value << 1) | (unsigned)readBitFromReversedStream(bitpointer, bitstream);
+        --remaining;
+    }
+    return value;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Write one bit at position *bitpointer (bit 0 is the most significant bit of bitstream[0]) and
+advance *bitpointer by one. The target bit must currently be 0: a nonzero bit is OR-ed in, a zero
+bit leaves the byte untouched.*/
+static void setBitOfReversedStream0(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+    const size_t pos = *bitpointer;
+    if (bit != 0)
+    {
+        bitstream[pos >> 3] |= (bit << (7 - (pos & 0x7)));
+    }
+    *bitpointer = pos + 1;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/*Write one bit at position *bitpointer (bit 0 is the most significant bit of bitstream[0]) and
+advance *bitpointer by one. Works whether the target bit is currently 0 or 1.*/
+static void setBitOfReversedStream(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+    const size_t pos = *bitpointer;
+    unsigned char* const target = &bitstream[pos >> 3];
+    if (bit != 0) *target |= (1 << (7 - (pos & 0x7)));
+    else *target &= (unsigned char)(~(1 << (7 - (pos & 0x7))));
+    *bitpointer = pos + 1;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG chunks                                                             / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Return the value of the 32-bit length field stored in the first 4 bytes of the chunk.*/
+unsigned lodepng_chunk_length(const unsigned char* chunk)
+{
+    const unsigned char* length_field = chunk;
+    return lodepng_read32bitInt(length_field);
+}
+
+/*Copy the 4-byte chunk type (bytes 4..7 of the chunk) into type and null-terminate it.*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk)
+{
+    const unsigned char* name = chunk + 4;
+    type[0] = (char)name[0];
+    type[1] = (char)name[1];
+    type[2] = (char)name[2];
+    type[3] = (char)name[3];
+    type[4] = 0; /*null termination char*/
+}
+
+/*Return 1 if type is exactly 4 characters long and equals the chunk's type bytes (chunk[4..7]), else 0.*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type)
+{
+    unsigned k;
+    if (strlen(type) != 4) return 0;
+    for (k = 0; k != 4; ++k)
+    {
+        if (chunk[4 + k] != type[k]) return 0;
+    }
+    return 1;
+}
+
+/*Return 1 if bit 5 (value 32) of the first type byte, chunk[4], is set, else 0.*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk)
+{
+    return (unsigned char)((chunk[4] >> 5) & 1);
+}
+
+/*Return 1 if bit 5 (value 32) of the third type byte, chunk[6], is set, else 0.*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk)
+{
+    return (unsigned char)((chunk[6] >> 5) & 1);
+}
+
+/*Return 1 if bit 5 (value 32) of the fourth type byte, chunk[7], is set, else 0.*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk)
+{
+    return (unsigned char)((chunk[7] >> 5) & 1);
+}
+
+/*Return a pointer to the chunk's data, which starts 8 bytes in (after the length and type fields).*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk)
+{
+    return chunk + 8;
+}
+
+/*Const variant: return a pointer to the chunk's data, which starts 8 bytes in (after the length and type fields).*/
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk)
+{
+    return chunk + 8;
+}
+
+/*Verify the chunk's CRC. Returns 0 when the stored CRC matches the computed one, 1 otherwise.*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk)
+{
+    const unsigned length = lodepng_chunk_length(chunk);
+    /*the CRC is taken of the data and the 4 chunk type letters, not the length*/
+    const unsigned computed = lodepng_crc32(&chunk[4], length + 4);
+    /*the stored CRC follows the data*/
+    const unsigned stored = lodepng_read32bitInt(&chunk[length + 8]);
+    return (stored == computed) ? 0 : 1;
+}
+
+/*Compute the CRC over the chunk's type and data and store it in the 4 bytes following the data.*/
+void lodepng_chunk_generate_crc(unsigned char* chunk)
+{
+    const unsigned length = lodepng_chunk_length(chunk);
+    unsigned char* const crc_field = chunk + 8 + length;
+    lodepng_set32bitInt(crc_field, lodepng_crc32(&chunk[4], length + 4));
+}
+
+/*Return a pointer to the position just past this chunk: 12 bytes (length, type and CRC fields)
+plus the data length beyond chunk.*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk)
+{
+    const unsigned step = lodepng_chunk_length(chunk) + 12;
+    return chunk + step;
+}
+
+/*Const variant: return a pointer to the position just past this chunk: 12 bytes (length, type and
+CRC fields) plus the data length beyond chunk.*/
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk)
+{
+    const unsigned step = lodepng_chunk_length(chunk) + 12;
+    return chunk + step;
+}
+
+/*
+Append a copy of the complete chunk (length, type, data and CRC fields) to the buffer *out of
+size *outlength, growing the buffer with lodepng_realloc. *out and *outlength are updated on success.
+Returns 0 on success, 77 on integer overflow of the sizes, 83 on allocation failure.
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk)
+{
+    size_t i;
+    unsigned data_length = lodepng_chunk_length(chunk);
+    size_t total_chunk_length, new_length;
+    unsigned char *chunk_start, *new_buffer;
+
+    /*12 = 4 length bytes + 4 type bytes + 4 CRC bytes. Check before adding: computed in unsigned,
+    data_length + 12 would silently wrap for lengths close to 2^32 and escape the test below*/
+    if (data_length > 0xffffffffu - 12u) return 77; /*integer overflow happened*/
+    total_chunk_length = (size_t)data_length + 12u;
+    new_length = (*outlength) + total_chunk_length;
+    if (new_length < total_chunk_length || new_length < (*outlength)) return 77; /*integer overflow happened*/
+
+    new_buffer = (unsigned char*)lodepng_realloc(*out, new_length);
+    if (!new_buffer) return 83; /*alloc fail*/
+    (*out) = new_buffer;
+    (*outlength) = new_length;
+    chunk_start = &(*out)[new_length - total_chunk_length];
+
+    for (i = 0; i != total_chunk_length; ++i) chunk_start[i] = chunk[i];
+
+    return 0;
+}
+
+/*
+Append a new chunk to the buffer *out of size *outlength, growing the buffer with lodepng_realloc.
+length: number of data bytes; type: the 4 chunk type characters (only type[0..3] are read);
+data: the length bytes of chunk data. The CRC is generated and stored after the data.
+*out and *outlength are updated on success.
+Returns 0 on success, 77 on integer overflow of the sizes, 83 on allocation failure.
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+    const char* type, const unsigned char* data)
+{
+    size_t i;
+    size_t total_chunk_length, new_length;
+    unsigned char *chunk, *new_buffer;
+
+    /*12 = 4 length bytes + 4 type bytes + 4 CRC bytes. Check before adding: computed in unsigned,
+    length + 12 would silently wrap for lengths close to 2^32 and escape the test below*/
+    if (length > 0xffffffffu - 12u) return 77; /*integer overflow happened*/
+    total_chunk_length = (size_t)length + 12u;
+    new_length = (*outlength) + total_chunk_length;
+    if (new_length < total_chunk_length || new_length < (*outlength)) return 77; /*integer overflow happened*/
+
+    new_buffer = (unsigned char*)lodepng_realloc(*out, new_length);
+    if (!new_buffer) return 83; /*alloc fail*/
+    (*out) = new_buffer;
+    (*outlength) = new_length;
+    chunk = &(*out)[new_length - total_chunk_length];
+
+    /*1: length*/
+    lodepng_set32bitInt(chunk, length);
+
+    /*2: chunk name (4 letters)*/
+    chunk[4] = (unsigned char)type[0];
+    chunk[5] = (unsigned char)type[1];
+    chunk[6] = (unsigned char)type[2];
+    chunk[7] = (unsigned char)type[3];
+
+    /*3: the data*/
+    for (i = 0; i != length; ++i) chunk[8 + i] = data[i];
+
+    /*4: CRC (of the chunkname characters and the data)*/
+    lodepng_chunk_generate_crc(chunk);
+
+    return 0;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Color types and such                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Check whether bit depth bd is allowed for the given color type.
+Returns a LodePNG error code: 0 if the combination is allowed, 37 if the bit depth is not valid
+for that color type, 31 if the color type itself is unknown.*/
+static unsigned checkColorValidity(LodePNGColorType colortype, unsigned bd) /*bd = bitdepth*/
+{
+    switch (colortype)
+    {
+    case 0: /*grey*/
+        return (bd == 1 || bd == 2 || bd == 4 || bd == 8 || bd == 16) ? 0 : 37;
+    case 3: /*palette*/
+        return (bd == 1 || bd == 2 || bd == 4 || bd == 8) ? 0 : 37;
+    case 2: /*RGB*/
+    case 4: /*grey + alpha*/
+    case 6: /*RGBA*/
+        return (bd == 8 || bd == 16) ? 0 : 37;
+    default:
+        return 31;
+    }
+}
+
+/*Return the number of channels per pixel for a color type, or 0 for an unknown color type.*/
+static unsigned getNumColorChannels(LodePNGColorType colortype)
+{
+    unsigned channels;
+    switch (colortype)
+    {
+    case 0: /*grey*/
+    case 3: /*palette*/
+        channels = 1;
+        break;
+    case 4: /*grey + alpha*/
+        channels = 2;
+        break;
+    case 2: /*RGB*/
+        channels = 3;
+        break;
+    case 6: /*RGBA*/
+        channels = 4;
+        break;
+    default: /*unexisting color type*/
+        channels = 0;
+        break;
+    }
+    return channels;
+}
+
+/*Return the bits per pixel for a color type and bit depth: amount of channels * bits per channel.*/
+static unsigned lodepng_get_bpp_lct(LodePNGColorType colortype, unsigned bitdepth)
+{
+    const unsigned channels = getNumColorChannels(colortype);
+    return channels * bitdepth;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Initialize *info to 8-bit RGBA with no color key and no palette.*/
+void lodepng_color_mode_init(LodePNGColorMode* info)
+{
+    info->colortype = LCT_RGBA;
+    info->bitdepth = 8;
+
+    info->palette = 0;
+    info->palettesize = 0;
+
+    info->key_defined = 0;
+    info->key_r = 0;
+    info->key_g = 0;
+    info->key_b = 0;
+}
+
+/*Release the resources held by *info; the palette is the only thing cleared here.*/
+void lodepng_color_mode_cleanup(LodePNGColorMode* info)
+{
+    lodepng_palette_clear(info);
+}
+
+/*
+Make *dest a deep copy of *source. dest is cleaned up first, so it must be an initialized color mode.
+The palette is duplicated into a fresh 1024-byte buffer (room for 256 colors with 4 bytes each).
+Returns 0 on success, 83 on allocation failure.
+*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source)
+{
+    size_t i;
+    lodepng_color_mode_cleanup(dest);
+    *dest = *source;
+    if (source->palette)
+    {
+        dest->palette = (unsigned char*)lodepng_malloc(1024);
+        if (!dest->palette && source->palettesize)
+        {
+            /*keep dest consistent: a nonzero palettesize with a NULL palette would make later
+            palette reads dereference NULL*/
+            dest->palettesize = 0;
+            return 83; /*alloc fail*/
+        }
+        for (i = 0; i != source->palettesize * 4; ++i) dest->palette[i] = source->palette[i];
+    }
+    return 0;
+}
+
+/*Return 1 if both color modes are identical, 0 otherwise. Compared are: color type, bit depth,
+the color key (its r/g/b values only when a key is defined) and the palette.
+The palettes must have the same size and the same bytes; an empty palette is NOT treated as
+matching any other palette.*/
+static int lodepng_color_mode_equal(const LodePNGColorMode* a, const LodePNGColorMode* b)
+{
+    size_t k;
+    size_t palette_bytes;
+
+    if (a->colortype != b->colortype || a->bitdepth != b->bitdepth) return 0;
+    if (a->key_defined != b->key_defined) return 0;
+    if (a->key_defined && (a->key_r != b->key_r || a->key_g != b->key_g || a->key_b != b->key_b))
+    {
+        return 0;
+    }
+
+    if (a->palettesize != b->palettesize) return 0;
+    palette_bytes = a->palettesize * 4;
+    for (k = 0; k != palette_bytes; ++k)
+    {
+        if (a->palette[k] != b->palette[k]) return 0;
+    }
+    return 1;
+}
+
+/*Free the palette buffer, if any, and reset the palette to empty.*/
+void lodepng_palette_clear(LodePNGColorMode* info)
+{
+    if (info->palette != 0)
+    {
+        lodepng_free(info->palette);
+        info->palette = 0;
+    }
+    info->palettesize = 0;
+}
+
+/*
+Append one RGBA color to the palette of *info.
+The palette buffer is allocated on first use with room for 256 colors with 4 bytes each, so for a
+palette with the max of 256 colors it has the exact alloc size and never needs to grow.
+Returns 0 on success, 83 on allocation failure, 38 when the palette already holds 256 colors.
+NOTE(review): 38 is assumed to be this LodePNG version's "palette too big" code - confirm against
+lodepng_error_text.
+*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    unsigned char* data;
+    if (!info->palette) /*allocate palette if empty*/
+    {
+        /*room for 256 colors with 4 bytes each*/
+        data = (unsigned char*)lodepng_malloc(1024);
+        if (!data) return 83; /*alloc fail*/
+        info->palette = data;
+    }
+    /*the buffer is fixed at 1024 bytes: a 257th color would be written past its end*/
+    if (info->palettesize >= 256) return 38;
+
+    info->palette[4 * info->palettesize + 0] = r;
+    info->palette[4 * info->palettesize + 1] = g;
+    info->palette[4 * info->palettesize + 2] = b;
+    info->palette[4 * info->palettesize + 3] = a;
+    ++info->palettesize;
+    return 0;
+}
+
+/*Return the bits per pixel of a color mode: number of channels of its color type times its bit depth.*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info)
+{
+    const unsigned channels = getNumColorChannels(info->colortype);
+    return channels * info->bitdepth;
+}
+
+/*Return the number of channels of the color mode's color type (0 for an unknown color type).*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info)
+{
+    return getNumColorChannels(info->colortype);
+}
+
+/*Return 1 if the color type is grey or grey + alpha, 0 otherwise.*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info)
+{
+    if (info->colortype == LCT_GREY) return 1;
+    if (info->colortype == LCT_GREY_ALPHA) return 1;
+    return 0;
+}
+
+/*Return 1 if the color type value has bit 2 (value 4) set, which is the case for types 4 and 6, else 0.*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info)
+{
+    const unsigned alpha_bit = info->colortype & 4; /*4 or 6*/
+    return alpha_bit != 0;
+}
+
+/*Return 1 if the color type is the palette type, 0 otherwise.*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info)
+{
+    return (info->colortype == LCT_PALETTE) ? 1u : 0u;
+}
+
+/*Return 1 if any palette entry has an alpha byte below 255, 0 otherwise (also 0 for an empty palette).*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info)
+{
+    size_t entry = 0;
+    while (entry != info->palettesize)
+    {
+        /*each entry is 4 bytes; the alpha byte is the last one*/
+        if (info->palette[4 * entry + 3] != 255) return 1;
+        ++entry;
+    }
+    return 0;
+}
+
+/*Return 1 if the color mode can represent transparency: a color key is defined, the color type
+has an alpha channel, or the palette contains a non-opaque entry. Returns 0 otherwise.*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info)
+{
+    if (info->key_defined) return 1;
+    if (lodepng_is_alpha_type(info)) return 1;
+    return lodepng_has_palette_alpha(info) ? 1 : 0;
+}
+
+/*
+Return the number of bytes needed for a raw image buffer of w x h pixels in the given color mode,
+with the total bit count rounded up to whole bytes.
+*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+    size_t bpp = lodepng_get_bpp(color);
+    /*widen before multiplying: w * h evaluated in unsigned would be truncated before the
+    assignment whenever size_t is wider than unsigned*/
+    size_t n = (size_t)w * (size_t)h;
+    /*split into whole groups of 8 pixels and the remainder so that n * bpp is never formed directly*/
+    return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+/*
+Return the number of bytes needed for a raw image buffer of w x h pixels with the given color type
+and bit depth, with the total bit count rounded up to whole bytes.
+*/
+size_t lodepng_get_raw_size_lct(unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+    size_t bpp = lodepng_get_bpp_lct(colortype, bitdepth);
+    /*widen before multiplying: w * h evaluated in unsigned would be truncated before the
+    assignment whenever size_t is wider than unsigned*/
+    size_t n = (size_t)w * (size_t)h;
+    /*split into whole groups of 8 pixels and the remainder so that n * bpp is never formed directly*/
+    return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_DECODER
+/*Return the size in bytes of h scanlines of w pixels when every scanline is padded to a whole
+number of bytes: in an idat chunk, each scanline is a multiple of 8 bits, unlike the lodepng
+output buffer.*/
+static size_t lodepng_get_raw_size_idat(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+    /*will not overflow for any color type if roughly w * h < 268435455*/
+    const size_t bpp = lodepng_get_bpp(color);
+    const size_t whole_bytes = (w / 8) * bpp;
+    const size_t tail_bytes = ((w & 7) * bpp + 7) / 8;
+    const size_t line = whole_bytes + tail_bytes;
+    return h * line;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*Mark all three unknown-chunk slots of *info as empty (null data pointer, size 0).*/
+static void LodePNGUnknownChunks_init(LodePNGInfo* info)
+{
+    unsigned slot;
+    for (slot = 0; slot < 3; ++slot)
+    {
+        info->unknown_chunks_data[slot] = 0;
+        info->unknown_chunks_size[slot] = 0;
+    }
+}
+
+/*Free the data buffers of all three unknown-chunk slots. The pointers and sizes are left as they are.*/
+static void LodePNGUnknownChunks_cleanup(LodePNGInfo* info)
+{
+    unsigned slot = 0;
+    while (slot < 3)
+    {
+        lodepng_free(info->unknown_chunks_data[slot]);
+        ++slot;
+    }
+}
+
+/*
+Replace the three unknown-chunk buffers of *dest with copies of those in *src.
+dest's previous buffers are freed first.
+Returns 0 on success, 83 on allocation failure; after a failure the slots not yet copied are
+empty (null data, size 0), so dest can still be cleaned up safely.
+*/
+static unsigned LodePNGUnknownChunks_copy(LodePNGInfo* dest, const LodePNGInfo* src)
+{
+    unsigned i;
+
+    LodePNGUnknownChunks_cleanup(dest);
+    /*cleanup leaves the freed pointers in place; reset them so an early return below cannot
+    leave dangling pointers behind for a later cleanup to free again*/
+    LodePNGUnknownChunks_init(dest);
+
+    for (i = 0; i != 3; ++i)
+    {
+        size_t j;
+        const size_t size = src->unknown_chunks_size[i];
+        unsigned char* copy = (unsigned char*)lodepng_malloc(size);
+        if (!copy && size) return 83; /*alloc fail*/
+        for (j = 0; j < size; ++j)
+        {
+            copy[j] = src->unknown_chunks_data[i][j];
+        }
+        /*publish data and size together so a slot never advertises a size without a buffer*/
+        dest->unknown_chunks_data[i] = copy;
+        dest->unknown_chunks_size[i] = size;
+    }
+
+    return 0;
+}
+
+/******************************************************************************/
+
+/*Set the text chunk list of *info to empty: no key/string arrays and a count of 0.*/
+static void LodePNGText_init(LodePNGInfo* info)
+{
+    info->text_strings = NULL;
+    info->text_keys = NULL;
+    info->text_num = 0;
+}
+
+/*Free every text key and string of *info, then the two arrays that hold them.
+The pointers and the count in *info are left as they are.*/
+static void LodePNGText_cleanup(LodePNGInfo* info)
+{
+    size_t remaining = info->text_num;
+    while (remaining != 0)
+    {
+        --remaining;
+        string_cleanup(&info->text_keys[remaining]);
+        string_cleanup(&info->text_strings[remaining]);
+    }
+    lodepng_free(info->text_keys);
+    lodepng_free(info->text_strings);
+}
+
+/*Rebuild the text list of *dest as a copy of the one in *source by adding the entries one by one.
+dest's previous text pointers are overwritten without being freed.
+Returns 0 on success, or the first error returned by lodepng_add_text.*/
+static unsigned LodePNGText_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    size_t entry;
+    dest->text_num = 0;
+    dest->text_strings = 0;
+    dest->text_keys = 0;
+    for (entry = 0; entry < source->text_num; ++entry)
+    {
+        CERROR_TRY_RETURN(lodepng_add_text(dest, source->text_keys[entry], source->text_strings[entry]));
+    }
+    return 0;
+}
+
+/*Remove all text entries from *info and leave it with an empty, reusable text list.*/
+void lodepng_clear_text(LodePNGInfo* info)
+{
+    LodePNGText_cleanup(info);
+    /*cleanup frees the arrays but keeps the stale pointers and count; reset them so that a later
+    lodepng_add_text or cleanup does not touch freed memory*/
+    LodePNGText_init(info);
+}
+
+/*
+Append one key/string pair to the text list of *info. Both arrays are grown by one slot and the
+new slots are filled with string_init followed by string_set on key and str.
+Returns 0 on success, 83 on allocation failure (the list then still has its previous entries).
+*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str)
+{
+    char** new_keys = (char**)(lodepng_realloc(info->text_keys, sizeof(char*) * (info->text_num + 1)));
+    char** new_strings = (char**)(lodepng_realloc(info->text_strings, sizeof(char*) * (info->text_num + 1)));
+
+    /*a successful realloc may have moved the array, so adopt each new pointer right away. Freeing
+    the one that succeeded when the other failed would leave info pointing at released memory*/
+    if (new_keys) info->text_keys = new_keys;
+    if (new_strings) info->text_strings = new_strings;
+    if (!new_keys || !new_strings) return 83; /*alloc fail*/
+
+    ++info->text_num;
+
+    string_init(&info->text_keys[info->text_num - 1]);
+    string_set(&info->text_keys[info->text_num - 1], key);
+
+    string_init(&info->text_strings[info->text_num - 1]);
+    string_set(&info->text_strings[info->text_num - 1], str);
+
+    return 0;
+}
+
+/******************************************************************************/
+
+/*Set the international text list of *info to empty: no arrays and a count of 0.*/
+static void LodePNGIText_init(LodePNGInfo* info)
+{
+    info->itext_strings = NULL;
+    info->itext_transkeys = NULL;
+    info->itext_langtags = NULL;
+    info->itext_keys = NULL;
+    info->itext_num = 0;
+}
+
+/*Free every key, language tag, translated key and string of the international text list, then
+the four arrays that hold them. The pointers and the count in *info are left as they are.*/
+static void LodePNGIText_cleanup(LodePNGInfo* info)
+{
+    size_t remaining = info->itext_num;
+    while (remaining != 0)
+    {
+        --remaining;
+        string_cleanup(&info->itext_keys[remaining]);
+        string_cleanup(&info->itext_langtags[remaining]);
+        string_cleanup(&info->itext_transkeys[remaining]);
+        string_cleanup(&info->itext_strings[remaining]);
+    }
+    lodepng_free(info->itext_keys);
+    lodepng_free(info->itext_langtags);
+    lodepng_free(info->itext_transkeys);
+    lodepng_free(info->itext_strings);
+}
+
+/*Rebuild the international text list of *dest as a copy of the one in *source by adding the
+entries one by one. dest's previous itext pointers are overwritten without being freed.
+Returns 0 on success, or the first error returned by lodepng_add_itext.*/
+static unsigned LodePNGIText_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    size_t entry;
+    dest->itext_num = 0;
+    dest->itext_strings = 0;
+    dest->itext_transkeys = 0;
+    dest->itext_langtags = 0;
+    dest->itext_keys = 0;
+    for (entry = 0; entry < source->itext_num; ++entry)
+    {
+        CERROR_TRY_RETURN(lodepng_add_itext(dest, source->itext_keys[entry], source->itext_langtags[entry],
+            source->itext_transkeys[entry], source->itext_strings[entry]));
+    }
+    return 0;
+}
+
+/*Remove all international text entries from *info and leave it with an empty, reusable list.*/
+void lodepng_clear_itext(LodePNGInfo* info)
+{
+    LodePNGIText_cleanup(info);
+    /*cleanup frees the arrays but keeps the stale pointers and count; reset them so that a later
+    lodepng_add_itext or cleanup does not touch freed memory*/
+    LodePNGIText_init(info);
+}
+
+/*
+Append one entry (key, language tag, translated key, string) to the international text list of
+*info. All four arrays are grown by one slot and the new slots are filled with string_init
+followed by string_set.
+Returns 0 on success, 83 on allocation failure (the list then still has its previous entries).
+*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+    const char* transkey, const char* str)
+{
+    char** new_keys = (char**)(lodepng_realloc(info->itext_keys, sizeof(char*) * (info->itext_num + 1)));
+    char** new_langtags = (char**)(lodepng_realloc(info->itext_langtags, sizeof(char*) * (info->itext_num + 1)));
+    char** new_transkeys = (char**)(lodepng_realloc(info->itext_transkeys, sizeof(char*) * (info->itext_num + 1)));
+    char** new_strings = (char**)(lodepng_realloc(info->itext_strings, sizeof(char*) * (info->itext_num + 1)));
+
+    /*a successful realloc may have moved the array, so adopt each new pointer right away. Freeing
+    the ones that succeeded when another failed would leave info pointing at released memory*/
+    if (new_keys) info->itext_keys = new_keys;
+    if (new_langtags) info->itext_langtags = new_langtags;
+    if (new_transkeys) info->itext_transkeys = new_transkeys;
+    if (new_strings) info->itext_strings = new_strings;
+    if (!new_keys || !new_langtags || !new_transkeys || !new_strings) return 83; /*alloc fail*/
+
+    ++info->itext_num;
+
+    string_init(&info->itext_keys[info->itext_num - 1]);
+    string_set(&info->itext_keys[info->itext_num - 1], key);
+
+    string_init(&info->itext_langtags[info->itext_num - 1]);
+    string_set(&info->itext_langtags[info->itext_num - 1], langtag);
+
+    string_init(&info->itext_transkeys[info->itext_num - 1]);
+    string_set(&info->itext_transkeys[info->itext_num - 1], transkey);
+
+    string_init(&info->itext_strings[info->itext_num - 1]);
+    string_set(&info->itext_strings[info->itext_num - 1], str);
+
+    return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Initialize *info: default color mode, method fields set to 0 and, when ancillary chunks are
+compiled in, no background/time/phys data, empty text lists and empty unknown-chunk slots.*/
+void lodepng_info_init(LodePNGInfo* info)
+{
+    info->filter_method = 0;
+    info->compression_method = 0;
+    info->interlace_method = 0;
+    lodepng_color_mode_init(&info->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    LodePNGUnknownChunks_init(info);
+    LodePNGIText_init(info);
+    LodePNGText_init(info);
+
+    info->phys_defined = 0;
+    info->time_defined = 0;
+
+    info->background_defined = 0;
+    info->background_r = 0;
+    info->background_g = 0;
+    info->background_b = 0;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+/*Release everything owned by *info: the color mode and, when ancillary chunks are compiled in,
+the unknown-chunk buffers and both text lists.*/
+void lodepng_info_cleanup(LodePNGInfo* info)
+{
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    LodePNGUnknownChunks_cleanup(info);
+
+    LodePNGIText_cleanup(info);
+    LodePNGText_cleanup(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    lodepng_color_mode_cleanup(&info->color);
+}
+
+/*
+Make *dest a deep copy of *source. dest is cleaned up first, so it must be an initialized info.
+Returns 0 on success, or the first error code reported by one of the sub-copies. On failure dest
+holds only memory it owns itself, so it can still be passed to lodepng_info_cleanup.
+*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    lodepng_info_cleanup(dest);
+    *dest = *source;
+
+    /*the struct assignment copied source's owned pointers into dest. Detach all of them before
+    the first step that can fail; otherwise an early return would leave dest aliasing source's
+    buffers and cleaning up both would free them twice*/
+    lodepng_color_mode_init(&dest->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    LodePNGText_init(dest);
+    LodePNGIText_init(dest);
+    LodePNGUnknownChunks_init(dest);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+    CERROR_TRY_RETURN(lodepng_color_mode_copy(&dest->color, &source->color));
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    CERROR_TRY_RETURN(LodePNGText_copy(dest, source));
+    CERROR_TRY_RETURN(LodePNGIText_copy(dest, source));
+    CERROR_TRY_RETURN(LodePNGUnknownChunks_copy(dest, source));
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    return 0;
+}
+
+/*Exchange the contents of *a and *b by plain struct assignment; owned pointers simply change owner.*/
+void lodepng_info_swap(LodePNGInfo* a, LodePNGInfo* b)
+{
+    LodePNGInfo held = *b;
+    *b = *a;
+    *a = held;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Store one bitgroup into a packed octet array, first group of each byte in the most significant bits.
+index: bitgroup index, bits: bitgroup size (1, 2 or 4), in: bitgroup value, out: octet array to add bits to.
+The first group of a byte overwrites the whole byte; the following groups are OR-ed into it.*/
+static void addColorBits(unsigned char* out, size_t index, unsigned bits, unsigned in)
+{
+    unsigned last_slot; /*8 / bits - 1*/
+    unsigned slot;      /*position of this group within its byte, 0 = first*/
+    unsigned value;
+    unsigned char* target = &out[index * bits / 8];
+
+    if (bits == 1) last_slot = 7;
+    else if (bits == 2) last_slot = 3;
+    else last_slot = 1;
+
+    slot = index & last_slot;
+    value = in & ((1u << bits) - 1u); /*filter out any other bits of the input value*/
+    value = value << (bits * (last_slot - slot));
+
+    if (slot == 0) *target = value;
+    else *target |= value;
+}
+
+typedef struct ColorTree ColorTree;
+
+/*
+One node of a color tree
+This is the data structure used to count the number of unique colors and to get a palette
+index for a color. It's like an octree, but because the alpha channel is used too, each
+node has 16 instead of 8 children.
+The child slot at each level is selected by one bit of each of r, g, b and a (see color_tree_get),
+so a full color is reached after 8 levels.
+*/
+struct ColorTree
+{
+    ColorTree* children[16]; /*up to 16 pointers to ColorTree of next level; 0 where there is no child*/
+    int index; /*the payload. Only has a meaningful value if this is in the last level; initialized to -1*/
+};
+
+/*Initialize a node: no children and index -1, which means "no color stored here".*/
+static void color_tree_init(ColorTree* tree)
+{
+    int slot = 0;
+    tree->index = -1;
+    while (slot < 16)
+    {
+        tree->children[slot] = 0;
+        ++slot;
+    }
+}
+
+/*Recursively free all descendants of tree. The node itself is not freed and its child pointers
+are left as they are.*/
+static void color_tree_cleanup(ColorTree* tree)
+{
+    int slot;
+    for (slot = 0; slot < 16; ++slot)
+    {
+        ColorTree* child = tree->children[slot];
+        if (child == 0) continue;
+        color_tree_cleanup(child);
+        lodepng_free(child);
+    }
+}
+
+/*Look up a color by walking 8 levels of the tree, one bit of each channel per level, starting
+with the least significant bit. Returns -1 if color not present, its index otherwise.*/
+static int color_tree_get(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    ColorTree* node = tree;
+    int level;
+    for (level = 0; level != 8; ++level)
+    {
+        const int slot = (((r >> level) & 1) << 3) | (((g >> level) & 1) << 2)
+            | (((b >> level) & 1) << 1) | ((a >> level) & 1);
+        node = node->children[slot];
+        if (!node) return -1;
+    }
+    return node->index;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Return 1 if the color is stored in the tree with a non-negative index, 0 otherwise.*/
+static int color_tree_has(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    const int found_index = color_tree_get(tree, r, g, b, a);
+    return found_index >= 0;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*Insert a color into the tree, creating the missing nodes on the way down, and store index in
+the final node.
+color is not allowed to already exist.
+Index should be >= 0 (it's signed to be compatible with using -1 for "doesn't exist").
+If a node cannot be allocated the function returns early without storing the color, so a later
+color_tree_get for it returns -1.*/
+static void color_tree_add(ColorTree* tree,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a, unsigned index)
+{
+    int bit;
+    for (bit = 0; bit < 8; ++bit)
+    {
+        int i = 8 * ((r >> bit) & 1) + 4 * ((g >> bit) & 1) + 2 * ((b >> bit) & 1) + 1 * ((a >> bit) & 1);
+        if (!tree->children[i])
+        {
+            ColorTree* child = (ColorTree*)lodepng_malloc(sizeof(ColorTree));
+            /*alloc fail: the signature has no way to report it, but initializing a null node would
+            crash. Nodes created so far stay linked and are released by color_tree_cleanup*/
+            if (!child) return;
+            color_tree_init(child);
+            tree->children[i] = child;
+        }
+        tree = tree->children[i];
+    }
+    tree->index = (int)index;
+}
+
+/*Writes one pixel, given as 8-bit RGBA, at pixel index i of `out`, which uses color mode `mode`.
+Grey modes take the grey level from the red value only. Palette mode looks the color up in `tree`.
+16-bit output repeats each 8-bit value in both bytes of the sample.
+Returns 0 on success, or 82 if the color is not in the palette tree.*/
+static unsigned rgba8ToPixel(unsigned char* out, size_t i,
+    const LodePNGColorMode* mode, ColorTree* tree /*for palette*/,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    const unsigned bits = mode->bitdepth;
+
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+    {
+        unsigned char grey = r; /*red stands in for the grey level, no averaging of channels*/
+        if (bits == 8)
+        {
+            out[i] = grey;
+        }
+        else if (bits == 16)
+        {
+            unsigned char* px = out + i * 2;
+            px[0] = px[1] = grey;
+        }
+        else
+        {
+            /*keep only the `bits` most significant bits of grey*/
+            unsigned char top = (grey >> (8 - bits)) & ((1 << bits) - 1);
+            addColorBits(out, i, bits, top);
+        }
+        break;
+    }
+    case LCT_RGB:
+        if (bits == 8)
+        {
+            unsigned char* px = out + i * 3;
+            px[0] = r;
+            px[1] = g;
+            px[2] = b;
+        }
+        else
+        {
+            unsigned char* px = out + i * 6;
+            px[0] = px[1] = r;
+            px[2] = px[3] = g;
+            px[4] = px[5] = b;
+        }
+        break;
+    case LCT_PALETTE:
+    {
+        int index = color_tree_get(tree, r, g, b, a);
+        if (index < 0) return 82; /*color not in palette*/
+        if (bits == 8) out[i] = (unsigned char)index;
+        else addColorBits(out, i, bits, (unsigned)index);
+        break;
+    }
+    case LCT_GREY_ALPHA:
+    {
+        unsigned char grey = r; /*red stands in for the grey level, no averaging of channels*/
+        if (bits == 8)
+        {
+            unsigned char* px = out + i * 2;
+            px[0] = grey;
+            px[1] = a;
+        }
+        else if (bits == 16)
+        {
+            unsigned char* px = out + i * 4;
+            px[0] = px[1] = grey;
+            px[2] = px[3] = a;
+        }
+        break;
+    }
+    case LCT_RGBA:
+        if (bits == 8)
+        {
+            unsigned char* px = out + i * 4;
+            px[0] = r;
+            px[1] = g;
+            px[2] = b;
+            px[3] = a;
+        }
+        else
+        {
+            unsigned char* px = out + i * 8;
+            px[0] = px[1] = r;
+            px[2] = px[3] = g;
+            px[4] = px[5] = b;
+            px[6] = px[7] = a;
+        }
+        break;
+    default:
+        break; /*other color types: nothing is written*/
+    }
+
+    return 0; /*no error*/
+}
+
+/*Writes one pixel, given as 16-bit RGBA, at pixel index i of `out`, which must use a 16-bit color
+mode. Each sample is stored most significant byte first. Grey modes take the grey level from the
+red value only. Palette mode is not handled: nothing is written for it.*/
+static void rgba16ToPixel(unsigned char* out, size_t i,
+    const LodePNGColorMode* mode,
+    unsigned short r, unsigned short g, unsigned short b, unsigned short a)
+{
+    unsigned char* px;
+
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+        px = out + i * 2;
+        px[0] = (unsigned char)((r >> 8) & 255);
+        px[1] = (unsigned char)(r & 255);
+        break;
+    case LCT_RGB:
+        px = out + i * 6;
+        px[0] = (unsigned char)((r >> 8) & 255);
+        px[1] = (unsigned char)(r & 255);
+        px[2] = (unsigned char)((g >> 8) & 255);
+        px[3] = (unsigned char)(g & 255);
+        px[4] = (unsigned char)((b >> 8) & 255);
+        px[5] = (unsigned char)(b & 255);
+        break;
+    case LCT_GREY_ALPHA:
+        px = out + i * 4;
+        px[0] = (unsigned char)((r >> 8) & 255);
+        px[1] = (unsigned char)(r & 255);
+        px[2] = (unsigned char)((a >> 8) & 255);
+        px[3] = (unsigned char)(a & 255);
+        break;
+    case LCT_RGBA:
+        px = out + i * 8;
+        px[0] = (unsigned char)((r >> 8) & 255);
+        px[1] = (unsigned char)(r & 255);
+        px[2] = (unsigned char)((g >> 8) & 255);
+        px[3] = (unsigned char)(g & 255);
+        px[4] = (unsigned char)((b >> 8) & 255);
+        px[5] = (unsigned char)(b & 255);
+        px[6] = (unsigned char)((a >> 8) & 255);
+        px[7] = (unsigned char)(a & 255);
+        break;
+    default:
+        break; /*other color types: nothing is written*/
+    }
+}
+
+/*Reads the pixel with index i (y * width + x) from the raw image `in`, whose color mode is `mode`,
+and stores it as 8-bit RGBA through r, g, b and a.
+16-bit samples contribute only their most significant byte. For grey and RGB modes alpha becomes 0
+when the pixel matches the mode's color key and 255 otherwise. A palette index that lies outside
+the palette gives opaque black.*/
+static void getPixelColorRGBA8(unsigned char* r, unsigned char* g,
+    unsigned char* b, unsigned char* a,
+    const unsigned char* in, size_t i,
+    const LodePNGColorMode* mode)
+{
+    const unsigned bits = mode->bitdepth;
+
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+        if (bits == 8)
+        {
+            *r = *g = *b = in[i];
+            *a = (mode->key_defined && *r == mode->key_r) ? 0 : 255;
+        }
+        else if (bits == 16)
+        {
+            const unsigned char* px = in + i * 2;
+            *r = *g = *b = px[0];
+            *a = (mode->key_defined && 256U * px[0] + px[1] == mode->key_r) ? 0 : 255;
+        }
+        else
+        {
+            unsigned highest = ((1U << bits) - 1U); /*highest possible value for this bit depth*/
+            size_t bitpos = i * bits;
+            unsigned value = readBitsFromReversedStream(&bitpos, in, bits);
+            *r = *g = *b = (value * 255) / highest;
+            *a = (mode->key_defined && value == mode->key_r) ? 0 : 255;
+        }
+        break;
+    case LCT_RGB:
+        if (bits == 8)
+        {
+            const unsigned char* px = in + i * 3;
+            *r = px[0];
+            *g = px[1];
+            *b = px[2];
+            *a = (mode->key_defined && *r == mode->key_r && *g == mode->key_g && *b == mode->key_b) ? 0 : 255;
+        }
+        else
+        {
+            const unsigned char* px = in + i * 6;
+            *r = px[0];
+            *g = px[2];
+            *b = px[4];
+            /*the color key is compared against the full 16-bit samples*/
+            *a = (mode->key_defined
+                && 256U * px[0] + px[1] == mode->key_r
+                && 256U * px[2] + px[3] == mode->key_g
+                && 256U * px[4] + px[5] == mode->key_b) ? 0 : 255;
+        }
+        break;
+    case LCT_PALETTE:
+    {
+        unsigned index;
+        if (bits == 8)
+        {
+            index = in[i];
+        }
+        else
+        {
+            size_t bitpos = i * bits;
+            index = readBitsFromReversedStream(&bitpos, in, bits);
+        }
+
+        if (index < mode->palettesize)
+        {
+            const unsigned char* entry = mode->palette + index * 4;
+            *r = entry[0];
+            *g = entry[1];
+            *b = entry[2];
+            *a = entry[3];
+        }
+        else
+        {
+            /*This is an error according to the PNG spec, but common PNG decoders make it black instead.
+            Done here too, slightly faster due to no error handling needed.*/
+            *r = *g = *b = 0;
+            *a = 255;
+        }
+        break;
+    }
+    case LCT_GREY_ALPHA:
+        if (bits == 8)
+        {
+            const unsigned char* px = in + i * 2;
+            *r = *g = *b = px[0];
+            *a = px[1];
+        }
+        else
+        {
+            const unsigned char* px = in + i * 4;
+            *r = *g = *b = px[0];
+            *a = px[2];
+        }
+        break;
+    case LCT_RGBA:
+        if (bits == 8)
+        {
+            const unsigned char* px = in + i * 4;
+            *r = px[0];
+            *g = px[1];
+            *b = px[2];
+            *a = px[3];
+        }
+        else
+        {
+            const unsigned char* px = in + i * 8;
+            *r = px[0];
+            *g = px[2];
+            *b = px[4];
+            *a = px[6];
+        }
+        break;
+    default:
+        break; /*other color types: outputs are left untouched*/
+    }
+}
+
+/*Bulk version of getPixelColorRGBA8: the per-pixel loops sit inside the color
+mode cases, which makes conversion to RGBA or RGB with 8 bits per channel much
+faster. buffer must be an RGBA or RGB output with enough memory for numpixels
+pixels; if has_alpha is true the output is RGBA, otherwise RGB. mode is the
+color mode of the input buffer `in`.*/
+static void getPixelColorsRGBA8(unsigned char* buffer, size_t numpixels,
+    unsigned has_alpha, const unsigned char* in,
+    const LodePNGColorMode* mode)
+{
+    const unsigned stride = has_alpha ? 4 : 3; /*bytes per output pixel*/
+    const unsigned bits = mode->bitdepth;
+    unsigned char* dst = buffer;
+    size_t px;
+
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+        if (bits == 8)
+        {
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                dst[0] = dst[1] = dst[2] = in[px];
+                if (has_alpha) dst[3] = (mode->key_defined && in[px] == mode->key_r) ? 0 : 255;
+            }
+        }
+        else if (bits == 16)
+        {
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                dst[0] = dst[1] = dst[2] = in[px * 2];
+                if (has_alpha)
+                {
+                    dst[3] = (mode->key_defined && 256U * in[px * 2 + 0] + in[px * 2 + 1] == mode->key_r) ? 0 : 255;
+                }
+            }
+        }
+        else
+        {
+            unsigned highest = ((1U << bits) - 1U); /*highest possible value for this bit depth*/
+            size_t bitpos = 0;
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                unsigned value = readBitsFromReversedStream(&bitpos, in, bits);
+                dst[0] = dst[1] = dst[2] = (value * 255) / highest;
+                if (has_alpha) dst[3] = (mode->key_defined && value == mode->key_r) ? 0 : 255;
+            }
+        }
+        break;
+    case LCT_RGB:
+        if (bits == 8)
+        {
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                dst[0] = in[px * 3 + 0];
+                dst[1] = in[px * 3 + 1];
+                dst[2] = in[px * 3 + 2];
+                if (has_alpha)
+                {
+                    dst[3] = (mode->key_defined && dst[0] == mode->key_r
+                        && dst[1] == mode->key_g && dst[2] == mode->key_b) ? 0 : 255;
+                }
+            }
+        }
+        else
+        {
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                dst[0] = in[px * 6 + 0];
+                dst[1] = in[px * 6 + 2];
+                dst[2] = in[px * 6 + 4];
+                if (has_alpha)
+                {
+                    /*the color key is compared against the full 16-bit samples*/
+                    dst[3] = (mode->key_defined
+                        && 256U * in[px * 6 + 0] + in[px * 6 + 1] == mode->key_r
+                        && 256U * in[px * 6 + 2] + in[px * 6 + 3] == mode->key_g
+                        && 256U * in[px * 6 + 4] + in[px * 6 + 5] == mode->key_b) ? 0 : 255;
+                }
+            }
+        }
+        break;
+    case LCT_PALETTE:
+    {
+        unsigned index;
+        size_t bitpos = 0;
+        for (px = 0; px != numpixels; ++px, dst += stride)
+        {
+            if (bits == 8) index = in[px];
+            else index = readBitsFromReversedStream(&bitpos, in, bits);
+
+            if (index < mode->palettesize)
+            {
+                dst[0] = mode->palette[index * 4 + 0];
+                dst[1] = mode->palette[index * 4 + 1];
+                dst[2] = mode->palette[index * 4 + 2];
+                if (has_alpha) dst[3] = mode->palette[index * 4 + 3];
+            }
+            else
+            {
+                /*This is an error according to the PNG spec, but most PNG decoders make it black instead.
+                Done here too, slightly faster due to no error handling needed.*/
+                dst[0] = dst[1] = dst[2] = 0;
+                if (has_alpha) dst[3] = 255;
+            }
+        }
+        break;
+    }
+    case LCT_GREY_ALPHA:
+        if (bits == 8)
+        {
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                dst[0] = dst[1] = dst[2] = in[px * 2 + 0];
+                if (has_alpha) dst[3] = in[px * 2 + 1];
+            }
+        }
+        else
+        {
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                dst[0] = dst[1] = dst[2] = in[px * 4 + 0];
+                if (has_alpha) dst[3] = in[px * 4 + 2];
+            }
+        }
+        break;
+    case LCT_RGBA:
+        if (bits == 8)
+        {
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                dst[0] = in[px * 4 + 0];
+                dst[1] = in[px * 4 + 1];
+                dst[2] = in[px * 4 + 2];
+                if (has_alpha) dst[3] = in[px * 4 + 3];
+            }
+        }
+        else
+        {
+            for (px = 0; px != numpixels; ++px, dst += stride)
+            {
+                dst[0] = in[px * 8 + 0];
+                dst[1] = in[px * 8 + 2];
+                dst[2] = in[px * 8 + 4];
+                if (has_alpha) dst[3] = in[px * 8 + 6];
+            }
+        }
+        break;
+    default:
+        break; /*other color types: nothing is written*/
+    }
+}
+
+/*Reads the pixel with index i (y * width + x) from the raw image `in` and stores it as 16-bit RGBA
+through r, g, b and a. The color mode of `in` must itself be 16-bit; samples are read most
+significant byte first. For grey and RGB modes alpha becomes 0 when the pixel matches the mode's
+color key and 65535 otherwise. Palette mode is not handled: the outputs are left untouched.*/
+static void getPixelColorRGBA16(unsigned short* r, unsigned short* g, unsigned short* b, unsigned short* a,
+    const unsigned char* in, size_t i, const LodePNGColorMode* mode)
+{
+    const unsigned char* px;
+
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+        px = in + i * 2;
+        *r = *g = *b = 256u * px[0] + px[1];
+        *a = (mode->key_defined && 256U * px[0] + px[1] == mode->key_r) ? 0 : 65535;
+        break;
+    case LCT_RGB:
+        px = in + i * 6;
+        *r = 256u * px[0] + px[1];
+        *g = 256u * px[2] + px[3];
+        *b = 256u * px[4] + px[5];
+        *a = (mode->key_defined
+            && 256u * px[0] + px[1] == mode->key_r
+            && 256u * px[2] + px[3] == mode->key_g
+            && 256u * px[4] + px[5] == mode->key_b) ? 0 : 65535;
+        break;
+    case LCT_GREY_ALPHA:
+        px = in + i * 4;
+        *r = *g = *b = 256u * px[0] + px[1];
+        *a = 256u * px[2] + px[3];
+        break;
+    case LCT_RGBA:
+        px = in + i * 8;
+        *r = 256u * px[0] + px[1];
+        *g = 256u * px[2] + px[3];
+        *b = 256u * px[4] + px[5];
+        *a = 256u * px[6] + px[7];
+        break;
+    default:
+        break; /*other color types: outputs are left untouched*/
+    }
+}
+
+/*Converts the raw image `in` (w * h pixels in color mode mode_in) to color mode
+mode_out and writes the result to `out`, which must be large enough.
+If both modes are equal the bytes are copied unchanged. When the output is a
+palette, a color tree is built from the output palette (or from the input
+palette when the output palette is empty); no new palette is ever created.
+Returns 0 on success, or the error reported by rgba8ToPixel (82: a color of the
+image is not in the output palette).*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+    const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+    unsigned w, unsigned h)
+{
+    size_t i;
+    ColorTree tree;
+    /*widen before multiplying, so that w * h cannot wrap around in unsigned arithmetic*/
+    size_t numpixels = (size_t)w * (size_t)h;
+    unsigned error = 0;
+
+    if (lodepng_color_mode_equal(mode_out, mode_in))
+    {
+        size_t numbytes = lodepng_get_raw_size(w, h, mode_in);
+        for (i = 0; i != numbytes; ++i) out[i] = in[i];
+        return 0;
+    }
+
+    if (mode_out->colortype == LCT_PALETTE)
+    {
+        size_t palettesize = mode_out->palettesize;
+        const unsigned char* palette = mode_out->palette;
+        size_t palsize = 1u << mode_out->bitdepth;
+        /*if the user specified output palette but did not give the values, assume
+        they want the values of the input color type (assuming that one is palette).
+        Note that we never create a new palette ourselves.*/
+        if (palettesize == 0)
+        {
+            palettesize = mode_in->palettesize;
+            palette = mode_in->palette;
+        }
+        /*only the entries that are addressable with the output bit depth are used*/
+        if (palettesize < palsize) palsize = palettesize;
+        color_tree_init(&tree);
+        for (i = 0; i != palsize; ++i)
+        {
+            const unsigned char* p = &palette[i * 4];
+            color_tree_add(&tree, p[0], p[1], p[2], p[3], (unsigned)i);
+        }
+    }
+
+    if (mode_in->bitdepth == 16 && mode_out->bitdepth == 16)
+    {
+        /*16-bit to 16-bit: keep the full precision of every sample*/
+        for (i = 0; i != numpixels; ++i)
+        {
+            unsigned short r = 0, g = 0, b = 0, a = 0;
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+            rgba16ToPixel(out, i, mode_out, r, g, b, a);
+        }
+    }
+    else if (mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGBA)
+    {
+        getPixelColorsRGBA8(out, numpixels, 1, in, mode_in);
+    }
+    else if (mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGB)
+    {
+        getPixelColorsRGBA8(out, numpixels, 0, in, mode_in);
+    }
+    else
+    {
+        /*generic path: go through 8-bit RGBA one pixel at a time.
+        tree is only read by rgba8ToPixel when the output is a palette, in which case it was built above.*/
+        unsigned char r = 0, g = 0, b = 0, a = 0;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode_in);
+            error = rgba8ToPixel(out, i, mode_out, &tree, r, g, b, a);
+            if (error) break;
+        }
+    }
+
+    if (mode_out->colortype == LCT_PALETTE)
+    {
+        color_tree_cleanup(&tree);
+    }
+
+    return error;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*Resets a color profile to its starting state: greyscale, no color key, no
+alpha, no colors counted yet, and a bit depth of 1.*/
+void lodepng_color_profile_init(LodePNGColorProfile* profile)
+{
+    profile->numcolors = 0;
+    profile->bits = 1;
+    profile->colored = 0;
+    profile->alpha = 0;
+    profile->key = 0;
+    profile->key_r = 0;
+    profile->key_g = 0;
+    profile->key_b = 0;
+}
+
+/*function used for debug purposes with C++*/
+/*void printColorProfile(LodePNGColorProfile* p)
+{
+std::cout << "colored: " << (int)p->colored << ", ";
+std::cout << "key: " << (int)p->key << ", ";
+std::cout << "key_r: " << (int)p->key_r << ", ";
+std::cout << "key_g: " << (int)p->key_g << ", ";
+std::cout << "key_b: " << (int)p->key_b << ", ";
+std::cout << "alpha: " << (int)p->alpha << ", ";
+std::cout << "numcolors: " << (int)p->numcolors << ", ";
+std::cout << "bits: " << (int)p->bits << std::endl;
+}*/
+
+/*Returns the smallest bit depth (1, 2, 4 or 8) whose scaled-up values include the given 8-bit value.
+1-bit values scale to 0 and 255, 2-bit values to multiples of 85 and 4-bit values to multiples of 17.*/
+static unsigned getValueRequiredBits(unsigned char value)
+{
+    unsigned bits = 8;
+    if (value == 0 || value == 255) bits = 1;
+    else if (value % 85 == 0) bits = 2; /*every multiple of 85 is also a multiple of 17*/
+    else if (value % 17 == 0) bits = 4;
+    return bits;
+}
+
+/*Scans the raw image `in` (w * h pixels in color mode `mode`) and records in
+profile what is needed to represent it: whether it has non-grey colors, whether
+it needs an alpha channel or can use a single color key, the number of distinct
+colors (with the first 256 stored in profile->palette), and the required bit depth.
+profile must already have been inited with mode.
+It's ok to set some parameters of profile to done already.
+The color key stored in the profile is always 16-bit.
+Returns the error code; no step in this function currently sets one, so it is 0.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+    const unsigned char* in, unsigned w, unsigned h,
+    const LodePNGColorMode* mode)
+{
+    unsigned error = 0;
+    size_t i;
+    ColorTree tree;
+    /*widen before multiplying, so that w * h cannot wrap around in unsigned arithmetic*/
+    size_t numpixels = (size_t)w * (size_t)h;
+
+    /*the *_done flags tell that a property no longer has to be examined*/
+    unsigned colored_done = lodepng_is_greyscale_type(mode) ? 1 : 0;
+    unsigned alpha_done = lodepng_can_have_alpha(mode) ? 0 : 1;
+    unsigned numcolors_done = 0;
+    unsigned bpp = lodepng_get_bpp(mode);
+    unsigned bits_done = bpp == 1 ? 1 : 0;
+    unsigned maxnumcolors = 257;
+    unsigned sixteen = 0;
+    if (bpp <= 8) maxnumcolors = bpp == 1 ? 2 : (bpp == 2 ? 4 : (bpp == 4 ? 16 : 256));
+
+    color_tree_init(&tree);
+
+    /*Check if the 16-bit input is truly 16-bit*/
+    if (mode->bitdepth == 16)
+    {
+        /*zero-initialized because getPixelColorRGBA16 leaves them untouched for color types it does not handle*/
+        unsigned short r = 0, g = 0, b = 0, a = 0;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+            if ((r & 255) != ((r >> 8) & 255) || (g & 255) != ((g >> 8) & 255) ||
+                (b & 255) != ((b >> 8) & 255) || (a & 255) != ((a >> 8) & 255)) /*first and second byte differ*/
+            {
+                sixteen = 1;
+                break;
+            }
+        }
+    }
+
+    if (sixteen)
+    {
+        unsigned short r = 0, g = 0, b = 0, a = 0;
+        profile->bits = 16;
+        bits_done = numcolors_done = 1; /*counting colors no longer useful, palette doesn't support 16-bit*/
+
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+
+            if (!colored_done && (r != g || r != b))
+            {
+                profile->colored = 1;
+                colored_done = 1;
+            }
+
+            if (!alpha_done)
+            {
+                unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+                if (a != 65535 && (a != 0 || (profile->key && !matchkey)))
+                {
+                    /*translucent pixel, or a second fully transparent color: a single color key cannot express it*/
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                }
+                else if (a == 0 && !profile->alpha && !profile->key)
+                {
+                    /*first fully transparent pixel: its color becomes the candidate color key*/
+                    profile->key = 1;
+                    profile->key_r = r;
+                    profile->key_g = g;
+                    profile->key_b = b;
+                }
+                else if (a == 65535 && profile->key && matchkey)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                }
+            }
+            if (alpha_done && numcolors_done && colored_done && bits_done) break;
+        }
+
+        /*pixels seen before the key was chosen were not compared against it, so check the whole image again*/
+        if (profile->key && !profile->alpha)
+        {
+            for (i = 0; i != numpixels; ++i)
+            {
+                getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+                if (a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                }
+            }
+        }
+    }
+    else /* < 16-bit */
+    {
+        unsigned char r = 0, g = 0, b = 0, a = 0;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+
+            if (!bits_done && profile->bits < 8)
+            {
+                /*only r is checked, < 8 bits is only relevant for greyscale*/
+                unsigned bits = getValueRequiredBits(r);
+                if (bits > profile->bits) profile->bits = bits;
+            }
+            bits_done = (profile->bits >= bpp);
+
+            if (!colored_done && (r != g || r != b))
+            {
+                profile->colored = 1;
+                colored_done = 1;
+                if (profile->bits < 8) profile->bits = 8; /*PNG has no colored modes with less than 8-bit per channel*/
+            }
+
+            if (!alpha_done)
+            {
+                unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+                if (a != 255 && (a != 0 || (profile->key && !matchkey)))
+                {
+                    /*translucent pixel, or a second fully transparent color: a single color key cannot express it*/
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                }
+                else if (a == 0 && !profile->alpha && !profile->key)
+                {
+                    /*first fully transparent pixel: its color becomes the candidate color key*/
+                    profile->key = 1;
+                    profile->key_r = r;
+                    profile->key_g = g;
+                    profile->key_b = b;
+                }
+                else if (a == 255 && profile->key && matchkey)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                }
+            }
+
+            if (!numcolors_done)
+            {
+                if (!color_tree_has(&tree, r, g, b, a))
+                {
+                    color_tree_add(&tree, r, g, b, a, profile->numcolors);
+                    /*only the first 256 distinct colors are stored in the profile's palette*/
+                    if (profile->numcolors < 256)
+                    {
+                        unsigned char* p = profile->palette;
+                        unsigned n = profile->numcolors;
+                        p[n * 4 + 0] = r;
+                        p[n * 4 + 1] = g;
+                        p[n * 4 + 2] = b;
+                        p[n * 4 + 3] = a;
+                    }
+                    ++profile->numcolors;
+                    numcolors_done = profile->numcolors >= maxnumcolors;
+                }
+            }
+
+            if (alpha_done && numcolors_done && colored_done && bits_done) break;
+        }
+
+        /*pixels seen before the key was chosen were not compared against it, so check the whole image again*/
+        if (profile->key && !profile->alpha)
+        {
+            for (i = 0; i != numpixels; ++i)
+            {
+                getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+                if (a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                }
+            }
+        }
+
+        /*make the profile's key always 16-bit for consistency - repeat each byte twice*/
+        profile->key_r += (profile->key_r << 8);
+        profile->key_g += (profile->key_g << 8);
+        profile->key_b += (profile->key_b << 8);
+    }
+
+    color_tree_cleanup(&tree);
+    return error;
+}
+
+/*Automatically chooses color type that gives smallest amount of bits in the
+output image, e.g. grey if there are only greyscale pixels, palette if there
+are less than 256 colors, ...
+Updates values of mode with a potentially smaller color model. mode_out should
+contain the user chosen color model, but will be overwritten with the new chosen one.
+image holds w * h pixels in color mode mode_in.
+Returns 0 on success, or the error code of lodepng_get_color_profile or
+lodepng_palette_add.*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in)
+{
+    LodePNGColorProfile prof;
+    unsigned error = 0;
+    unsigned i, n, palettebits, palette_ok;
+    /*widen before multiplying, so that w * h cannot wrap around in unsigned arithmetic*/
+    size_t numpixels = (size_t)w * (size_t)h;
+
+    lodepng_color_profile_init(&prof);
+    error = lodepng_get_color_profile(&prof, image, w, h, mode_in);
+    if (error) return error;
+    mode_out->key_defined = 0;
+
+    if (prof.key && numpixels <= 16)
+    {
+        prof.alpha = 1; /*too few pixels to justify tRNS chunk overhead*/
+        prof.key = 0;
+        if (prof.bits < 8) prof.bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+    }
+    n = prof.numcolors;
+    palettebits = n <= 2 ? 1 : (n <= 4 ? 2 : (n <= 16 ? 4 : 8));
+    palette_ok = n <= 256 && prof.bits <= 8;
+    if (numpixels < n * 2) palette_ok = 0; /*don't add palette overhead if image has only a few pixels*/
+    if (!prof.colored && prof.bits <= palettebits) palette_ok = 0; /*grey is less overhead*/
+
+    if (palette_ok)
+    {
+        unsigned char* p = prof.palette;
+        lodepng_palette_clear(mode_out); /*remove potential earlier palette*/
+        for (i = 0; i != prof.numcolors; ++i)
+        {
+            error = lodepng_palette_add(mode_out, p[i * 4 + 0], p[i * 4 + 1], p[i * 4 + 2], p[i * 4 + 3]);
+            if (error) break;
+        }
+
+        mode_out->colortype = LCT_PALETTE;
+        mode_out->bitdepth = palettebits;
+
+        if (mode_in->colortype == LCT_PALETTE && mode_in->palettesize >= mode_out->palettesize
+            && mode_in->bitdepth == mode_out->bitdepth)
+        {
+            /*If input should have same palette colors, keep original to preserve its order and prevent conversion*/
+            lodepng_color_mode_cleanup(mode_out);
+            lodepng_color_mode_copy(mode_out, mode_in);
+        }
+    }
+    else /*8-bit or 16-bit per channel*/
+    {
+        mode_out->bitdepth = prof.bits;
+        mode_out->colortype = prof.alpha ? (prof.colored ? LCT_RGBA : LCT_GREY_ALPHA)
+            : (prof.colored ? LCT_RGB : LCT_GREY);
+
+        if (prof.key)
+        {
+            unsigned mask = (1u << mode_out->bitdepth) - 1u; /*profile always uses 16-bit, mask converts it*/
+            mode_out->key_r = prof.key_r & mask;
+            mode_out->key_g = prof.key_g & mask;
+            mode_out->key_b = prof.key_b & mask;
+            mode_out->key_defined = 1;
+        }
+    }
+
+    return error;
+}
+
+#endif /* #ifdef LODEPNG_COMPILE_ENCODER */
+
+/*
+Paeth predictor, used by PNG filter type 4.
+Returns whichever of a, b and c is closest to a + b - c; ties are resolved in
+the order a, b, c.
+The parameters are of type short, but should come from unsigned chars, the shorts
+are only needed to make the paeth calculation correct.
+*/
+static unsigned char paethPredictor(short a, short b, short c)
+{
+    /*distance of each candidate to the estimate p = a + b - c*/
+    short dist_a = abs(b - c);
+    short dist_b = abs(a - c);
+    short dist_c = abs(a + b - c - c);
+
+    if (dist_c < dist_a && dist_c < dist_b) return (unsigned char)c;
+    return (unsigned char)(dist_b < dist_a ? b : a);
+}
+
+/*shared values used by multiple Adam7 related functions*/
+
+static const unsigned ADAM7_IX[7] = { 0, 4, 0, 2, 0, 1, 0 }; /*x start values*/
+static const unsigned ADAM7_IY[7] = { 0, 0, 4, 0, 2, 0, 1 }; /*y start values*/
+static const unsigned ADAM7_DX[7] = { 8, 8, 4, 4, 2, 2, 1 }; /*x delta values*/
+static const unsigned ADAM7_DY[7] = { 8, 8, 8, 4, 4, 2, 2 }; /*y delta values*/
+
+/*
+Outputs various dimensions and positions in the image related to the Adam7 reduced images.
+passw: output containing the width of the 7 passes
+passh: output containing the height of the 7 passes
+filter_passstart: output containing the index of the start and end of each
+ reduced image with filter bytes
+padded_passstart output containing the index of the start and end of each
+ reduced image when without filter bytes but with padded scanlines
+passstart: output containing the index of the start and end of each reduced
+ image without padding between scanlines, but still padding between the images
+w, h: width and height of non-interlaced image
+bpp: bits per pixel
+"padded" is only relevant if bpp is less than 8 and a scanline or image does not
+ end at a full byte
+The byte offsets are computed in size_t, so that large images do not wrap
+around in unsigned arithmetic.
+*/
+static void Adam7_getpassvalues(unsigned passw[7], unsigned passh[7], size_t filter_passstart[8],
+    size_t padded_passstart[8], size_t passstart[8], unsigned w, unsigned h, unsigned bpp)
+{
+    /*the passstart values have 8 values: the 8th one indicates the byte after the end of the 7th (= last) pass*/
+    unsigned i;
+
+    /*calculate width and height in pixels of each pass*/
+    for (i = 0; i != 7; ++i)
+    {
+        passw[i] = (w + ADAM7_DX[i] - ADAM7_IX[i] - 1) / ADAM7_DX[i];
+        passh[i] = (h + ADAM7_DY[i] - ADAM7_IY[i] - 1) / ADAM7_DY[i];
+        /*a pass without columns has no rows either, and the other way around*/
+        if (passw[i] == 0) passh[i] = 0;
+        if (passh[i] == 0) passw[i] = 0;
+    }
+
+    filter_passstart[0] = padded_passstart[0] = passstart[0] = 0;
+    for (i = 0; i != 7; ++i)
+    {
+        const size_t pw = passw[i];
+        const size_t ph = passh[i];
+        const size_t linebytes = (pw * bpp + 7u) / 8u; /*bytes per scanline, rounded up to a full byte*/
+
+        /*if passw[i] is 0, it's 0 bytes, not 1 (no filtertype-byte)*/
+        filter_passstart[i + 1] = filter_passstart[i]
+            + ((pw && ph) ? ph * (1u + linebytes) : 0);
+        /*bits padded if needed to fill full byte at end of each scanline*/
+        padded_passstart[i + 1] = padded_passstart[i] + ph * linebytes;
+        /*only padded at end of reduced image*/
+        passstart[i + 1] = passstart[i] + (ph * pw * bpp + 7u) / 8u;
+    }
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Decoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Reads the PNG signature and the IHDR chunk at the start of the buffer. The image size is written to *w
+and *h, the other header fields to state->info_png, which is reset before it is filled in.
+The return value is the error code, 0 if everything is OK.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h, LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    static const unsigned char signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+    LodePNGInfo* info = &state->info_png;
+    const unsigned char* ihdr; /*start of the first chunk, directly after the signature*/
+    size_t k;
+
+    if (in == 0 || insize == 0)
+    {
+        CERROR_RETURN_ERROR(state->error, 48); /*error: the given data is empty*/
+    }
+    if (insize < 33)
+    {
+        CERROR_RETURN_ERROR(state->error, 27); /*error: the data length is smaller than the length of a PNG header*/
+    }
+
+    /*a new image is inspected: drop everything that a previous decode left in the info*/
+    lodepng_info_cleanup(info);
+    lodepng_info_init(info);
+
+    for (k = 0; k < 8; ++k)
+    {
+        if (in[k] != signature[k])
+        {
+            CERROR_RETURN_ERROR(state->error, 28); /*error: the first 8 bytes are not the correct PNG signature*/
+        }
+    }
+
+    ihdr = in + 8;
+    if (lodepng_chunk_length(ihdr) != 13)
+    {
+        CERROR_RETURN_ERROR(state->error, 94); /*error: header size must be 13 bytes*/
+    }
+    if (!lodepng_chunk_type_equals(ihdr, "IHDR"))
+    {
+        CERROR_RETURN_ERROR(state->error, 29); /*error: it doesn't start with a IHDR chunk!*/
+    }
+
+    /*the 13 data bytes of IHDR are at offsets 16..28 of the file*/
+    *w = lodepng_read32bitInt(in + 16);
+    *h = lodepng_read32bitInt(in + 20);
+    info->color.bitdepth = in[24];
+    info->color.colortype = (LodePNGColorType)in[25];
+    info->compression_method = in[26];
+    info->filter_method = in[27];
+    info->interlace_method = in[28];
+
+    if (*w == 0 || *h == 0)
+    {
+        CERROR_RETURN_ERROR(state->error, 93); /*error: width or height is 0*/
+    }
+
+    if (!state->decoder.ignore_crc)
+    {
+        /*the stored CRC follows the data; it is compared against the CRC of the 17 bytes from offset 12*/
+        unsigned stored = lodepng_read32bitInt(in + 29);
+        unsigned computed = lodepng_crc32(in + 12, 17);
+        if (stored != computed)
+        {
+            CERROR_RETURN_ERROR(state->error, 57); /*invalid CRC*/
+        }
+    }
+
+    if (info->compression_method != 0)
+    {
+        CERROR_RETURN_ERROR(state->error, 32); /*error: only compression method 0 is allowed in the specification*/
+    }
+    if (info->filter_method != 0)
+    {
+        CERROR_RETURN_ERROR(state->error, 33); /*error: only filter method 0 is allowed in the specification*/
+    }
+    if (info->interlace_method > 1)
+    {
+        CERROR_RETURN_ERROR(state->error, 34); /*error: only interlace methods 0 and 1 exist in the specification*/
+    }
+
+    state->error = checkColorValidity(info->color.colortype, info->color.bitdepth);
+    return state->error;
+}
+
+static unsigned unfilterScanline(unsigned char* recon, const unsigned char* scanline, const unsigned char* precon,
+    size_t bytewidth, unsigned char filterType, size_t length)
+{
+    /*
+    Undoes PNG filter method 0 for one scanline.
+    scanline: the filtered bytes of this row WITHOUT the filtertype byte; that one is passed in filterType
+    recon: receives the length unfiltered bytes; MAY be the same memory address as scanline
+    precon: the previous unfiltered scanline, or 0 if there is none; must be disjoint from recon
+    bytewidth: how many bytes back the "left" neighbour of a byte is (1 when pixels are smaller than 1 byte)
+    The return value is 0, or error 36 if filterType is not one of 0..4.
+    */
+    size_t n;
+
+    switch (filterType)
+    {
+    case 0: /*no filter: plain copy*/
+        for (n = 0; n < length; ++n) recon[n] = scanline[n];
+        break;
+
+    case 1: /*add the left neighbour; the first bytewidth bytes have none*/
+        for (n = 0; n < bytewidth; ++n) recon[n] = scanline[n];
+        for (; n < length; ++n) recon[n] = scanline[n] + recon[n - bytewidth];
+        break;
+
+    case 2: /*add the byte above*/
+        if (!precon)
+        {
+            for (n = 0; n < length; ++n) recon[n] = scanline[n];
+        }
+        else
+        {
+            for (n = 0; n < length; ++n) recon[n] = scanline[n] + precon[n];
+        }
+        break;
+
+    case 3: /*add the rounded-down average of the left neighbour and the byte above*/
+        if (!precon)
+        {
+            for (n = 0; n < bytewidth; ++n) recon[n] = scanline[n];
+            for (; n < length; ++n) recon[n] = scanline[n] + (recon[n - bytewidth] >> 1);
+        }
+        else
+        {
+            for (n = 0; n < bytewidth; ++n) recon[n] = scanline[n] + (precon[n] >> 1);
+            for (; n < length; ++n) recon[n] = scanline[n] + ((recon[n - bytewidth] + precon[n]) >> 1);
+        }
+        break;
+
+    case 4: /*add the Paeth prediction of left, above and above-left*/
+        if (!precon)
+        {
+            for (n = 0; n < bytewidth; ++n) recon[n] = scanline[n];
+            /*paethPredictor(recon[n - bytewidth], 0, 0) is always recon[n - bytewidth]*/
+            for (; n < length; ++n) recon[n] = (scanline[n] + recon[n - bytewidth]);
+        }
+        else
+        {
+            /*paethPredictor(0, precon[n], 0) is always precon[n]*/
+            for (n = 0; n < bytewidth; ++n) recon[n] = (scanline[n] + precon[n]);
+            for (; n < length; ++n)
+            {
+                recon[n] = (scanline[n] + paethPredictor(recon[n - bytewidth], precon[n], precon[n - bytewidth]));
+            }
+        }
+        break;
+
+    default:
+        return 36; /*error: unexisting filter type given*/
+    }
+
+    return 0;
+}
+
+static unsigned unfilter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    /*
+    Undoes PNG filter method 0 for one whole image: the full picture when there is no interlacing, or
+    one of the seven reduced images of Adam7.
+    in: h scanlines, each starting with 1 filtertype byte
+    out: receives the h scanlines without the filtertype bytes; must already be allocated
+    w, h: size of the (reduced) image in pixels, bpp: bits per pixel
+    in and out may be the same memory address; out needs fewer bytes since the filtertype bytes are gone.
+    The return value is the first error reported for a scanline, or 0.
+    */
+
+    /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+    const size_t bytewidth = (bpp + 7) / 8;
+    const size_t linebytes = (w * bpp + 7) / 8;
+    const unsigned char* src = in; /*filtertype byte of the current row*/
+    unsigned char* dst = out; /*where the current row is written*/
+    unsigned char* above = 0; /*previous unfiltered row, none for the first one*/
+    unsigned row;
+
+    for (row = 0; row != h; ++row)
+    {
+        const unsigned char filterType = src[0];
+
+        CERROR_TRY_RETURN(unfilterScanline(dst, src + 1, above, bytewidth, filterType, linebytes));
+
+        above = dst;
+        dst += linebytes;
+        src += linebytes + 1; /*the extra filterbyte added to each row*/
+    }
+
+    return 0;
+}
+
+/*
+Puts the pixels of an Adam7 interlaced image into the order of a non-interlaced image.
+in: the seven reduced images after each other, with no padding bits between scanlines, but with padding
+between reduced images so that each reduced image starts at a byte.
+out: receives the same pixels as a non-interlaced image of w*h pixels, that is w * h * bpp bits.
+bpp: bits per pixel
+in is possibly bigger than out due to the padding bits between reduced images.
+out must be big enough AND must be 0 everywhere if bpp < 8, because bits are then written with
+setBitOfReversedStream0.
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_deinterlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned pass;
+
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    for (pass = 0; pass != 7; ++pass)
+    {
+        const unsigned pw = passw[pass];
+        const unsigned ph = passh[pass];
+        unsigned col, row, k;
+
+        if (bpp >= 8)
+        {
+            /*whole bytes per pixel: copy pixel by pixel*/
+            const size_t bytewidth = bpp / 8;
+            for (row = 0; row < ph; ++row)
+            {
+                const unsigned inrow = row * pw;
+                const unsigned outrow = (ADAM7_IY[pass] + row * ADAM7_DY[pass]) * w;
+                for (col = 0; col < pw; ++col)
+                {
+                    const size_t from = passstart[pass] + (inrow + col) * bytewidth;
+                    const size_t to = (outrow + ADAM7_IX[pass] + col * ADAM7_DX[pass]) * bytewidth;
+                    for (k = 0; k < bytewidth; ++k)
+                    {
+                        out[to + k] = in[from + k];
+                    }
+                }
+            }
+        }
+        else
+        {
+            /*pixels smaller than a byte: copy bit by bit using bit pointers*/
+            const unsigned ilinebits = bpp * pw;
+            const unsigned olinebits = bpp * w;
+            for (row = 0; row < ph; ++row)
+            {
+                for (col = 0; col < pw; ++col)
+                {
+                    size_t ibp = (8 * passstart[pass]) + (row * ilinebits + col * bpp);
+                    size_t obp = (ADAM7_IY[pass] + row * ADAM7_DY[pass]) * olinebits + (ADAM7_IX[pass] + col * ADAM7_DX[pass]) * bpp;
+                    for (k = 0; k < bpp; ++k)
+                    {
+                        unsigned char bit = readBitFromReversedStream(&ibp, in);
+                        /*this setter assumes the out buffer is completely 0, use setBitOfReversedStream otherwise*/
+                        setBitOfReversedStream0(&obp, out, bit);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void removePaddingBits(unsigned char* out, const unsigned char* in,
+    size_t olinebits, size_t ilinebits, unsigned h)
+{
+    /*
+    Copies h scanlines bit by bit, keeping the first olinebits bits of each input scanline and skipping
+    the rest of it, so that the scanlines end up directly after each other in out.
+    After unfiltering, scanlines whose bit amount is not a multiple of 8 still end in padding bits. Those
+    must go (except at the last scanline of an (Adam7-reduced) image) before the Adam7 code, the color
+    convert code and the user can work with the buffer.
+    in and out are allowed to be the same buffer, in may also be higher but still overlapping; in must
+    have >= ilinebits*h bits, out must have >= olinebits*h bits, olinebits must be <= ilinebits.
+    Also used to move bits after earlier such operations happened, e.g. in a sequence of reduced images
+    from Adam7. Only useful if (ilinebits - olinebits) is a value in the range 1..7.
+    */
+    const size_t skip = ilinebits - olinebits;
+    size_t inbit = 0, outbit = 0; /*input and output bit pointers*/
+    unsigned rows;
+
+    for (rows = h; rows != 0; --rows)
+    {
+        size_t left = olinebits;
+        while (left != 0)
+        {
+            unsigned char bit = readBitFromReversedStream(&inbit, in);
+            setBitOfReversedStream(&outbit, out, bit);
+            --left;
+        }
+        inbit += skip;
+    }
+}
+
+/*
+Converts the filtered-padded-interlaced data into a pure 2D image buffer with the PNG's colortype.
+out must be a buffer big enough to contain the full image, and in must contain the full decompressed data
+from the IDAT chunks (with filter index bytes and possible padding bits).
+NOTE: the in buffer will be overwritten with intermediate data!
+The return value is the error code, 0 if everything is OK.
+*/
+static unsigned postProcessScanlines(unsigned char* out, unsigned char* in,
+    unsigned w, unsigned h, const LodePNGInfo* info_png)
+{
+    /*
+    Steps:
+    *) if no Adam7: 1) unfilter 2) remove padding bits (= possible extra bits per scanline if bpp < 8)
+    *) if adam7: 1) 7x unfilter 2) 7x remove padding bits 3) Adam7_deinterlace
+    */
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned pass;
+    const unsigned bpp = lodepng_get_bpp(&info_png->color);
+
+    if (bpp == 0) return 31; /*error: invalid colortype*/
+
+    if (info_png->interlace_method == 0)
+    {
+        const unsigned linebits = w * bpp;
+        const unsigned paddedbits = ((w * bpp + 7) / 8) * 8;
+
+        if (bpp >= 8 || linebits == paddedbits)
+        {
+            /*scanlines end at a full byte: unfilter straight into out, no other steps needed*/
+            CERROR_TRY_RETURN(unfilter(out, in, w, h, bpp));
+        }
+        else
+        {
+            /*unfilter in place first, then copy to out without the padding bits*/
+            CERROR_TRY_RETURN(unfilter(in, in, w, h, bpp));
+            removePaddingBits(out, in, linebits, paddedbits, h);
+        }
+        return 0;
+    }
+
+    /*interlace_method is 1 (Adam7)*/
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    for (pass = 0; pass != 7; ++pass)
+    {
+        CERROR_TRY_RETURN(unfilter(&in[padded_passstart[pass]], &in[filter_passstart[pass]],
+            passw[pass], passh[pass], bpp));
+        /*TODO: possible efficiency improvement: if in this reduced image the bits fit nicely in 1 scanline,
+        move bytes instead of bits or move not at all*/
+        if (bpp < 8)
+        {
+            /*remove padding bits in scanlines; after this there still may be padding
+            bits between the different reduced images: each reduced image still starts nicely at a byte*/
+            removePaddingBits(&in[passstart[pass]], &in[padded_passstart[pass]], passw[pass] * bpp,
+                ((passw[pass] * bpp + 7) / 8) * 8, passh[pass]);
+        }
+    }
+
+    Adam7_deinterlace(out, in, w, h, bpp);
+
+    return 0;
+}
+
+static unsigned readChunk_PLTE(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+    /*
+    Reads the palette chunk (PLTE): chunkLength / 3 entries of 3 bytes (R, G, B). Each entry is stored in
+    color->palette as 4 bytes (R, G, B, A) with alpha set to 255.
+    The return value is 0 if OK, 38 if there are more than 256 entries, 83 if the allocation failed.
+    */
+    const size_t entries = chunkLength / 3;
+    unsigned char* palette;
+    size_t i;
+
+    /*Check the size before doing anything else: the chunk length comes from the file, so an oversized
+    chunk must not trigger an allocation, and a rejected chunk must not leave a palettesize above 256
+    behind in the color mode. In that case the color mode is left exactly as it was.*/
+    if (entries > 256) return 38; /*error: palette too big*/
+
+    if (color->palette) lodepng_free(color->palette);
+    color->palette = 0;
+    color->palettesize = 0;
+
+    palette = (unsigned char*)lodepng_malloc(4 * entries);
+    if (!palette && entries) return 83; /*alloc fail*/
+
+    for (i = 0; i != entries; ++i)
+    {
+        palette[4 * i + 0] = data[3 * i + 0]; /*R*/
+        palette[4 * i + 1] = data[3 * i + 1]; /*G*/
+        palette[4 * i + 2] = data[3 * i + 2]; /*B*/
+        palette[4 * i + 3] = 255; /*alpha*/
+    }
+
+    /*only now make the new palette visible in the color mode*/
+    color->palette = palette;
+    color->palettesize = entries;
+
+    return 0; /* OK */
+}
+
+static unsigned readChunk_tRNS(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+    /*
+    Reads the transparency chunk (tRNS). What it contains depends on the color type:
+    palette: one alpha byte for each of the first chunkLength palette entries
+    greyscale: one 2-byte value, stored as the color key in key_r, key_g and key_b
+    RGB: three 2-byte values, stored as the color key
+    The return value is the error code, 0 if everything is OK.
+    */
+    unsigned n;
+
+    switch (color->colortype)
+    {
+    case LCT_PALETTE:
+        /*error: more alpha values given than there are palette entries*/
+        if (chunkLength > color->palettesize) return 38;
+
+        for (n = 0; n != chunkLength; ++n) color->palette[4 * n + 3] = data[n];
+        break;
+
+    case LCT_GREY:
+        /*error: this chunk must be 2 bytes for greyscale image*/
+        if (chunkLength != 2) return 30;
+
+        color->key_defined = 1;
+        color->key_r = color->key_g = color->key_b = 256u * data[0] + data[1];
+        break;
+
+    case LCT_RGB:
+        /*error: this chunk must be 6 bytes for RGB image*/
+        if (chunkLength != 6) return 41;
+
+        color->key_defined = 1;
+        color->key_r = 256u * data[0] + data[1];
+        color->key_g = 256u * data[2] + data[3];
+        color->key_b = 256u * data[4] + data[5];
+        break;
+
+    default:
+        return 42; /*error: tRNS chunk not allowed for other color models*/
+    }
+
+    return 0; /* OK */
+}
+
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+background color chunk (bKGD)
+Stores the background color in info->background_r/g/b and sets info->background_defined. The chunk holds
+1 byte for a palette image, one 2-byte value for greyscale and three 2-byte values for RGB. For any other
+color type nothing is stored and 0 is returned.
+*/
+static unsigned readChunk_bKGD(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    switch (info->color.colortype)
+    {
+    case LCT_PALETTE:
+        /*error: this chunk must be 1 byte for indexed color image*/
+        if (chunkLength != 1) return 43;
+
+        info->background_defined = 1;
+        info->background_r = info->background_g = info->background_b = data[0];
+        break;
+
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+        /*error: this chunk must be 2 bytes for greyscale image*/
+        if (chunkLength != 2) return 44;
+
+        info->background_defined = 1;
+        info->background_r = info->background_g = info->background_b = 256u * data[0] + data[1];
+        break;
+
+    case LCT_RGB:
+    case LCT_RGBA:
+        /*error: this chunk must be 6 bytes for RGB image*/
+        if (chunkLength != 6) return 45;
+
+        info->background_defined = 1;
+        info->background_r = 256u * data[0] + data[1];
+        info->background_g = 256u * data[2] + data[3];
+        info->background_b = 256u * data[4] + data[5];
+        break;
+
+    default:
+        break;
+    }
+
+    return 0; /* OK */
+}
+
+/*
+text chunk (tEXt)
+The chunk holds a keyword of 1 to 79 bytes, a null byte, and then the text up to the end of the chunk.
+Both are copied into null terminated strings and handed to lodepng_add_text.
+The return value is the error code, 0 if everything is OK.
+*/
+static unsigned readChunk_tEXt(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    char* keyword = 0;
+    char* text = 0;
+    unsigned keylen = 0, textlen, textstart, n;
+
+    do /*runs exactly once, CERROR_BREAK leaves it to reach the cleanup below*/
+    {
+        while (keylen < chunkLength && data[keylen] != 0) ++keylen;
+        /*even though it's not allowed by the standard, no error is thrown if
+        there's no null termination char, if the text is empty*/
+        if (keylen < 1 || keylen > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+        keyword = (char*)lodepng_malloc(keylen + 1);
+        if (!keyword) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        for (n = 0; n != keylen; ++n) keyword[n] = (char)data[n];
+        keyword[keylen] = 0;
+
+        textstart = keylen + 1; /*skip keyword null terminator*/
+        textlen = 0;
+        if (chunkLength >= textstart) textlen = chunkLength - textstart;
+
+        text = (char*)lodepng_malloc(textlen + 1);
+        if (!text) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        for (n = 0; n != textlen; ++n) text[n] = (char)data[textstart + n];
+        text[textlen] = 0;
+
+        error = lodepng_add_text(info, keyword, text);
+    } while (0);
+
+    /*the local copies are freed in every case, also after a successful lodepng_add_text*/
+    lodepng_free(keyword);
+    lodepng_free(text);
+
+    return error;
+}
+
+/*
+compressed text chunk (zTXt)
+The chunk holds a keyword of 1 to 79 bytes, a null byte, a compression method byte that must be 0, and
+then the zlib compressed text up to the end of the chunk. The keyword and the decompressed text are
+handed to lodepng_add_text as null terminated strings.
+The return value is the error code, 0 if everything is OK.
+*/
+static unsigned readChunk_zTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+    const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    unsigned i;
+
+    unsigned length, string2_begin;
+    char *key = 0;
+    ucvector decoded;
+
+    ucvector_init(&decoded);
+
+    while (!error) /*not really a while loop, only used to break on error*/
+    {
+        for (length = 0; length < chunkLength && data[length] != 0; ++length);
+        /*besides the keyword there must be room for its terminator, the method byte and some data*/
+        if (length + 2 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+        if (length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+        key = (char*)lodepng_malloc(length + 1);
+        if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        key[length] = 0;
+        for (i = 0; i != length; ++i) key[i] = (char)data[i];
+
+        if (data[length + 1] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+        string2_begin = length + 2;
+        if (string2_begin > chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+
+        length = chunkLength - string2_begin;
+        /*will fail if zlib error, e.g. if length is too small*/
+        error = zlib_decompress(&decoded.data, &decoded.size,
+            (unsigned char*)(&data[string2_begin]),
+            length, zlibsettings);
+        if (error) break;
+        /*The decompressed bytes are used as a C string below, so the terminator is required. If it
+        cannot be appended, stop here instead of passing an unterminated buffer on.*/
+        if (!ucvector_push_back(&decoded, 0)) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        error = lodepng_add_text(info, key, (char*)decoded.data);
+
+        break;
+    }
+
+    lodepng_free(key);
+    ucvector_cleanup(&decoded);
+
+    return error;
+}
+
+/*
+international text chunk (iTXt)
+The chunk is read as: keyword of 1 to 79 bytes, null byte, compression flag, compression method byte
+that must be 0, language tag, null byte, translated keyword, null byte, and then the text up to the end
+of the chunk. The text is zlib decompressed first if the compression flag is not 0. The four strings are
+handed to lodepng_add_itext as null terminated strings.
+The return value is the error code, 0 if everything is OK.
+*/
+static unsigned readChunk_iTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+    const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    unsigned i;
+
+    unsigned length, begin, compressed;
+    char *key = 0, *langtag = 0, *transkey = 0;
+    ucvector decoded;
+    ucvector_init(&decoded);
+
+    while (!error) /*not really a while loop, only used to break on error*/
+    {
+        /*Quick check if the chunk length isn't too small. Even without check
+        it'd still fail with other error checks below if it's too short. This just gives a different error code.*/
+        if (chunkLength < 5) CERROR_BREAK(error, 30); /*iTXt chunk too short*/
+
+        /*read the key*/
+        for (length = 0; length < chunkLength && data[length] != 0; ++length);
+        if (length + 3 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination char, corrupt?*/
+        if (length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+        key = (char*)lodepng_malloc(length + 1);
+        if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        key[length] = 0;
+        for (i = 0; i != length; ++i) key[i] = (char)data[i];
+
+        /*read the compression method*/
+        compressed = data[length + 1];
+        if (data[length + 2] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+        /*even though it's not allowed by the standard, no error is thrown if
+        there's no null termination char, if the text is empty for the next 3 texts*/
+
+        /*read the langtag*/
+        begin = length + 3;
+        length = 0;
+        for (i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+        langtag = (char*)lodepng_malloc(length + 1);
+        if (!langtag) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        langtag[length] = 0;
+        for (i = 0; i != length; ++i) langtag[i] = (char)data[begin + i];
+
+        /*read the transkey*/
+        begin += length + 1;
+        length = 0;
+        for (i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+        transkey = (char*)lodepng_malloc(length + 1);
+        if (!transkey) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        transkey[length] = 0;
+        for (i = 0; i != length; ++i) transkey[i] = (char)data[begin + i];
+
+        /*read the actual text*/
+        begin += length + 1;
+
+        /*begin can be past the end of the chunk if a terminator was missing: then the text is empty*/
+        length = chunkLength < begin ? 0 : chunkLength - begin;
+
+        if (compressed)
+        {
+            /*will fail if zlib error, e.g. if length is too small*/
+            error = zlib_decompress(&decoded.data, &decoded.size,
+                (unsigned char*)(&data[begin]),
+                length, zlibsettings);
+            if (error) break;
+            if (decoded.allocsize < decoded.size) decoded.allocsize = decoded.size;
+            /*The decompressed bytes are used as a C string below, so the terminator is required. If it
+            cannot be appended, stop here instead of passing an unterminated buffer on.*/
+            if (!ucvector_push_back(&decoded, 0)) CERROR_BREAK(error, 83); /*alloc fail*/
+        }
+        else
+        {
+            if (!ucvector_resize(&decoded, length + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+
+            decoded.data[length] = 0;
+            for (i = 0; i != length; ++i) decoded.data[i] = data[begin + i];
+        }
+
+        error = lodepng_add_itext(info, key, langtag, transkey, (char*)decoded.data);
+
+        break;
+    }
+
+    lodepng_free(key);
+    lodepng_free(langtag);
+    lodepng_free(transkey);
+    ucvector_cleanup(&decoded);
+
+    return error;
+}
+
+static unsigned readChunk_tIME(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    /*
+    Reads the time chunk (tIME) into info->time and sets info->time_defined. The chunk must be exactly
+    7 bytes: the year in 2 bytes (most significant byte first), then 1 byte each for month, day, hour,
+    minute and second. The return value is 0 if OK, 73 if the size is wrong.
+    */
+    if (chunkLength == 7)
+    {
+        info->time.year = ((unsigned)data[0] << 8) | data[1];
+        info->time.month = data[2];
+        info->time.day = data[3];
+        info->time.hour = data[4];
+        info->time.minute = data[5];
+        info->time.second = data[6];
+        info->time_defined = 1;
+
+        return 0; /* OK */
+    }
+
+    return 73; /*invalid tIME chunk size*/
+}
+
+static unsigned readChunk_pHYs(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    /*
+    Reads the pHYs chunk into info->phys_x, info->phys_y and info->phys_unit and sets info->phys_defined.
+    The chunk must be exactly 9 bytes: two 4-byte values (most significant byte first) followed by the
+    unit byte. The return value is 0 if OK, 74 if the size is wrong.
+    */
+    unsigned x, y;
+
+    if (chunkLength != 9) return 74; /*invalid pHYs chunk size*/
+
+    x = ((unsigned)data[0] << 24) | ((unsigned)data[1] << 16) | ((unsigned)data[2] << 8) | data[3];
+    y = ((unsigned)data[4] << 24) | ((unsigned)data[5] << 16) | ((unsigned)data[6] << 8) | data[7];
+
+    info->phys_defined = 1;
+    info->phys_x = x;
+    info->phys_y = y;
+    info->phys_unit = data[8];
+
+    return 0; /* OK */
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+read a PNG, the result will be in the same color type as the PNG (hence "generic")
+out: receives a newly allocated buffer with the raw pixels; it is set to 0 first, so it stays 0 if an
+error happens before the buffer is allocated
+w, h: receive the image size read from the header
+state: state->decoder holds the settings used here, state->info_png receives what was read from the
+chunks, state->error receives the error code (0 if everything is OK)
+in, insize: the complete PNG file in memory
+*/
+static void decodeGeneric(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    unsigned char IEND = 0;
+    const unsigned char* chunk;
+    size_t i;
+    ucvector idat; /*the data from idat chunks*/
+    ucvector scanlines;
+    size_t predict;
+    size_t numpixels;
+    size_t outsize = 0;
+
+    /*set when the chunk that is being handled has a type that is not implemented here*/
+    unsigned unknown = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    unsigned critical_pos = 1; /*1 = after IHDR, 2 = after PLTE, 3 = after IDAT*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+    /*provide some proper output values if error will happen*/
+    *out = 0;
+
+    state->error = lodepng_inspect(w, h, state, in, insize); /*reads header and resets other parameters in state->info_png*/
+    if (state->error) return;
+
+    numpixels = *w * *h;
+
+    /*multiplication overflow*/
+    if (*h != 0 && numpixels / *h != *w) CERROR_RETURN(state->error, 92);
+    /*multiplication overflow possible further below. Allows up to 2^31-1 pixel
+    bytes with 16-bit RGBA, the rest is room for filter bytes.*/
+    if (numpixels > 268435455) CERROR_RETURN(state->error, 92);
+
+    ucvector_init(&idat);
+    chunk = &in[33]; /*first byte of the first chunk after the header*/
+
+    /*loop through the chunks, ignoring unknown chunks and stopping at IEND chunk.
+    The data of all IDAT chunks is appended to the idat vector*/
+    while (!IEND && !state->error)
+    {
+        unsigned chunkLength;
+        const unsigned char* data; /*the data in the chunk*/
+
+        /*error: size of the in buffer too small to contain next chunk*/
+        if ((size_t)((chunk - in) + 12) > insize || chunk < in) CERROR_BREAK(state->error, 30);
+
+        /*length of the data of the chunk, excluding the length bytes, chunk type and CRC bytes*/
+        chunkLength = lodepng_chunk_length(chunk);
+        /*error: chunk length larger than the max PNG chunk size*/
+        if (chunkLength > 2147483647) CERROR_BREAK(state->error, 63);
+
+        if ((size_t)((chunk - in) + chunkLength + 12) > insize || (chunk + chunkLength + 12) < in)
+        {
+            CERROR_BREAK(state->error, 64); /*error: size of the in buffer too small to contain next chunk*/
+        }
+
+        data = lodepng_chunk_data_const(chunk);
+
+        /*The flag belongs to the current chunk only. Without this reset, a single unknown chunk would
+        switch off the CRC check below for every known chunk that comes after it.*/
+        unknown = 0;
+
+        /*IDAT chunk, containing compressed image data*/
+        if (lodepng_chunk_type_equals(chunk, "IDAT"))
+        {
+            size_t oldsize = idat.size;
+            if (!ucvector_resize(&idat, oldsize + chunkLength)) CERROR_BREAK(state->error, 83 /*alloc fail*/);
+            for (i = 0; i != chunkLength; ++i) idat.data[oldsize + i] = data[i];
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            critical_pos = 3;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+        /*IEND chunk*/
+        else if (lodepng_chunk_type_equals(chunk, "IEND"))
+        {
+            IEND = 1;
+        }
+        /*palette chunk (PLTE)*/
+        else if (lodepng_chunk_type_equals(chunk, "PLTE"))
+        {
+            state->error = readChunk_PLTE(&state->info_png.color, data, chunkLength);
+            if (state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            critical_pos = 2;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+        /*palette transparency chunk (tRNS)*/
+        else if (lodepng_chunk_type_equals(chunk, "tRNS"))
+        {
+            state->error = readChunk_tRNS(&state->info_png.color, data, chunkLength);
+            if (state->error) break;
+        }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*background color chunk (bKGD)*/
+        else if (lodepng_chunk_type_equals(chunk, "bKGD"))
+        {
+            state->error = readChunk_bKGD(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+        /*text chunk (tEXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "tEXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_tEXt(&state->info_png, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*compressed text chunk (zTXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "zTXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_zTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*international text chunk (iTXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "iTXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_iTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*time chunk (tIME)*/
+        else if (lodepng_chunk_type_equals(chunk, "tIME"))
+        {
+            state->error = readChunk_tIME(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+        /*pHYs chunk*/
+        else if (lodepng_chunk_type_equals(chunk, "pHYs"))
+        {
+            state->error = readChunk_pHYs(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        else /*it's not an implemented chunk type, so ignore it: skip over the data*/
+        {
+            /*error: unknown critical chunk (5th bit of first byte of chunk type is 0)*/
+            if (!lodepng_chunk_ancillary(chunk)) CERROR_BREAK(state->error, 69);
+
+            unknown = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            if (state->decoder.remember_unknown_chunks)
+            {
+                /*critical_pos selects the list: before PLTE, between PLTE and IDAT, or after IDAT*/
+                state->error = lodepng_chunk_append(&state->info_png.unknown_chunks_data[critical_pos - 1],
+                    &state->info_png.unknown_chunks_size[critical_pos - 1], chunk);
+                if (state->error) break;
+            }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+
+        if (!state->decoder.ignore_crc && !unknown) /*check CRC if wanted, only on known chunk types*/
+        {
+            if (lodepng_chunk_check_crc(chunk)) CERROR_BREAK(state->error, 57); /*invalid CRC*/
+        }
+
+        if (!IEND) chunk = lodepng_chunk_next_const(chunk);
+    }
+
+    ucvector_init(&scanlines);
+    /*predict output size, to allocate exact size for output buffer to avoid more dynamic allocation.
+    If the decompressed size does not match the prediction, the image must be corrupt.*/
+    if (state->info_png.interlace_method == 0)
+    {
+        /*The extra *h is added because these are the filter bytes every scanline starts with*/
+        predict = lodepng_get_raw_size_idat(*w, *h, &state->info_png.color) + *h;
+    }
+    else
+    {
+        /*Adam-7 interlaced: predicted size is the sum of the 7 sub-images sizes*/
+        const LodePNGColorMode* color = &state->info_png.color;
+        predict = 0;
+        predict += lodepng_get_raw_size_idat((*w + 7) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+        if (*w > 4) predict += lodepng_get_raw_size_idat((*w + 3) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+        predict += lodepng_get_raw_size_idat((*w + 3) >> 2, (*h + 3) >> 3, color) + ((*h + 3) >> 3);
+        if (*w > 2) predict += lodepng_get_raw_size_idat((*w + 1) >> 2, (*h + 3) >> 2, color) + ((*h + 3) >> 2);
+        predict += lodepng_get_raw_size_idat((*w + 1) >> 1, (*h + 1) >> 2, color) + ((*h + 1) >> 2);
+        if (*w > 1) predict += lodepng_get_raw_size_idat((*w + 0) >> 1, (*h + 1) >> 1, color) + ((*h + 1) >> 1);
+        predict += lodepng_get_raw_size_idat((*w + 0), (*h + 0) >> 1, color) + ((*h + 0) >> 1);
+    }
+    if (!state->error && !ucvector_reserve(&scanlines, predict)) state->error = 83; /*alloc fail*/
+    if (!state->error)
+    {
+        state->error = zlib_decompress(&scanlines.data, &scanlines.size, idat.data,
+            idat.size, &state->decoder.zlibsettings);
+        if (!state->error && scanlines.size != predict) state->error = 91; /*decompressed size doesn't match prediction*/
+    }
+    ucvector_cleanup(&idat);
+
+    if (!state->error)
+    {
+        outsize = lodepng_get_raw_size(*w, *h, &state->info_png.color);
+        *out = (unsigned char*)lodepng_malloc(outsize);
+        if (!*out) state->error = 83; /*alloc fail*/
+    }
+    if (!state->error)
+    {
+        /*the output must start as all zeroes, Adam7_deinterlace relies on that when bpp < 8*/
+        for (i = 0; i < outsize; i++) (*out)[i] = 0;
+        state->error = postProcessScanlines(*out, scanlines.data, *w, *h, &state->info_png);
+    }
+    ucvector_cleanup(&scanlines);
+}
+
+/*
+Decodes the PNG held in `in` (insize bytes) and hands the resulting pixel buffer to the caller.
+
+out: receives a buffer obtained with lodepng_malloc; it is reset to 0 before decoding starts.
+w, h: receive the image dimensions.
+state: supplies decoder settings and the requested raw color mode (info_raw); receives the
+       PNG info and the error code.
+return value: the error code, which is also stored in state->error (0 means success).
+
+When decoder.color_convert is off, or the requested raw mode already equals the PNG's mode, the
+decoded buffer is returned as is. Otherwise a second buffer in the requested mode is allocated,
+filled by lodepng_convert, and the intermediate buffer is freed.
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    *out = 0;
+    decodeGeneric(out, w, h, state, in, insize);
+    if (state->error) return state->error;
+    if (!state->decoder.color_convert || lodepng_color_mode_equal(&state->info_raw, &state->info_png.color))
+    {
+        /*same color type, no copying or converting of data needed*/
+        /*store the info_png color settings on the info_raw so that the info_raw still reflects what colortype
+        the raw image has to the end user*/
+        if (!state->decoder.color_convert)
+        {
+            state->error = lodepng_color_mode_copy(&state->info_raw, &state->info_png.color);
+            if (state->error) return state->error;
+        }
+    }
+    else
+    {
+        /*color conversion needed; sort of copy of the data*/
+        unsigned char* data = *out;
+        size_t outsize;
+
+        /*TODO: check if this works according to the statement in the documentation: "The converter can convert
+        from greyscale input color type, to 8-bit greyscale or greyscale with alpha"*/
+        if (!(state->info_raw.colortype == LCT_RGB || state->info_raw.colortype == LCT_RGBA)
+            && !(state->info_raw.bitdepth == 8))
+        {
+            /*record the failure in the state as well, so that state->error and the return value
+            agree like on every other path. *out still holds the unconverted image (in the PNG's
+            own color mode); the caller remains responsible for freeing it.*/
+            state->error = 56; /*unsupported color mode conversion*/
+            return state->error;
+        }
+
+        outsize = lodepng_get_raw_size(*w, *h, &state->info_raw);
+        *out = (unsigned char*)lodepng_malloc(outsize);
+        if (!(*out))
+        {
+            state->error = 83; /*alloc fail*/
+        }
+        else state->error = lodepng_convert(*out, data, &state->info_raw,
+            &state->info_png.color, *w, *h);
+        /*the intermediate buffer is no longer needed, whether or not the conversion succeeded*/
+        lodepng_free(data);
+    }
+    return state->error;
+}
+
+/*
+Convenience decoder for an in-memory PNG: sets up a temporary LodePNGState that requests the
+given raw colortype and bitdepth, runs lodepng_decode with it and cleans the state up again.
+Returns the error code produced by lodepng_decode.
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in,
+    size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+{
+    LodePNGState decoder_state;
+    unsigned result;
+
+    lodepng_state_init(&decoder_state);
+    /*ask for the caller's preferred raw pixel format*/
+    decoder_state.info_raw.bitdepth = bitdepth;
+    decoder_state.info_raw.colortype = colortype;
+
+    result = lodepng_decode(out, w, h, &decoder_state, in, insize);
+
+    lodepng_state_cleanup(&decoder_state);
+    return result;
+}
+
+/*Decodes an in-memory PNG, requesting LCT_RGBA pixels with bit depth 8 from lodepng_decode_memory.*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+    const LodePNGColorType requested_type = LCT_RGBA;
+    const unsigned requested_depth = 8;
+    return lodepng_decode_memory(out, w, h, in, insize, requested_type, requested_depth);
+}
+
+/*Decodes an in-memory PNG, requesting LCT_RGB pixels with bit depth 8 from lodepng_decode_memory.*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+    const LodePNGColorType requested_type = LCT_RGB;
+    const unsigned requested_depth = 8;
+    return lodepng_decode_memory(out, w, h, in, insize, requested_type, requested_depth);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Loads the file `filename` with lodepng_load_file and, if that succeeded, decodes its contents
+with lodepng_decode_memory using the requested colortype and bitdepth.
+The file buffer is released before returning; the return value is the first error encountered
+(0 when both steps succeed).
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename,
+    LodePNGColorType colortype, unsigned bitdepth)
+{
+    unsigned char* filedata = 0;
+    size_t filesize;
+    unsigned result = lodepng_load_file(&filedata, &filesize, filename);
+
+    if (result == 0)
+    {
+        result = lodepng_decode_memory(out, w, h, filedata, filesize, colortype, bitdepth);
+    }
+
+    /*freed on every path, also when loading failed*/
+    lodepng_free(filedata);
+    return result;
+}
+
+/*Decodes the PNG file `filename`, requesting LCT_RGBA pixels with bit depth 8 from lodepng_decode_file.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+    const LodePNGColorType requested_type = LCT_RGBA;
+    const unsigned requested_depth = 8;
+    return lodepng_decode_file(out, w, h, filename, requested_type, requested_depth);
+}
+
+/*Decodes the PNG file `filename`, requesting LCT_RGB pixels with bit depth 8 from lodepng_decode_file.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+    const LodePNGColorType requested_type = LCT_RGB;
+    const unsigned requested_depth = 8;
+    return lodepng_decode_file(out, w, h, filename, requested_type, requested_depth);
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/*
+Fills a LodePNGDecoderSettings with its defaults: color conversion on, CRC checking on
+(ignore_crc = 0), text chunks read and unknown chunks not remembered (when ancillary chunk
+support is compiled in), and default zlib decompression settings.
+*/
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings)
+{
+    lodepng_decompress_settings_init(&settings->zlibsettings);
+    settings->ignore_crc = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    settings->remember_unknown_chunks = 0;
+    settings->read_text_chunks = 1;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    settings->color_convert = 1;
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+
+/*
+Puts a LodePNGState into its initial condition: PNG info and raw color mode are initialized,
+encoder and/or decoder settings (whichever are compiled in) get their defaults, and the error
+field starts at 1 (non-zero; presumably "nothing done yet" - confirm against the error text table).
+*/
+void lodepng_state_init(LodePNGState* state)
+{
+    state->error = 1;
+    lodepng_info_init(&state->info_png);
+    lodepng_color_mode_init(&state->info_raw);
+#ifdef LODEPNG_COMPILE_ENCODER
+    lodepng_encoder_settings_init(&state->encoder);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#ifdef LODEPNG_COMPILE_DECODER
+    lodepng_decoder_settings_init(&state->decoder);
+#endif /*LODEPNG_COMPILE_DECODER*/
+}
+
+/*Cleans up the two members of the state that have their own cleanup routine: info_png and info_raw.*/
+void lodepng_state_cleanup(LodePNGState* state)
+{
+    lodepng_info_cleanup(&state->info_png);
+    lodepng_color_mode_cleanup(&state->info_raw);
+}
+
+/*
+Makes `dest` a copy of `source`. dest is cleaned up first, then overwritten with a plain struct
+copy; info_raw and info_png are re-initialized and filled through their dedicated copy functions
+so that dest does not keep the pointers taken over by the struct copy.
+The outcome is reported in dest->error; if copying the raw color mode fails, info_png is left
+in its freshly initialized state.
+*/
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source)
+{
+    unsigned copy_error;
+
+    lodepng_state_cleanup(dest);
+    *dest = *source;
+    lodepng_color_mode_init(&dest->info_raw);
+    lodepng_info_init(&dest->info_png);
+
+    copy_error = lodepng_color_mode_copy(&dest->info_raw, &source->info_raw);
+    dest->error = copy_error;
+    if (copy_error != 0) return;
+
+    dest->error = lodepng_info_copy(&dest->info_png, &source->info_png);
+}
+
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Encoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Appends one chunk to the output vector by delegating to lodepng_chunk_create.
+out: the output vector; its data and size fields are updated by lodepng_chunk_create.
+chunkName: must be string of 4 characters
+data, length: the chunk payload; note that length is narrowed to unsigned for the call.
+return value: 0 on success; on failure CERROR_TRY_RETURN presumably returns the error code of
+lodepng_chunk_create early (macro defined elsewhere - confirm).
+*/
+static unsigned addChunk(ucvector* out, const char* chunkName, const unsigned char* data, size_t length)
+{
+    CERROR_TRY_RETURN(lodepng_chunk_create(&out->data, &out->size, (unsigned)length, chunkName, data));
+    out->allocsize = out->size; /*fix the allocsize again*/
+    return 0;
+}
+
+/*Appends the 8 bytes PNG signature, aka the magic bytes, to the output vector.*/
+static void writeSignature(ucvector* out)
+{
+    static const unsigned char signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+    size_t pos;
+    for (pos = 0; pos != 8; ++pos) ucvector_push_back(out, signature[pos]);
+}
+
+/*
+Builds the IHDR payload (width, height, bit depth, color type, compression method 0,
+filter method 0, interlace method) in a temporary vector and appends it to `out` as an
+"IHDR" chunk. Returns the error code of addChunk.
+*/
+static unsigned addChunk_IHDR(ucvector* out, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth, unsigned interlace_method)
+{
+    unsigned result;
+    ucvector ihdr;
+
+    ucvector_init(&ihdr);
+
+    /*the two 32-bit dimensions come first*/
+    lodepng_add32bitInt(&ihdr, w);
+    lodepng_add32bitInt(&ihdr, h);
+    /*followed by five single-byte fields*/
+    ucvector_push_back(&ihdr, (unsigned char)bitdepth);
+    ucvector_push_back(&ihdr, (unsigned char)colortype);
+    ucvector_push_back(&ihdr, 0); /*compression method*/
+    ucvector_push_back(&ihdr, 0); /*filter method*/
+    ucvector_push_back(&ihdr, interlace_method);
+
+    result = addChunk(out, "IHDR", ihdr.data, ihdr.size);
+
+    ucvector_cleanup(&ihdr);
+    return result;
+}
+
+/*
+Writes the palette of `info` as a "PLTE" chunk. The palette is stored with 4 bytes per entry;
+only the first three bytes of every entry are emitted, the fourth (alpha) byte is skipped.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_PLTE(ucvector* out, const LodePNGColorMode* info)
+{
+    unsigned result;
+    size_t entry, channel;
+    ucvector rgb;
+
+    ucvector_init(&rgb);
+    for (entry = 0; entry != info->palettesize; ++entry)
+    {
+        /*channels 0..2 of each 4-byte entry; channel 3 is alpha and is left out*/
+        for (channel = 0; channel != 3; ++channel)
+        {
+            ucvector_push_back(&rgb, info->palette[4 * entry + channel]);
+        }
+    }
+
+    result = addChunk(out, "PLTE", rgb.data, rgb.size);
+    ucvector_cleanup(&rgb);
+    return result;
+}
+
+/*
+Writes a "tRNS" chunk for the given color mode.
+- LCT_PALETTE: the alpha byte of each palette entry, without the trailing run of entries
+  whose alpha is 255.
+- LCT_GREY: the color key as one 16-bit big-endian value, if a key is defined.
+- LCT_RGB: the color key as three 16-bit big-endian values, if a key is defined.
+For any other case the chunk payload stays empty; the chunk is still passed to addChunk.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_tRNS(ucvector* out, const LodePNGColorMode* info)
+{
+    unsigned result;
+    ucvector alpha;
+
+    ucvector_init(&alpha);
+
+    switch (info->colortype)
+    {
+    case LCT_PALETTE:
+    {
+        size_t count = info->palettesize;
+        size_t k;
+        /*the tail of palette values that all have 255 as alpha, does not have to be encoded*/
+        while (count != 0 && info->palette[4 * (count - 1) + 3] == 255) --count;
+        /*add only alpha channel*/
+        for (k = 0; k != count; ++k) ucvector_push_back(&alpha, info->palette[4 * k + 3]);
+        break;
+    }
+    case LCT_GREY:
+        if (info->key_defined)
+        {
+            ucvector_push_back(&alpha, (unsigned char)(info->key_r >> 8));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_r & 255));
+        }
+        break;
+    case LCT_RGB:
+        if (info->key_defined)
+        {
+            ucvector_push_back(&alpha, (unsigned char)(info->key_r >> 8));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_r & 255));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_g >> 8));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_g & 255));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_b >> 8));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_b & 255));
+        }
+        break;
+    default:
+        break;
+    }
+
+    result = addChunk(out, "tRNS", alpha.data, alpha.size);
+    ucvector_cleanup(&alpha);
+    return result;
+}
+
+/*
+Compresses `data` (datasize bytes) with zlib_compress using the given settings and appends the
+result to `out` as an "IDAT" chunk. Returns the compression error if there was one, otherwise
+the error code of addChunk. The temporary compressed buffer is cleaned up on every path.
+*/
+static unsigned addChunk_IDAT(ucvector* out, const unsigned char* data, size_t datasize,
+    LodePNGCompressSettings* zlibsettings)
+{
+    unsigned result;
+    ucvector compressed;
+
+    ucvector_init(&compressed);
+
+    result = zlib_compress(&compressed.data, &compressed.size, data, datasize, zlibsettings);
+    if (result == 0)
+    {
+        result = addChunk(out, "IDAT", compressed.data, compressed.size);
+    }
+
+    ucvector_cleanup(&compressed);
+    return result;
+}
+
+/*Appends an "IEND" chunk with no payload to `out`; returns the error code of addChunk.*/
+static unsigned addChunk_IEND(ucvector* out)
+{
+    return addChunk(out, "IEND", 0, 0);
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*
+Appends a "tEXt" chunk to `out`. The payload is: keyword, a 0 byte, then textstring
+(without its terminating 0).
+keyword: 0-terminated, must be 1 to 79 characters long.
+textstring: 0-terminated text.
+return value: 0 on success, 89 for an invalid keyword size, otherwise the error of addChunk.
+*/
+static unsigned addChunk_tEXt(ucvector* out, const char* keyword, const char* textstring)
+{
+    unsigned error = 0;
+    size_t i;
+    ucvector text;
+    ucvector_init(&text);
+    for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)keyword[i]);
+    if (i < 1 || i > 79)
+    {
+        /*the keyword bytes were already pushed into the temporary vector: release it
+        before bailing out so the early return does not leak*/
+        ucvector_cleanup(&text);
+        return 89; /*error: invalid keyword size*/
+    }
+    ucvector_push_back(&text, 0); /*0 termination char*/
+    for (i = 0; textstring[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)textstring[i]);
+    error = addChunk(out, "tEXt", text.data, text.size);
+    ucvector_cleanup(&text);
+
+    return error;
+}
+
+/*
+Appends a "zTXt" chunk to `out`. The payload is: keyword, a 0 byte, the compression method
+byte (0), then textstring compressed with zlib_compress.
+keyword: 0-terminated, must be 1 to 79 characters long.
+textstring: 0-terminated text; its strlen is what gets compressed.
+zlibsettings: settings passed on to zlib_compress.
+return value: 0 on success, 89 for an invalid keyword size, otherwise the error of
+zlib_compress or addChunk.
+*/
+static unsigned addChunk_zTXt(ucvector* out, const char* keyword, const char* textstring,
+    LodePNGCompressSettings* zlibsettings)
+{
+    unsigned error = 0;
+    ucvector data, compressed;
+    size_t i, textsize = strlen(textstring);
+
+    ucvector_init(&data);
+    ucvector_init(&compressed);
+    for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+    if (i < 1 || i > 79)
+    {
+        /*release both temporaries before the early return; `data` already holds the
+        keyword bytes and would otherwise leak*/
+        ucvector_cleanup(&compressed);
+        ucvector_cleanup(&data);
+        return 89; /*error: invalid keyword size*/
+    }
+    ucvector_push_back(&data, 0); /*0 termination char*/
+    ucvector_push_back(&data, 0); /*compression method: 0*/
+
+    error = zlib_compress(&compressed.data, &compressed.size,
+        (unsigned char*)textstring, textsize, zlibsettings);
+    if (!error)
+    {
+        for (i = 0; i != compressed.size; ++i) ucvector_push_back(&data, compressed.data[i]);
+        error = addChunk(out, "zTXt", data.data, data.size);
+    }
+
+    ucvector_cleanup(&compressed);
+    ucvector_cleanup(&data);
+    return error;
+}
+
+/*
+Appends an "iTXt" chunk to `out`. The payload is: keyword, 0, compression flag, compression
+method (0), langtag, 0, transkey, 0, then the text - zlib-compressed when `compressed` is
+non-zero, verbatim otherwise.
+keyword: 0-terminated, must be 1 to 79 characters long.
+langtag, transkey, textstring: 0-terminated strings.
+zlibsettings: settings passed on to zlib_compress (only used when compressing).
+return value: 0 on success, 89 for an invalid keyword size, otherwise the error of
+zlib_compress or addChunk.
+*/
+static unsigned addChunk_iTXt(ucvector* out, unsigned compressed, const char* keyword, const char* langtag,
+    const char* transkey, const char* textstring, LodePNGCompressSettings* zlibsettings)
+{
+    unsigned error = 0;
+    ucvector data;
+    size_t i, textsize = strlen(textstring);
+
+    ucvector_init(&data);
+
+    for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+    if (i < 1 || i > 79)
+    {
+        /*`data` already holds the keyword bytes: release it so the early return does not leak*/
+        ucvector_cleanup(&data);
+        return 89; /*error: invalid keyword size*/
+    }
+    ucvector_push_back(&data, 0); /*null termination char*/
+    ucvector_push_back(&data, compressed ? 1 : 0); /*compression flag*/
+    ucvector_push_back(&data, 0); /*compression method*/
+    for (i = 0; langtag[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)langtag[i]);
+    ucvector_push_back(&data, 0); /*null termination char*/
+    for (i = 0; transkey[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)transkey[i]);
+    ucvector_push_back(&data, 0); /*null termination char*/
+
+    if (compressed)
+    {
+        ucvector compressed_data;
+        ucvector_init(&compressed_data);
+        error = zlib_compress(&compressed_data.data, &compressed_data.size,
+            (unsigned char*)textstring, textsize, zlibsettings);
+        if (!error)
+        {
+            for (i = 0; i != compressed_data.size; ++i) ucvector_push_back(&data, compressed_data.data[i]);
+        }
+        ucvector_cleanup(&compressed_data);
+    }
+    else /*not compressed*/
+    {
+        for (i = 0; textstring[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)textstring[i]);
+    }
+
+    if (!error) error = addChunk(out, "iTXt", data.data, data.size);
+    ucvector_cleanup(&data);
+    return error;
+}
+
+/*
+Writes the background color of `info` as a "bKGD" chunk, in the layout that belongs to the
+PNG's color type:
+- grey / grey+alpha: background_r as one 16-bit big-endian value
+- RGB / RGBA: background_r, background_g, background_b as 16-bit big-endian values
+- palette: the low byte of background_r, used as palette index
+Any other color type yields an empty payload. Returns the error code of addChunk.
+*/
+static unsigned addChunk_bKGD(ucvector* out, const LodePNGInfo* info)
+{
+    unsigned result;
+    ucvector payload;
+
+    ucvector_init(&payload);
+
+    switch (info->color.colortype)
+    {
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+        ucvector_push_back(&payload, (unsigned char)(info->background_r >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->background_r & 255));
+        break;
+    case LCT_RGB:
+    case LCT_RGBA:
+        ucvector_push_back(&payload, (unsigned char)(info->background_r >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->background_r & 255));
+        ucvector_push_back(&payload, (unsigned char)(info->background_g >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->background_g & 255));
+        ucvector_push_back(&payload, (unsigned char)(info->background_b >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->background_b & 255));
+        break;
+    case LCT_PALETTE:
+        ucvector_push_back(&payload, (unsigned char)(info->background_r & 255)); /*palette index*/
+        break;
+    default:
+        break;
+    }
+
+    result = addChunk(out, "bKGD", payload.data, payload.size);
+    ucvector_cleanup(&payload);
+    return result;
+}
+
+/*
+Writes `time` as a 7-byte "tIME" chunk: year as 16-bit big-endian value, followed by one byte
+each for month, day, hour, minute and second.
+return value: 83 if the temporary buffer cannot be allocated, otherwise the error of addChunk.
+*/
+static unsigned addChunk_tIME(ucvector* out, const LodePNGTime* time)
+{
+    unsigned result;
+    unsigned char* payload = (unsigned char*)lodepng_malloc(7);
+
+    if (payload == 0) return 83; /*alloc fail*/
+
+    /*year: high byte first*/
+    payload[0] = (unsigned char)(time->year >> 8);
+    payload[1] = (unsigned char)(time->year & 255);
+    /*remaining fields are one byte each*/
+    payload[2] = (unsigned char)time->month;
+    payload[3] = (unsigned char)time->day;
+    payload[4] = (unsigned char)time->hour;
+    payload[5] = (unsigned char)time->minute;
+    payload[6] = (unsigned char)time->second;
+
+    result = addChunk(out, "tIME", payload, 7);
+    lodepng_free(payload);
+    return result;
+}
+
+/*
+Writes the physical pixel dimensions of `info` as a "pHYs" chunk: phys_x and phys_y as 32-bit
+values followed by the phys_unit byte. Returns the error code of addChunk.
+*/
+static unsigned addChunk_pHYs(ucvector* out, const LodePNGInfo* info)
+{
+    unsigned result;
+    ucvector payload;
+
+    ucvector_init(&payload);
+    lodepng_add32bitInt(&payload, info->phys_x);
+    lodepng_add32bitInt(&payload, info->phys_y);
+    ucvector_push_back(&payload, info->phys_unit);
+
+    result = addChunk(out, "pHYs", payload.data, payload.size);
+
+    ucvector_cleanup(&payload);
+    return result;
+}
+
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Applies one PNG filter type to a single scanline.
+out: receives `length` filtered bytes (the filter type byte itself is not written here).
+scanline: the `length` unfiltered bytes of the current row.
+prevline: the unfiltered bytes of the previous row, or 0 for the first row; a missing
+          previous row is treated as all zeroes.
+length: number of bytes in the scanline.
+bytewidth: distance in bytes to the corresponding byte of the pixel to the left.
+filterType: 0 None, 1 Sub, 2 Up, 3 Average, 4 Paeth; any other value leaves `out` untouched.
+All differences are stored in unsigned char, so they wrap modulo 256.
+*/
+static void filterScanline(unsigned char* out, const unsigned char* scanline, const unsigned char* prevline,
+    size_t length, size_t bytewidth, unsigned char filterType)
+{
+    size_t i;
+    switch (filterType)
+    {
+    case 0: /*None*/
+        /*plain copy*/
+        for (i = 0; i != length; ++i) out[i] = scanline[i];
+        break;
+    case 1: /*Sub*/
+        /*the first bytewidth bytes have no left neighbour and are copied unchanged*/
+        for (i = 0; i != bytewidth; ++i) out[i] = scanline[i];
+        for (i = bytewidth; i < length; ++i) out[i] = scanline[i] - scanline[i - bytewidth];
+        break;
+    case 2: /*Up*/
+        if (prevline)
+        {
+            for (i = 0; i != length; ++i) out[i] = scanline[i] - prevline[i];
+        }
+        else
+        {
+            /*no row above: subtracting zero is a plain copy*/
+            for (i = 0; i != length; ++i) out[i] = scanline[i];
+        }
+        break;
+    case 3: /*Average*/
+        if (prevline)
+        {
+            /*left neighbour counts as 0 for the first bytewidth bytes, so only half of the byte above is subtracted*/
+            for (i = 0; i != bytewidth; ++i) out[i] = scanline[i] - (prevline[i] >> 1);
+            for (i = bytewidth; i < length; ++i) out[i] = scanline[i] - ((scanline[i - bytewidth] + prevline[i]) >> 1);
+        }
+        else
+        {
+            /*no row above: the average is half of the left neighbour (0 for the first bytewidth bytes)*/
+            for (i = 0; i != bytewidth; ++i) out[i] = scanline[i];
+            for (i = bytewidth; i < length; ++i) out[i] = scanline[i] - (scanline[i - bytewidth] >> 1);
+        }
+        break;
+    case 4: /*Paeth*/
+        if (prevline)
+        {
+            /*paethPredictor(0, prevline[i], 0) is always prevline[i]*/
+            for (i = 0; i != bytewidth; ++i) out[i] = (scanline[i] - prevline[i]);
+            for (i = bytewidth; i < length; ++i)
+            {
+                /*predictor arguments: left, above, upper-left*/
+                out[i] = (scanline[i] - paethPredictor(scanline[i - bytewidth], prevline[i], prevline[i - bytewidth]));
+            }
+        }
+        else
+        {
+            for (i = 0; i != bytewidth; ++i) out[i] = scanline[i];
+            /*paethPredictor(scanline[i - bytewidth], 0, 0) is always scanline[i - bytewidth]*/
+            for (i = bytewidth; i < length; ++i) out[i] = (scanline[i] - scanline[i - bytewidth]);
+        }
+        break;
+    default: return; /*unexisting filter type given*/
+    }
+}
+
+/*
+Rough base-2 logarithm, used where speed matters more than accuracy.
+The argument is first scaled down into the range up to 2 while whole powers of two are counted;
+the remainder is handled by the cubic expansion of ln(f) around f = 1,
+(f-1) - (f-1)^2/2 + (f-1)^3/3, multiplied by 1/ln(2) = 1.442695.
+*/
+static float flog2(float f)
+{
+    float exponent = 0;
+    /*coarse steps: each division by 16 accounts for 4 in the result*/
+    for (; f > 32; f /= 16) exponent += 4;
+    /*fine steps: each halving accounts for 1 in the result*/
+    for (; f > 2; f /= 2) ++exponent;
+    return exponent + 1.442695f * (f * f * f / 3 - 3 * f * f / 2 + 3 * f - 1.83333f);
+}
+
+/*Allocates the five scratch scanlines (one per filter type) of `linebytes` bytes each.
+Returns 0 on success. On allocation failure the buffers obtained so far are released again
+and 83 is returned, so the caller never has to clean up a partially filled array.*/
+static unsigned allocFilterAttempts(unsigned char* attempt[5], size_t linebytes)
+{
+    unsigned type;
+    for (type = 0; type != 5; ++type)
+    {
+        attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+        if (!attempt[type])
+        {
+            while (type != 0) lodepng_free(attempt[--type]);
+            return 83; /*alloc fail*/
+        }
+    }
+    return 0;
+}
+
+/*Releases the five scratch scanlines obtained with allocFilterAttempts.*/
+static void freeFilterAttempts(unsigned char* attempt[5])
+{
+    unsigned type;
+    for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+}
+
+/*
+Filters a whole image: for every scanline a filter type is chosen according to the filter
+strategy, the type is written as first byte of the output row and the filtered bytes follow.
+
+out: output buffer, see size requirement below.
+in: h scanlines of (w * bpp + 7) / 8 bytes each.
+w, h: image dimensions in pixels.
+info: color mode of the data in `in` (determines bpp).
+settings: provides filter_strategy, filter_palette_zero, predefined_filters (one filter type
+          per scanline, read for LFS_PREDEFINED) and the zlib settings used by LFS_BRUTE_FORCE.
+return value: 0 on success, 31 for an invalid color type (bpp == 0), 83 on allocation failure,
+              88 for an unknown filter strategy.
+*/
+static unsigned filter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h,
+    const LodePNGColorMode* info, const LodePNGEncoderSettings* settings)
+{
+    /*
+    For PNG filter method 0
+    out must be a buffer with as size: h + (w * h * bpp + 7) / 8, because there are
+    the scanlines with 1 extra byte per scanline
+    */
+
+    unsigned bpp = lodepng_get_bpp(info);
+    /*the width of a scanline in bytes, not including the filter type*/
+    size_t linebytes = (w * bpp + 7) / 8;
+    /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+    size_t bytewidth = (bpp + 7) / 8;
+    const unsigned char* prevline = 0;
+    unsigned x, y;
+    unsigned error = 0;
+    LodePNGFilterStrategy strategy = settings->filter_strategy;
+
+    /*
+    There is a heuristic called the minimum sum of absolute differences heuristic, suggested by the PNG standard:
+    *  If the image type is Palette, or the bit depth is smaller than 8, then do not filter the image (i.e.
+    use fixed filtering, with the filter None).
+    * (The other case) If the image type is Grayscale or RGB (with or without Alpha), and the bit depth is
+    not smaller than 8, then use adaptive filtering heuristic as follows: independently for each row, apply
+    all five filters and select the filter that produces the smallest sum of absolute values per row.
+    This heuristic is used if filter strategy is LFS_MINSUM and filter_palette_zero is true.
+
+    If filter_palette_zero is true and filter_strategy is not LFS_MINSUM, the above heuristic is followed,
+    but for "the other case", whatever strategy filter_strategy is set to instead of the minimum sum
+    heuristic is used.
+    */
+    if (settings->filter_palette_zero &&
+        (info->colortype == LCT_PALETTE || info->bitdepth < 8)) strategy = LFS_ZERO;
+
+    if (bpp == 0) return 31; /*error: invalid color type*/
+
+    if (strategy == LFS_ZERO)
+    {
+        for (y = 0; y != h; ++y)
+        {
+            size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+            size_t inindex = linebytes * y;
+            out[outindex] = 0; /*filter type byte*/
+            filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, 0);
+            prevline = &in[inindex];
+        }
+    }
+    else if (strategy == LFS_MINSUM)
+    {
+        /*adaptive filtering*/
+        size_t sum[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        size_t smallest = 0;
+        unsigned char type, bestType = 0;
+
+        error = allocFilterAttempts(attempt, linebytes);
+        if (error) return error; /*nothing left to free: the helper rolled back*/
+
+        for (y = 0; y != h; ++y)
+        {
+            /*try the 5 filter types*/
+            for (type = 0; type != 5; ++type)
+            {
+                filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+
+                /*calculate the sum of the result*/
+                sum[type] = 0;
+                if (type == 0)
+                {
+                    for (x = 0; x != linebytes; ++x) sum[type] += (unsigned char)(attempt[type][x]);
+                }
+                else
+                {
+                    for (x = 0; x != linebytes; ++x)
+                    {
+                        /*For differences, each byte should be treated as signed, values above 127 are negative
+                        (converted to signed char). Filtertype 0 isn't a difference though, so use unsigned there.
+                        This means filtertype 0 is almost never chosen, but that is justified.*/
+                        unsigned char s = attempt[type][x];
+                        sum[type] += s < 128 ? s : (255U - s);
+                    }
+                }
+
+                /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+                if (type == 0 || sum[type] < smallest)
+                {
+                    bestType = type;
+                    smallest = sum[type];
+                }
+            }
+
+            prevline = &in[y * linebytes];
+
+            /*now fill the out values*/
+            out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+            for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+        }
+
+        freeFilterAttempts(attempt);
+    }
+    else if (strategy == LFS_ENTROPY)
+    {
+        float sum[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        float smallest = 0;
+        unsigned type, bestType = 0;
+        unsigned count[256];
+
+        error = allocFilterAttempts(attempt, linebytes);
+        if (error) return error; /*nothing left to free: the helper rolled back*/
+
+        for (y = 0; y != h; ++y)
+        {
+            /*try the 5 filter types*/
+            for (type = 0; type != 5; ++type)
+            {
+                filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+                /*histogram of the byte values of this attempt*/
+                for (x = 0; x != 256; ++x) count[x] = 0;
+                for (x = 0; x != linebytes; ++x) ++count[attempt[type][x]];
+                ++count[type]; /*the filter type itself is part of the scanline*/
+                /*entropy estimate of the row: sum of p * log2(1 / p) over the byte values that occur*/
+                sum[type] = 0;
+                for (x = 0; x != 256; ++x)
+                {
+                    float p = count[x] / (float)(linebytes + 1);
+                    sum[type] += count[x] == 0 ? 0 : flog2(1 / p) * p;
+                }
+                /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+                if (type == 0 || sum[type] < smallest)
+                {
+                    bestType = type;
+                    smallest = sum[type];
+                }
+            }
+
+            prevline = &in[y * linebytes];
+
+            /*now fill the out values*/
+            out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+            for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+        }
+
+        freeFilterAttempts(attempt);
+    }
+    else if (strategy == LFS_PREDEFINED)
+    {
+        for (y = 0; y != h; ++y)
+        {
+            size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+            size_t inindex = linebytes * y;
+            unsigned char type = settings->predefined_filters[y];
+            out[outindex] = type; /*filter type byte*/
+            filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, type);
+            prevline = &in[inindex];
+        }
+    }
+    else if (strategy == LFS_BRUTE_FORCE)
+    {
+        /*brute force filter chooser.
+        deflate the scanline after every filter attempt to see which one deflates best.
+        This is very slow and gives only slightly smaller, sometimes even larger, result*/
+        size_t size[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        size_t smallest = 0;
+        unsigned type = 0, bestType = 0;
+        unsigned char* dummy;
+        LodePNGCompressSettings zlibsettings = settings->zlibsettings;
+        /*use fixed tree on the attempts so that the tree is not adapted to the filtertype on purpose,
+        to simulate the true case where the tree is the same for the whole image. Sometimes it gives
+        better result with dynamic tree anyway. Using the fixed tree sometimes gives worse, but in rare
+        cases better compression. It does make this a bit less slow, so it's worth doing this.*/
+        zlibsettings.btype = 1;
+        /*a custom encoder likely doesn't read the btype setting and is optimized for complete PNG
+        images only, so disable it*/
+        zlibsettings.custom_zlib = 0;
+        zlibsettings.custom_deflate = 0;
+
+        error = allocFilterAttempts(attempt, linebytes);
+        if (error) return error; /*nothing left to free: the helper rolled back*/
+
+        for (y = 0; y != h; ++y) /*try the 5 filter types*/
+        {
+            for (type = 0; type != 5; ++type)
+            {
+                unsigned testsize = linebytes;
+                /*if(testsize > 8) testsize /= 8;*/ /*it already works good enough by testing a part of the row*/
+
+                filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+                size[type] = 0;
+                dummy = 0;
+                /*only the compressed size matters here; the compressed bytes are thrown away*/
+                zlib_compress(&dummy, &size[type], attempt[type], testsize, &zlibsettings);
+                lodepng_free(dummy);
+                /*check if this is smallest size (or if type == 0 it's the first case so always store the values)*/
+                if (type == 0 || size[type] < smallest)
+                {
+                    bestType = type;
+                    smallest = size[type];
+                }
+            }
+            prevline = &in[y * linebytes];
+            out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+            for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+        }
+
+        freeFilterAttempts(attempt);
+    }
+    else return 88; /* unknown filter strategy */
+
+    return error;
+}
+
+/*
+The opposite of the removePaddingBits function: copies h scanlines of ilinebits bits each from
+`in` to `out`, where every output scanline occupies olinebits bits. The extra bits at the end of
+each output line are written as 0.
+olinebits must be >= ilinebits
+*/
+static void addPaddingBits(unsigned char* out, const unsigned char* in,
+    size_t olinebits, size_t ilinebits, unsigned h)
+{
+    const size_t padbits = olinebits - ilinebits;
+    size_t outpos = 0, inpos = 0; /*bit positions in out and in*/
+    unsigned row;
+
+    for (row = 0; row != h; ++row)
+    {
+        size_t remaining;
+
+        /*payload bits of this scanline*/
+        for (remaining = ilinebits; remaining != 0; --remaining)
+        {
+            unsigned char bit = readBitFromReversedStream(&inpos, in);
+            setBitOfReversedStream(&outpos, out, bit);
+        }
+
+        /*the padding is written explicitly rather than skipped, to avoid
+        "Use of uninitialised value of size ###" warning from valgrind*/
+        for (remaining = padbits; remaining != 0; --remaining)
+        {
+            setBitOfReversedStream(&outpos, out, 0);
+        }
+    }
+}
+
+/*
+Re-orders the pixels of a non-interlaced image into the seven Adam7 passes (reduced images).
+
+in: non-interlaced image with size w*h. It has no padding bits at all, not between scanlines
+    and not anywhere else; its size in bits is w * h * bpp.
+out: the same pixels, but re-ordered according to PNG's Adam7 interlacing, with
+     no padding bits between scanlines, but with padding bits between reduced images so that
+     each reduced image starts at a byte. out is possibly bigger than in due to that padding.
+w, h: dimensions of the full image in pixels.
+bpp: bits per pixel
+NOTE: comments about padding bits are only relevant if bpp < 8
+
+The size and start offset of every pass come from Adam7_getpassvalues; only the passw, passh
+and passstart results are used here.
+*/
+static void Adam7_interlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned i;
+
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    if (bpp >= 8)
+    {
+        /*whole bytes per pixel: copy pixel by pixel with byte offsets*/
+        for (i = 0; i != 7; ++i)
+        {
+            unsigned x, y, b;
+            size_t bytewidth = bpp / 8;
+            for (y = 0; y < passh[i]; ++y)
+                for (x = 0; x < passw[i]; ++x)
+                {
+                    /*source: pixel (ADAM7_IX[i] + x * ADAM7_DX[i], ADAM7_IY[i] + y * ADAM7_DY[i]) of the full image*/
+                    size_t pixelinstart = ((ADAM7_IY[i] + y * ADAM7_DY[i]) * w + ADAM7_IX[i] + x * ADAM7_DX[i]) * bytewidth;
+                    /*destination: pixel (x, y) of pass i, which begins at byte passstart[i]*/
+                    size_t pixeloutstart = passstart[i] + (y * passw[i] + x) * bytewidth;
+                    for (b = 0; b < bytewidth; ++b)
+                    {
+                        out[pixeloutstart + b] = in[pixelinstart + b];
+                    }
+                }
+        }
+    }
+    else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/
+    {
+        for (i = 0; i != 7; ++i)
+        {
+            unsigned x, y, b;
+            unsigned ilinebits = bpp * passw[i]; /*bits per scanline of pass i (used to address out)*/
+            unsigned olinebits = bpp * w; /*bits per scanline of the full image (used to address in)*/
+            size_t obp, ibp; /*bit pointers (for out and in buffer)*/
+            for (y = 0; y < passh[i]; ++y)
+                for (x = 0; x < passw[i]; ++x)
+                {
+                    ibp = (ADAM7_IY[i] + y * ADAM7_DY[i]) * olinebits + (ADAM7_IX[i] + x * ADAM7_DX[i]) * bpp;
+                    obp = (8 * passstart[i]) + (y * ilinebits + x * bpp);
+                    /*copy the bpp bits of this pixel one at a time*/
+                    for (b = 0; b < bpp; ++b)
+                    {
+                        unsigned char bit = readBitFromReversedStream(&ibp, in);
+                        setBitOfReversedStream(&obp, out, bit);
+                    }
+                }
+        }
+    }
+}
+
+/*
+Converts the raw 2D image `in` (already in the PNG's color type) into the
+filtered, padded and, when requested, Adam7-interlaced byte stream that is
+later compressed into the IDAT chunk.
+out: receives a buffer allocated with lodepng_malloc; this function never frees
+  it, so the caller owns it and must lodepng_free it, also when an error is returned.
+outsize: receives the size of that buffer in bytes.
+in: must contain the full image of w x h pixels.
+return value is the error code: 0 on success, 83 on allocation failure, or the
+error reported by filter().
+*/
+static unsigned preProcessScanlines(unsigned char** out, size_t* outsize, const unsigned char* in,
+    unsigned w, unsigned h,
+    const LodePNGInfo* info_png, const LodePNGEncoderSettings* settings)
+{
+    /*
+    This function converts the pure 2D image with the PNG's colortype, into filtered-padded-interlaced data. Steps:
+    *) if no Adam7: 1) add padding bits (= possible extra bits per scanline if bpp < 8) 2) filter
+    *) if adam7: 1) Adam7_interlace 2) 7x add padding bits 3) 7x filter
+    */
+    unsigned bpp = lodepng_get_bpp(&info_png->color);
+    unsigned error = 0;
+
+    if (info_png->interlace_method == 0)
+    {
+        *outsize = h + (h * ((w * bpp + 7) / 8)); /*image size plus an extra byte per scanline + possible padding bits*/
+        *out = (unsigned char*)lodepng_malloc(*outsize);
+        if (!(*out) && (*outsize)) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            /*non multiple of 8 bits per scanline, padding bits needed per scanline*/
+            if (bpp < 8 && w * bpp != ((w * bpp + 7) / 8) * 8)
+            {
+                unsigned char* padded = (unsigned char*)lodepng_malloc(h * ((w * bpp + 7) / 8));
+                if (!padded) error = 83; /*alloc fail*/
+                if (!error)
+                {
+                    addPaddingBits(padded, in, ((w * bpp + 7) / 8) * 8, w * bpp, h);
+                    error = filter(*out, padded, w, h, &info_png->color, settings);
+                }
+                lodepng_free(padded);
+            }
+            else
+            {
+                /*we can immediately filter into the out buffer, no other steps needed*/
+                error = filter(*out, in, w, h, &info_png->color, settings);
+            }
+        }
+    }
+    else /*interlace_method is 1 (Adam7)*/
+    {
+        unsigned passw[7], passh[7];
+        size_t filter_passstart[8], padded_passstart[8], passstart[8];
+        unsigned char* adam7;
+
+        Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+        *outsize = filter_passstart[7]; /*image size plus an extra byte per scanline + possible padding bits*/
+        *out = (unsigned char*)lodepng_malloc(*outsize);
+        /*a NULL result only means failure when a non-zero size was requested (same rule as the other checks here)*/
+        if (!(*out) && (*outsize)) error = 83; /*alloc fail*/
+
+        adam7 = (unsigned char*)lodepng_malloc(passstart[7]);
+        if (!adam7 && passstart[7]) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            unsigned i;
+
+            Adam7_interlace(adam7, in, w, h, bpp);
+            for (i = 0; i != 7; ++i)
+            {
+                if (bpp < 8)
+                {
+                    size_t padded_size = padded_passstart[i + 1] - padded_passstart[i];
+                    unsigned char* padded = (unsigned char*)lodepng_malloc(padded_size);
+                    /*small images have empty passes (size 0); malloc may return NULL for a
+                    zero-size request, which must not be reported as an allocation failure*/
+                    if (!padded && padded_size) ERROR_BREAK(83); /*alloc fail*/
+                    addPaddingBits(padded, &adam7[passstart[i]],
+                        ((passw[i] * bpp + 7) / 8) * 8, passw[i] * bpp, passh[i]);
+                    error = filter(&(*out)[filter_passstart[i]], padded,
+                        passw[i], passh[i], &info_png->color, settings);
+                    lodepng_free(padded);
+                }
+                else
+                {
+                    error = filter(&(*out)[filter_passstart[i]], &adam7[padded_passstart[i]],
+                        passw[i], passh[i], &info_png->color, settings);
+                }
+
+                if (error) break;
+            }
+        }
+
+        lodepng_free(adam7);
+    }
+
+    return error;
+}
+
+/*
+Classifies the transparency of a palette.
+palette must have 4 * palettesize bytes allocated, and given in format RGBARGBARGBARGBA...
+returns 0 if the palette is opaque,
+returns 1 if the palette has a single color with alpha 0 ==> color key
+  (and no opaque entry shares that color's RGB value),
+returns 2 if the palette is semi-translucent, i.e. anything else: an alpha value
+  strictly between 0 and 255, several different fully transparent colors, or an
+  opaque entry with the same RGB as the fully transparent one.
+*/
+static unsigned getPaletteTranslucency(const unsigned char* palette, size_t palettesize)
+{
+    size_t i;
+    unsigned key = 0;
+    unsigned r = 0, g = 0, b = 0; /*the value of the color with alpha 0, so long as color keying is possible*/
+    for (i = 0; i != palettesize; ++i)
+    {
+        const unsigned char* entry = &palette[4 * i];
+        if (entry[3] == 0)
+        {
+            if (!key)
+            {
+                r = entry[0]; g = entry[1]; b = entry[2];
+                key = 1;
+                i = (size_t)(-1); /*restart from beginning, to detect earlier opaque colors with key's value*/
+            }
+            /*the key entry itself is seen again after the restart and must be accepted;
+            only a fully transparent entry with a different RGB rules out color keying*/
+            else if (r != entry[0] || g != entry[1] || b != entry[2]) return 2;
+        }
+        else if (entry[3] != 255) return 2;
+        /*when key, no opaque RGB may have key's RGB*/
+        else if (key && r == entry[0] && g == entry[1] && b == entry[2]) return 2;
+    }
+    return key;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+Appends every chunk stored back to back in data (datasize bytes in total) to the
+output vector. Returns 0 on success, or the first error reported by
+lodepng_chunk_append.
+*/
+static unsigned addUnknownChunks(ucvector* out, unsigned char* data, size_t datasize)
+{
+    unsigned char* chunk;
+    for (chunk = data; (size_t)(chunk - data) < datasize; chunk = lodepng_chunk_next(chunk))
+    {
+        CERROR_TRY_RETURN(lodepng_chunk_append(&out->data, &out->size, chunk));
+        /*the append call updated data and size directly; keep allocsize in step with size*/
+        out->allocsize = out->size;
+    }
+    return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Encodes the raw image into a complete PNG stream, using the color modes, chunk
+information and encoder settings stored in state.
+out: receives the buffer with the PNG data; release it with lodepng_free. It is
+  0 when input validation fails; when a later step fails it is whatever was
+  written so far (possibly 0).
+outsize: receives the number of bytes in *out.
+image: raw pixel data of w x h pixels in the state->info_raw color mode.
+Returns the error code, which is also stored in state->error (0 = success).
+*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGState* state)
+{
+    LodePNGInfo info;
+    ucvector outv;
+    unsigned char* data = 0; /*uncompressed version of the IDAT chunk data*/
+    size_t datasize = 0;
+
+    /*provide some proper output values if error will happen*/
+    *out = 0;
+    *outsize = 0;
+    state->error = 0;
+
+    /*check input values validity*/
+    if ((state->info_png.color.colortype == LCT_PALETTE || state->encoder.force_palette)
+        && (state->info_png.color.palettesize == 0 || state->info_png.color.palettesize > 256))
+    {
+        CERROR_RETURN_ERROR(state->error, 68); /*invalid palette size, it is only allowed to be 1-256*/
+    }
+    if (state->encoder.zlibsettings.btype > 2)
+    {
+        CERROR_RETURN_ERROR(state->error, 61); /*error: unexisting btype*/
+    }
+    if (state->info_png.interlace_method > 1)
+    {
+        CERROR_RETURN_ERROR(state->error, 71); /*error: unexisting interlace mode*/
+    }
+    state->error = checkColorValidity(state->info_png.color.colortype, state->info_png.color.bitdepth);
+    if (state->error) return state->error; /*error: unexisting color type given*/
+    state->error = checkColorValidity(state->info_raw.colortype, state->info_raw.bitdepth);
+    if (state->error) return state->error; /*error: unexisting color type given*/
+
+    /* color convert and compute scanline filter types */
+    lodepng_info_init(&info);
+    lodepng_info_copy(&info, &state->info_png);
+    if (state->encoder.auto_convert)
+    {
+        state->error = lodepng_auto_choose_color(&info.color, image, w, h, &state->info_raw);
+    }
+    if (!state->error)
+    {
+        if (!lodepng_color_mode_equal(&state->info_raw, &info.color))
+        {
+            unsigned char* converted;
+            /*widen before multiplying so that w * h cannot wrap around in unsigned arithmetic*/
+            size_t size = ((size_t)w * (size_t)h * (size_t)lodepng_get_bpp(&info.color) + 7) / 8;
+
+            converted = (unsigned char*)lodepng_malloc(size);
+            if (!converted && size) state->error = 83; /*alloc fail*/
+            if (!state->error)
+            {
+                state->error = lodepng_convert(converted, image, &info.color, &state->info_raw, w, h);
+            }
+            /*keep the result: a failed allocation or filter must not be followed by writing an IDAT chunk*/
+            if (!state->error)
+            {
+                state->error = preProcessScanlines(&data, &datasize, converted, w, h, &info, &state->encoder);
+            }
+            lodepng_free(converted);
+        }
+        else
+        {
+            state->error = preProcessScanlines(&data, &datasize, image, w, h, &info, &state->encoder);
+        }
+    }
+
+    /* output all PNG chunks */
+    ucvector_init(&outv);
+    while (!state->error) /*while only executed once, to break on error*/
+    {
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        size_t i;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*write signature and chunks*/
+        writeSignature(&outv);
+        /*IHDR*/
+        addChunk_IHDR(&outv, w, h, info.color.colortype, info.color.bitdepth, info.interlace_method);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*unknown chunks between IHDR and PLTE*/
+        if (info.unknown_chunks_data[0])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[0], info.unknown_chunks_size[0]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*PLTE*/
+        if (info.color.colortype == LCT_PALETTE)
+        {
+            addChunk_PLTE(&outv, &info.color);
+        }
+        if (state->encoder.force_palette && (info.color.colortype == LCT_RGB || info.color.colortype == LCT_RGBA))
+        {
+            addChunk_PLTE(&outv, &info.color);
+        }
+        /*tRNS*/
+        if (info.color.colortype == LCT_PALETTE && getPaletteTranslucency(info.color.palette, info.color.palettesize) != 0)
+        {
+            addChunk_tRNS(&outv, &info.color);
+        }
+        if ((info.color.colortype == LCT_GREY || info.color.colortype == LCT_RGB) && info.color.key_defined)
+        {
+            addChunk_tRNS(&outv, &info.color);
+        }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*bKGD (must come between PLTE and the IDAT chunks)*/
+        if (info.background_defined) addChunk_bKGD(&outv, &info);
+        /*pHYs (must come before the IDAT chunks)*/
+        if (info.phys_defined) addChunk_pHYs(&outv, &info);
+
+        /*unknown chunks between PLTE and IDAT*/
+        if (info.unknown_chunks_data[1])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[1], info.unknown_chunks_size[1]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*IDAT (multiple IDAT chunks must be consecutive)*/
+        state->error = addChunk_IDAT(&outv, data, datasize, &state->encoder.zlibsettings);
+        if (state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*tIME*/
+        if (info.time_defined) addChunk_tIME(&outv, &info.time);
+        /*tEXt and/or zTXt*/
+        for (i = 0; i != info.text_num; ++i)
+        {
+            if (strlen(info.text_keys[i]) > 79)
+            {
+                state->error = 66; /*text chunk too large*/
+                break;
+            }
+            if (strlen(info.text_keys[i]) < 1)
+            {
+                state->error = 67; /*text chunk too small*/
+                break;
+            }
+            if (state->encoder.text_compression)
+            {
+                addChunk_zTXt(&outv, info.text_keys[i], info.text_strings[i], &state->encoder.zlibsettings);
+            }
+            else
+            {
+                addChunk_tEXt(&outv, info.text_keys[i], info.text_strings[i]);
+            }
+        }
+        /*LodePNG version id in text chunk*/
+        if (state->encoder.add_id)
+        {
+            unsigned alread_added_id_text = 0;
+            for (i = 0; i != info.text_num; ++i)
+            {
+                if (!strcmp(info.text_keys[i], "LodePNG"))
+                {
+                    alread_added_id_text = 1;
+                    break;
+                }
+            }
+            if (alread_added_id_text == 0)
+            {
+                addChunk_tEXt(&outv, "LodePNG", LODEPNG_VERSION_STRING); /*it's shorter as tEXt than as zTXt chunk*/
+            }
+        }
+        /*iTXt*/
+        for (i = 0; i != info.itext_num; ++i)
+        {
+            if (strlen(info.itext_keys[i]) > 79)
+            {
+                state->error = 66; /*text chunk too large*/
+                break;
+            }
+            if (strlen(info.itext_keys[i]) < 1)
+            {
+                state->error = 67; /*text chunk too small*/
+                break;
+            }
+            addChunk_iTXt(&outv, state->encoder.text_compression,
+                info.itext_keys[i], info.itext_langtags[i], info.itext_transkeys[i], info.itext_strings[i],
+                &state->encoder.zlibsettings);
+        }
+
+        /*unknown chunks between IDAT and IEND*/
+        if (info.unknown_chunks_data[2])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[2], info.unknown_chunks_size[2]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        addChunk_IEND(&outv);
+
+        break; /*this isn't really a while loop; no error happened so break out now!*/
+    }
+
+    lodepng_info_cleanup(&info);
+    lodepng_free(data);
+    /*instead of cleaning the vector up, give it to the output*/
+    *out = outv.data;
+    *outsize = outv.size;
+
+    return state->error;
+}
+
+/*
+Encodes raw pixels to a PNG in memory with a temporary state in which both the
+raw color mode and the PNG color mode are set to the given colortype and
+bitdepth. Returns the error code that lodepng_encode left in that state.
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize, const unsigned char* image,
+    unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+    LodePNGState state;
+    unsigned result;
+
+    lodepng_state_init(&state);
+    state.info_png.color.colortype = colortype;
+    state.info_png.color.bitdepth = bitdepth;
+    state.info_raw.colortype = colortype;
+    state.info_raw.bitdepth = bitdepth;
+
+    lodepng_encode(out, outsize, image, w, h, &state);
+    result = state.error; /*read the error before the state is cleaned up*/
+    lodepng_state_cleanup(&state);
+
+    return result;
+}
+
+/*Same as lodepng_encode_memory, with the color type fixed to LCT_RGBA and the bit depth to 8.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{
+    return lodepng_encode_memory(out, outsize, image, w, h, LCT_RGBA, 8);
+}
+
+/*Same as lodepng_encode_memory, with the color type fixed to LCT_RGB and the bit depth to 8.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{
+    return lodepng_encode_memory(out, outsize, image, w, h, LCT_RGB, 8);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Encodes the raw image to PNG in memory and, if that succeeded, writes the
+result to the given file. The temporary buffer is always released.
+Returns the error code of the first step that failed, or 0.
+*/
+unsigned lodepng_encode_file(const char* filename, const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth)
+{
+    unsigned char* png;
+    size_t pngsize;
+    unsigned error;
+
+    error = lodepng_encode_memory(&png, &pngsize, image, w, h, colortype, bitdepth);
+    if (error == 0)
+    {
+        error = lodepng_save_file(png, pngsize, filename);
+    }
+    lodepng_free(png);
+
+    return error;
+}
+
+/*Same as lodepng_encode_file, with the color type fixed to LCT_RGBA and the bit depth to 8.*/
+unsigned lodepng_encode32_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{
+    return lodepng_encode_file(filename, image, w, h, LCT_RGBA, 8);
+}
+
+/*Same as lodepng_encode_file, with the color type fixed to LCT_RGB and the bit depth to 8.*/
+unsigned lodepng_encode24_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{
+    return lodepng_encode_file(filename, image, w, h, LCT_RGB, 8);
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/*
+Fills in the default encoder settings: default zlib compression settings,
+automatic color conversion on, LFS_MINSUM filter strategy with filter type zero
+for palette images, no forced palette and no predefined filters. With ancillary
+chunks compiled in, text compression is on and the LodePNG id chunk is off.
+*/
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings)
+{
+    lodepng_compress_settings_init(&settings->zlibsettings);
+
+    /*color handling*/
+    settings->auto_convert = 1;
+    settings->force_palette = 0;
+
+    /*scanline filtering*/
+    settings->filter_strategy = LFS_MINSUM;
+    settings->filter_palette_zero = 1;
+    settings->predefined_filters = 0;
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    settings->text_compression = 1;
+    settings->add_id = 0;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*
+Returns the English description of a numerical error code. The table below also
+serves as the documentation of all the error codes. Codes that are not listed
+yield "unknown error code".
+*/
+const char* lodepng_error_text(unsigned code)
+{
+    static const struct { unsigned code; const char* text; } descriptions[] =
+    {
+        { 0, "no error, everything went ok" },
+        /*the Encoder/Decoder has done nothing yet, error checking makes no sense yet*/
+        { 1, "nothing done yet" },
+        /*while huffman decoding*/
+        { 10, "end of input memory reached without huffman end code" },
+        /*while huffman decoding*/
+        { 11, "error in code tree made it jump outside of huffman tree" },
+        { 13, "problem while processing dynamic deflate block" },
+        { 14, "problem while processing dynamic deflate block" },
+        { 15, "problem while processing dynamic deflate block" },
+        { 16, "unexisting code while processing dynamic deflate block" },
+        { 17, "end of out buffer memory reached while inflating" },
+        { 18, "invalid distance code while inflating" },
+        { 19, "end of out buffer memory reached while inflating" },
+        { 20, "invalid deflate block BTYPE encountered while decoding" },
+        { 21, "NLEN is not ones complement of LEN in a deflate block" },
+        /*end of out buffer memory reached while inflating:
+        This can happen if the inflated deflate data is longer than the amount of bytes required to fill up
+        all the pixels of the image, given the color depth and image dimensions. Something that doesn't
+        happen in a normal, well encoded, PNG image.*/
+        { 22, "end of out buffer memory reached while inflating" },
+        { 23, "end of in buffer memory reached while inflating" },
+        { 24, "invalid FCHECK in zlib header" },
+        { 25, "invalid compression method in zlib header" },
+        { 26, "FDICT encountered in zlib header while it's not used for PNG" },
+        { 27, "PNG file is smaller than a PNG header" },
+        /*Checks the magic file header, the first 8 bytes of the PNG file*/
+        { 28, "incorrect PNG signature, it's no PNG or corrupted" },
+        { 29, "first chunk is not the header chunk" },
+        { 30, "chunk length too large, chunk broken off at end of file" },
+        { 31, "illegal PNG color type or bpp" },
+        { 32, "illegal PNG compression method" },
+        { 33, "illegal PNG filter method" },
+        { 34, "illegal PNG interlace method" },
+        { 35, "chunk length of a chunk is too large or the chunk too small" },
+        { 36, "illegal PNG filter type encountered" },
+        { 37, "illegal bit depth for this color type given" },
+        /*more than 256 colors*/
+        { 38, "the palette is too big" },
+        { 39, "more palette alpha values given in tRNS chunk than there are colors in the palette" },
+        { 40, "tRNS chunk has wrong size for greyscale image" },
+        { 41, "tRNS chunk has wrong size for RGB image" },
+        { 42, "tRNS chunk appeared while it was not allowed for this color type" },
+        { 43, "bKGD chunk has wrong size for palette image" },
+        { 44, "bKGD chunk has wrong size for greyscale image" },
+        { 45, "bKGD chunk has wrong size for RGB image" },
+        { 48, "empty input buffer given to decoder. Maybe caused by non-existing file?" },
+        { 49, "jumped past memory while generating dynamic huffman tree" },
+        { 50, "jumped past memory while generating dynamic huffman tree" },
+        { 51, "jumped past memory while inflating huffman block" },
+        { 52, "jumped past memory while inflating" },
+        { 53, "size of zlib data too small" },
+        { 54, "repeat symbol in tree while there was no value symbol yet" },
+        /*jumped past tree while generating huffman tree, this could be when the
+        tree will have more leaves than symbols after generating it out of the
+        given lengths. They call this an oversubscribed dynamic bit lengths tree in zlib.*/
+        { 55, "jumped past tree while generating huffman tree" },
+        { 56, "given output image colortype or bitdepth not supported for color conversion" },
+        { 57, "invalid CRC encountered (checking CRC can be disabled)" },
+        { 58, "invalid ADLER32 encountered (checking ADLER32 can be disabled)" },
+        { 59, "requested color conversion not supported" },
+        { 60, "invalid window size given in the settings of the encoder (must be 0-32768)" },
+        { 61, "invalid BTYPE given in the settings of the encoder (only 0, 1 and 2 are allowed)" },
+        /*LodePNG leaves the choice of RGB to greyscale conversion formula to the user.*/
+        { 62, "conversion from color to greyscale not supported" },
+        /*(2^31-1)*/
+        { 63, "length of a chunk too long, max allowed for PNG is 2147483647 bytes per chunk" },
+        /*this would result in the inability of a deflated block to ever contain an end code. It must be at least 1.*/
+        { 64, "the length of the END symbol 256 in the Huffman tree is 0" },
+        { 66, "the length of a text chunk keyword given to the encoder is longer than the maximum of 79 bytes" },
+        { 67, "the length of a text chunk keyword given to the encoder is smaller than the minimum of 1 byte" },
+        { 68, "tried to encode a PLTE chunk with a palette that has less than 1 or more than 256 colors" },
+        { 69, "unknown chunk type with 'critical' flag encountered by the decoder" },
+        { 71, "unexisting interlace mode given to encoder (must be 0 or 1)" },
+        { 72, "while decoding, unexisting compression method encountering in zTXt or iTXt chunk (it must be 0)" },
+        { 73, "invalid tIME chunk size" },
+        { 74, "invalid pHYs chunk size" },
+        /*length could be wrong, or data chopped off*/
+        { 75, "no null termination char found while decoding text chunk" },
+        { 76, "iTXt chunk too short to contain required bytes" },
+        { 77, "integer overflow in buffer size" },
+        /*file doesn't exist or couldn't be opened for reading*/
+        { 78, "failed to open file for reading" },
+        { 79, "failed to open file for writing" },
+        { 80, "tried creating a tree of 0 symbols" },
+        { 81, "lazy matching at pos 0 is impossible" },
+        { 82, "color conversion to palette requested while a color isn't in palette" },
+        { 83, "memory allocation failed" },
+        { 84, "given image too small to contain all pixels to be encoded" },
+        { 86, "impossible offset in lz77 encoding (internal bug)" },
+        { 87, "must provide custom zlib function pointer if LODEPNG_COMPILE_ZLIB is not defined" },
+        { 88, "invalid filter strategy given for LodePNGEncoderSettings.filter_strategy" },
+        { 89, "text chunk keyword too short or long: must have size 1-79" },
+        /*the windowsize in the LodePNGCompressSettings. Requiring POT(==> & instead of %) makes encoding 12% faster.*/
+        { 90, "windowsize must be a power of two" },
+        { 91, "invalid decompressed idat size" },
+        { 92, "too many pixels, not supported" },
+        { 93, "zero width or height is invalid" },
+        { 94, "header chunk must have a size of 13 bytes" }
+    };
+    size_t i;
+
+    for (i = 0; i != sizeof(descriptions) / sizeof(descriptions[0]); ++i)
+    {
+        if (descriptions[i].code == code) return descriptions[i].text;
+    }
+    return "unknown error code";
+}
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // C++ Wrapper                                                          // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Resizes buffer to the size of the named file and reads the file into it.
+    Returns 78 when the file size cannot be determined, 0 for an empty file,
+    otherwise the result of lodepng_buffer_file.
+    */
+    unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename)
+    {
+        const long filesize = lodepng_filesize(filename.c_str());
+        if (filesize < 0) return 78;
+
+        buffer.resize((size_t)filesize);
+        if (filesize == 0) return 0; /*nothing to read, and &buffer[0] would be invalid*/
+
+        return lodepng_buffer_file(&buffer[0], (size_t)filesize, filename.c_str());
+    }
+
+    /*Writes the buffer to the file through lodepng_save_file: the file is overwritten, not appended to.*/
+    unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename)
+    {
+        /*an empty vector has no first element to take the address of*/
+        const unsigned char* bytes = buffer.empty() ? 0 : &buffer[0];
+        return lodepng_save_file(bytes, buffer.size(), filename.c_str());
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+    /*
+    Zlib-decompresses insize bytes from in and appends the result to out.
+    Whatever buffer zlib_decompress hands back is appended and freed, whether or
+    not an error was reported. Returns the error code of zlib_decompress.
+    */
+    unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGDecompressSettings& settings)
+    {
+        unsigned char* inflated = 0;
+        size_t inflatedsize = 0;
+        const unsigned error = zlib_decompress(&inflated, &inflatedsize, in, insize, &settings);
+        if (!inflated) return error;
+
+        out.insert(out.end(), inflated, inflated + inflatedsize);
+        lodepng_free(inflated);
+        return error;
+    }
+
+    /*Vector convenience overload: forwards the contents of in (a null pointer when empty) to the pointer overload.*/
+    unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGDecompressSettings& settings)
+    {
+        const unsigned char* bytes = in.empty() ? 0 : &in[0];
+        return decompress(out, bytes, in.size(), settings);
+    }
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /*
+    Zlib-compresses insize bytes from in and appends the result to out.
+    Whatever buffer zlib_compress hands back is appended and freed, whether or
+    not an error was reported. Returns the error code of zlib_compress.
+    */
+    unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGCompressSettings& settings)
+    {
+        unsigned char* deflated = 0;
+        size_t deflatedsize = 0;
+        const unsigned error = zlib_compress(&deflated, &deflatedsize, in, insize, &settings);
+        if (!deflated) return error;
+
+        out.insert(out.end(), deflated, deflated + deflatedsize);
+        lodepng_free(deflated);
+        return error;
+    }
+
+    /*Vector convenience overload: forwards the contents of in (a null pointer when empty) to the pointer overload.*/
+    unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGCompressSettings& settings)
+    {
+        const unsigned char* bytes = in.empty() ? 0 : &in[0];
+        return compress(out, bytes, in.size(), settings);
+    }
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+
+
+#ifdef LODEPNG_COMPILE_PNG
+
+    /*Default constructor: initializes this object with lodepng_state_init.*/
+    State::State()
+    {
+        lodepng_state_init(this);
+    }
+
+    /*Copy constructor: initializes this object first, then copies other into it with lodepng_state_copy.*/
+    State::State(const State& other)
+    {
+        lodepng_state_init(this);
+        lodepng_state_copy(this, &other);
+    }
+
+    /*Destructor: hands this object to lodepng_state_cleanup.*/
+    State::~State()
+    {
+        lodepng_state_cleanup(this);
+    }
+
+    /*
+    Copy assignment: replaces the contents of this state with a copy of other
+    through lodepng_state_copy, and returns *this.
+    */
+    State& State::operator=(const State& other)
+    {
+        /*Self-assignment is a no-op. NOTE(review): lodepng_state_copy is defined outside
+        this section; it presumably releases the destination before copying, which would
+        lose data when source and destination are the same object - confirm.*/
+        if (this != &other) lodepng_state_copy(this, &other);
+        return *this;
+    }
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+    /*
+    Decodes the PNG in the in buffer (insize bytes) to raw pixels of the requested
+    color type and bit depth, and appends those pixels to out. w and h receive
+    the image dimensions. Returns the error code of lodepng_decode_memory.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const unsigned char* in,
+        size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+    {
+        unsigned char* buffer = 0;
+        unsigned error = lodepng_decode_memory(&buffer, &w, &h, in, insize, colortype, bitdepth);
+        if (buffer && !error)
+        {
+            /*a temporary state is only used to compute the raw size for this color mode*/
+            State state;
+            state.info_raw.colortype = colortype;
+            state.info_raw.bitdepth = bitdepth;
+            size_t buffersize = lodepng_get_raw_size(w, h, &state.info_raw);
+            out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+        }
+        /*free unconditionally, like the State-based overload does, so that a buffer
+        returned together with an error is not leaked*/
+        lodepng_free(buffer);
+        return error;
+    }
+
+    /*Vector convenience overload: forwards the contents of in (a null pointer when empty) to the pointer overload.*/
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::vector<unsigned char>& in, LodePNGColorType colortype, unsigned bitdepth)
+    {
+        /*the callee takes a size_t, so pass the size as is instead of narrowing it to unsigned*/
+        return decode(out, w, h, in.empty() ? 0 : &in[0], in.size(), colortype, bitdepth);
+    }
+
+    /*
+    Decodes the PNG in the in buffer (insize bytes) with the given state and, on
+    success, appends the raw pixels (sized according to state.info_raw) to out.
+    w and h receive the image dimensions. Returns the error code of lodepng_decode.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const unsigned char* in, size_t insize)
+    {
+        unsigned char* pixels = NULL;
+        const unsigned error = lodepng_decode(&pixels, &w, &h, &state, in, insize);
+        if (!error && pixels)
+        {
+            const size_t pixelbytes = lodepng_get_raw_size(w, h, &state.info_raw);
+            out.insert(out.end(), pixels, pixels + pixelbytes);
+        }
+        lodepng_free(pixels); /*released on both the success and the error path*/
+        return error;
+    }
+
+    /*Vector convenience overload: forwards the contents of in (a null pointer when empty) to the State-based pointer overload.*/
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const std::vector<unsigned char>& in)
+    {
+        const unsigned char* bytes = in.empty() ? 0 : &in[0];
+        return decode(out, w, h, state, bytes, in.size());
+    }
+
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Loads the named file with load_file and decodes its contents to raw pixels of
+    the requested color type and bit depth. Returns the load error if loading
+    failed, otherwise the result of decoding.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const std::string& filename,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        std::vector<unsigned char> filedata;
+        const unsigned loaderror = load_file(filedata, filename);
+        return loaderror ? loaderror : decode(out, w, h, filedata, colortype, bitdepth);
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /*
+    Encodes raw pixels of the given color type and bit depth to PNG and appends
+    the PNG bytes to out. Whatever buffer lodepng_encode_memory hands back is
+    appended and freed, whether or not an error was reported. Returns its error code.
+    */
+    unsigned encode(std::vector<unsigned char>& out, const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        unsigned char* png;
+        size_t pngsize;
+        const unsigned error = lodepng_encode_memory(&png, &pngsize, in, w, h, colortype, bitdepth);
+        if (!png) return error;
+
+        out.insert(out.end(), png, png + pngsize);
+        lodepng_free(png);
+        return error;
+    }
+
+    /*
+    Vector convenience overload. Returns 84 when in holds fewer bytes than a
+    w x h image of this color type and bit depth needs; otherwise forwards to
+    the pointer overload.
+    */
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        const size_t needed = lodepng_get_raw_size_lct(w, h, colortype, bitdepth);
+        if (needed > in.size()) return 84;
+
+        const unsigned char* pixels = in.empty() ? 0 : &in[0];
+        return encode(out, pixels, w, h, colortype, bitdepth);
+    }
+
+    /*
+    Encodes raw pixels to PNG with the settings held in state and appends the PNG
+    bytes to out. Whatever buffer lodepng_encode hands back is appended and
+    freed, whether or not an error was reported. Returns its error code.
+    */
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        State& state)
+    {
+        unsigned char* png;
+        size_t pngsize;
+        const unsigned error = lodepng_encode(&png, &pngsize, in, w, h, &state);
+        if (!png) return error;
+
+        out.insert(out.end(), png, png + pngsize);
+        lodepng_free(png);
+        return error;
+    }
+
+    /*
+    Vector convenience overload. Returns 84 when in holds fewer bytes than a
+    w x h image in the state.info_raw color mode needs; otherwise forwards to
+    the State-based pointer overload.
+    */
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        State& state)
+    {
+        const size_t needed = lodepng_get_raw_size(w, h, &state.info_raw);
+        if (needed > in.size()) return 84;
+
+        const unsigned char* pixels = in.empty() ? 0 : &in[0];
+        return encode(out, pixels, w, h, state);
+    }
+
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Encodes raw pixels to PNG in memory and, if that succeeded, saves the result
+    to the named file with save_file. Returns the error code of the first step
+    that failed, or 0.
+    */
+    unsigned encode(const std::string& filename,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        std::vector<unsigned char> png;
+        const unsigned encodeerror = encode(png, in, w, h, colortype, bitdepth);
+        if (encodeerror) return encodeerror;
+        return save_file(png, filename);
+    }
+
+    /*
+    Vector convenience overload. Returns 84 when in holds fewer bytes than a
+    w x h image of this color type and bit depth needs; otherwise forwards to
+    the file-writing pointer overload.
+    */
+    unsigned encode(const std::string& filename,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        const size_t needed = lodepng_get_raw_size_lct(w, h, colortype, bitdepth);
+        if (needed > in.size()) return 84;
+
+        const unsigned char* pixels = in.empty() ? 0 : &in[0];
+        return encode(filename, pixels, w, h, colortype, bitdepth);
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_PNG */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
\ No newline at end of file
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/lodepng/lodepng.h b/c_cxx/ort_tutorial/10_ep-device-selection/lodepng/lodepng.h
new file mode 100644
index 000000000..595312ca8
--- /dev/null
+++ b/c_cxx/ort_tutorial/10_ep-device-selection/lodepng/lodepng.h
@@ -0,0 +1,1762 @@
+/*
+LodePNG version 20170917
+
+Copyright (c) 2005-2017 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+
+#ifndef LODEPNG_H
+#define LODEPNG_H
+
+#include <string.h> /*for size_t*/
+
+extern const char* LODEPNG_VERSION_STRING;
+
+/*
+The following #defines are used to create code sections. They can be disabled
+to disable code sections, which can give faster compile time and smaller binary.
+The "NO_COMPILE" defines are designed to be used to pass as defines to the
+compiler command to disable them without modifying this header, e.g.
+-DLODEPNG_NO_COMPILE_ZLIB for gcc.
+In addition to those below, you can also define LODEPNG_NO_COMPILE_CRC to
+allow implementing a custom lodepng_crc32.
+*/
+/*deflate & zlib. If disabled, you must specify alternative zlib functions in
+the custom_zlib field of the compress and decompress settings*/
+#ifndef LODEPNG_NO_COMPILE_ZLIB
+#define LODEPNG_COMPILE_ZLIB
+#endif
+/*png encoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_PNG
+#define LODEPNG_COMPILE_PNG
+#endif
+/*deflate&zlib decoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_DECODER
+#define LODEPNG_COMPILE_DECODER
+#endif
+/*deflate&zlib encoder and png encoder*/
+#ifndef LODEPNG_NO_COMPILE_ENCODER
+#define LODEPNG_COMPILE_ENCODER
+#endif
+/*the optional built in harddisk file loading and saving functions*/
+#ifndef LODEPNG_NO_COMPILE_DISK
+#define LODEPNG_COMPILE_DISK
+#endif
+/*support for chunks other than IHDR, IDAT, PLTE, tRNS, IEND: ancillary and unknown chunks*/
+#ifndef LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
+#define LODEPNG_COMPILE_ANCILLARY_CHUNKS
+#endif
+/*ability to convert error numerical codes to English text string*/
+#ifndef LODEPNG_NO_COMPILE_ERROR_TEXT
+#define LODEPNG_COMPILE_ERROR_TEXT
+#endif
+/*Compile the default allocators (C's free, malloc and realloc). If you disable this,
+you can define the functions lodepng_free, lodepng_malloc and lodepng_realloc in your
+source files with custom allocators.*/
+#ifndef LODEPNG_NO_COMPILE_ALLOCATORS
+#define LODEPNG_COMPILE_ALLOCATORS
+#endif
+/*compile the C++ version (you can disable the C++ wrapper here even when compiling for C++)*/
+#ifdef __cplusplus
+#ifndef LODEPNG_NO_COMPILE_CPP
+#define LODEPNG_COMPILE_CPP
+#endif
+#endif
+
+#ifdef LODEPNG_COMPILE_CPP
+#include <vector>
+#include <string>
+#endif /*LODEPNG_COMPILE_CPP*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*The PNG color types (also used to describe raw pixel buffers).
+The comment on each value lists the bit depths that go with that color type.
+Note that the numeric values are not consecutive: 1 and 5 are not used.*/
+typedef enum LodePNGColorType
+{
+    LCT_GREY = 0, /*greyscale: 1,2,4,8,16 bit*/
+    LCT_RGB = 2, /*RGB: 8,16 bit*/
+    LCT_PALETTE = 3, /*palette: 1,2,4,8 bit*/
+    LCT_GREY_ALPHA = 4, /*greyscale with alpha: 8,16 bit*/
+    LCT_RGBA = 6 /*RGB with alpha: 8,16 bit*/
+} LodePNGColorType;
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Converts PNG data in memory to raw pixel data.
+Parameters:
+  out: Output parameter. Pointer to buffer that will contain the raw pixel data.
+    After decoding, its size is w * h * (bytes per pixel) bytes larger than
+    initially. Bytes per pixel depends on colortype and bitdepth.
+    Must be freed after usage with free(*out).
+    Note: for 16-bit per channel colors, uses big endian format like PNG does.
+  w: Output parameter. Pointer to width of pixel data.
+  h: Output parameter. Pointer to height of pixel data.
+  in: Memory buffer with the PNG file.
+  insize: size of the in buffer.
+  colortype: the desired color type for the raw output image. See explanation on PNG color types.
+  bitdepth: the desired bit depth for the raw output image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_memory, but takes no colortype/bitdepth arguments and
+always decodes to a 32-bit RGBA raw image.*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize);
+
+/*Same as lodepng_decode_memory, but takes no colortype/bitdepth arguments and
+always decodes to a 24-bit RGB raw image.*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load PNG from disk, from file with given name.
+Same as the other decode functions, but takes a filename as input instead of
+a memory buffer and its size.
+Only declared when LODEPNG_COMPILE_DISK is defined.
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_file, but always decodes to 32-bit RGBA raw image.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename);
+
+/*Same as lodepng_decode_file, but always decodes to 24-bit RGB raw image.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Converts raw pixel data into a PNG image in memory. The colortype and bitdepth
+of the output PNG image cannot be chosen, they are automatically determined
+by the colortype, bitdepth and content of the input pixel data.
+Note: for 16-bit per channel colors, needs big endian format like PNG does.
+Parameters:
+  out: Output parameter. Pointer to buffer that will contain the PNG image data.
+    Must be freed after usage with free(*out).
+  outsize: Output parameter. Pointer to the size in bytes of the out buffer.
+  image: The raw pixel data to encode. The size of this buffer should be
+    w * h * (bytes per pixel), bytes per pixel depends on colortype and bitdepth.
+  w: width of the raw pixel data in pixels.
+  h: height of the raw pixel data in pixels.
+  colortype: the color type of the raw input image. See explanation on PNG color types.
+  bitdepth: the bit depth of the raw input image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_memory, but takes no colortype/bitdepth arguments and
+always encodes from a 32-bit RGBA raw image.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_memory, but takes no colortype/bitdepth arguments and
+always encodes from a 24-bit RGB raw image.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts raw pixel data into a PNG file on disk.
+Same as the other encode functions, but takes a filename as output instead of
+an output buffer and size.
+Only declared when LODEPNG_COMPILE_DISK is defined.
+NOTE: This overwrites existing files without warning!
+*/
+unsigned lodepng_encode_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_file, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_file, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_DECODER
+    /*Same as lodepng_decode_memory, but decodes to an std::vector. The colortype
+    is the format to output the pixels to. Default is RGBA 8-bit per channel.
+    The second overload takes the PNG data as an std::vector instead of a
+    pointer and size.*/
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const unsigned char* in, size_t insize,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::vector<unsigned char>& in,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Converts PNG file from disk to raw pixel data in memory.
+    Same as the other decode functions, but instead takes a filename as input.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::string& filename,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /*Same as lodepng_encode_memory, but encodes to an std::vector. colortype
+    is that of the raw input data (default: RGBA 8-bit per channel). The output
+    PNG color type will be auto chosen.
+    The second overload takes the raw pixels as an std::vector instead of a
+    pointer.*/
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Converts raw pixel data (32-bit RGBA unless another colortype/bitdepth is
+    passed) into a PNG file on disk.
+    Same as the other encode functions, but instead takes a filename as output.
+    NOTE: This overwrites existing files without warning!
+    */
+    unsigned encode(const std::string& filename,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned encode(const std::string& filename,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*Returns an English description of the numerical error code.
+Only declared when LODEPNG_COMPILE_ERROR_TEXT is defined.*/
+const char* lodepng_error_text(unsigned code);
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Settings for zlib decompression*/
+typedef struct LodePNGDecompressSettings LodePNGDecompressSettings;
+struct LodePNGDecompressSettings
+{
+    unsigned ignore_adler32; /*if 1, continue and don't give an error message if the Adler32 checksum is corrupted*/
+
+    /*use custom zlib decoder instead of built in one (default: null)*/
+    unsigned(*custom_zlib)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGDecompressSettings*);
+    /*use custom inflate (deflate decoder) instead of built in one (default: null)
+    if custom_zlib is used, custom_inflate is ignored since only the built in
+    zlib function will call custom_inflate*/
+    unsigned(*custom_inflate)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGDecompressSettings*);
+
+    const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGDecompressSettings lodepng_default_decompress_settings; /*only declared here; the definition lives outside this header*/
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings); /*NOTE(review): presumably fills *settings with the default values - confirm in the implementation*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Settings for zlib compression. Tweaking these settings tweaks the balance
+between speed and compression ratio.
+*/
+typedef struct LodePNGCompressSettings LodePNGCompressSettings;
+struct LodePNGCompressSettings /*deflate = compress*/
+{
+    /*LZ77 related settings*/
+    unsigned btype; /*the block type for LZ (0, 1, 2 or 3, see zlib standard). Should be 2 for proper compression.*/
+    unsigned use_lz77; /*whether or not to use LZ77. Should be 1 for proper compression.*/
+    unsigned windowsize; /*must be a power of two <= 32768. higher compresses more but is slower. Default value: 2048.*/
+    unsigned minmatch; /*minimum lz77 length. 3 is normally best, 6 can be better for some PNGs. Default: 0*/
+    unsigned nicematch; /*stop searching if >= this length found. Set to 258 for best compression. Default: 128*/
+    unsigned lazymatching; /*use lazy matching: better compression but a bit slower. Default: true*/
+
+    /*use custom zlib encoder instead of built in one (default: null)*/
+    unsigned(*custom_zlib)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGCompressSettings*);
+    /*use custom deflate encoder instead of built in one (default: null)
+    if custom_zlib is used, custom_deflate is ignored since only the built in
+    zlib function will call custom_deflate*/
+    unsigned(*custom_deflate)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGCompressSettings*);
+
+    const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGCompressSettings lodepng_default_compress_settings; /*only declared here; the definition lives outside this header*/
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings); /*NOTE(review): presumably fills *settings with the default values - confirm in the implementation*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*
+Color mode of an image. Contains all information required to decode the pixel
+bits to RGBA colors. This information is the same as used in the PNG file
+format, and is used both for PNG and raw image data in LodePNG.
+*/
+typedef struct LodePNGColorMode
+{
+    /*header (IHDR)*/
+    LodePNGColorType colortype; /*color type, see PNG standard or documentation further in this header file*/
+    unsigned bitdepth;  /*bits per sample, see PNG standard or documentation further in this header file*/
+
+    /*
+    palette (PLTE and tRNS)
+
+    Dynamically allocated with the colors of the palette, including alpha.
+    When encoding a PNG, to store your colors in the palette of the LodePNGColorMode, first use
+    lodepng_palette_clear, then for each color use lodepng_palette_add.
+    If you encode an image without alpha with palette, don't forget to put value 255 in each A byte of the palette.
+
+    When decoding, by default you can ignore this palette, since LodePNG already
+    fills the palette colors in the pixels of the raw RGBA output.
+
+    The palette is only supported for color type 3.
+    */
+    unsigned char* palette; /*palette in RGBARGBA... order. When allocated, must be either 0, or have size 1024*/
+    size_t palettesize; /*palette size in number of colors (amount of bytes is 4 * palettesize)*/
+
+    /*
+    transparent color key (tRNS)
+
+    This color uses the same bit depth as the bitdepth value in this struct, which can be 1-bit to 16-bit.
+    For greyscale PNGs, r, g and b will all 3 be set to the same.
+
+    When decoding, by default you can ignore this information, since LodePNG sets
+    pixels with this key to transparent already in the raw RGBA output.
+
+    The color key is only supported for color types 0 and 2.
+    */
+    unsigned key_defined; /*is a transparent color key given? 0 = false, 1 = true*/
+    unsigned key_r;       /*red/greyscale component of color key*/
+    unsigned key_g;       /*green component of color key*/
+    unsigned key_b;       /*blue component of color key*/
+} LodePNGColorMode;
+
+/*init, cleanup and copy functions to use with LodePNGColorMode*/
+void lodepng_color_mode_init(LodePNGColorMode* info);
+void lodepng_color_mode_cleanup(LodePNGColorMode* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source);
+
+void lodepng_palette_clear(LodePNGColorMode* info); /*use this first, before adding palette colors with lodepng_palette_add*/
+/*add 1 color to the palette*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a);
+
+/*get the total amount of bits per pixel, based on colortype and bitdepth in the struct*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info);
+/*get the amount of color channels used, based on colortype in the struct.
+If a palette is used, it counts as 1 channel.*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info);
+/*is it a greyscale type? (only colortype 0 or 4)*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info);
+/*has it got an alpha channel? (only colortype 4 or 6, i.e. LCT_GREY_ALPHA or LCT_RGBA)*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info);
+/*has it got a palette? (only colortype 3)*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info);
+/*only returns true if there is a palette and there is a value in the palette with alpha < 255.
+Loops through the palette to check this.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info);
+/*
+Check if the given color info indicates the possibility of having non-opaque pixels in the PNG image.
+Returns true if the image can have translucent or invisible pixels (it may still be opaque if it doesn't use such pixels).
+Returns false if the image can only have opaque pixels.
+In detail, it returns true only if it's a color type with alpha, or has a palette with non-opaque values,
+or if "key_defined" is true.
+*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info);
+/*Returns the byte size of a raw image buffer with given width, height and color mode*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*The information of a Time chunk in PNG.
+Only declared when LODEPNG_COMPILE_ANCILLARY_CHUNKS is defined.
+The comment on each field gives its range.*/
+typedef struct LodePNGTime
+{
+    unsigned year;    /*2 bytes used (0-65535)*/
+    unsigned month;   /*1-12*/
+    unsigned day;     /*1-31*/
+    unsigned hour;    /*0-23*/
+    unsigned minute;  /*0-59*/
+    unsigned second;  /*0-60 (to allow for leap seconds)*/
+} LodePNGTime;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Information about the PNG image, except pixels, width and height.
+Every field after "color" only exists when LODEPNG_COMPILE_ANCILLARY_CHUNKS is defined.*/
+typedef struct LodePNGInfo
+{
+    /*header (IHDR), palette (PLTE) and transparency (tRNS) chunks*/
+    unsigned compression_method;/*compression method of the original file. Always 0.*/
+    unsigned filter_method;     /*filter method of the original file*/
+    unsigned interlace_method;  /*interlace method of the original file*/
+    LodePNGColorMode color;     /*color type and bits, palette and transparency of the PNG file*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*
+    suggested background color chunk (bKGD)
+    This color uses the same color mode as the PNG (except alpha channel), which can be 1-bit to 16-bit.
+
+    For greyscale PNGs, r, g and b will all 3 be set to the same. When encoding
+    the encoder writes the red one. For palette PNGs: When decoding, the RGB value
+    will be stored, not a palette index. But when encoding, specify the index of
+    the palette in background_r, the other two are then ignored.
+
+    The decoder does not use this background color to edit the color of pixels.
+    */
+    unsigned background_defined; /*is a suggested background color given?*/
+    unsigned background_r;       /*red component of suggested background color*/
+    unsigned background_g;       /*green component of suggested background color*/
+    unsigned background_b;       /*blue component of suggested background color*/
+
+    /*
+    non-international text chunks (tEXt and zTXt)
+
+    The char** arrays each contain num strings. The actual messages are in
+    text_strings, while text_keys are keywords that give a short description what
+    the actual text represents, e.g. Title, Author, Description, or anything else.
+
+    A keyword is minimum 1 character and maximum 79 characters long. It's
+    discouraged to use a single line length longer than 79 characters for texts.
+
+    Don't allocate these text buffers yourself. Use the init/cleanup functions
+    correctly and use lodepng_add_text and lodepng_clear_text.
+    */
+    size_t text_num; /*the amount of texts in these char** buffers (there may be more texts in itext)*/
+    char** text_keys; /*the keyword of a text chunk (e.g. "Comment")*/
+    char** text_strings; /*the actual text*/
+
+    /*
+    international text chunks (iTXt)
+    Similar to the non-international text chunks, but with additional strings
+    "langtags" and "transkeys".
+    */
+    size_t itext_num; /*the amount of international texts in this PNG*/
+    char** itext_keys; /*the English keyword of the text chunk (e.g. "Comment")*/
+    char** itext_langtags; /*language tag for this text's language, ISO/IEC 646 string, e.g. ISO 639 language tag*/
+    char** itext_transkeys; /*keyword translated to the international language - UTF-8 string*/
+    char** itext_strings; /*the actual international text - UTF-8 string*/
+
+    /*time chunk (tIME)*/
+    unsigned time_defined; /*set to 1 to make the encoder generate a tIME chunk*/
+    LodePNGTime time;
+
+    /*phys chunk (pHYs)*/
+    unsigned phys_defined; /*if 0, there is no pHYs chunk and the values below are undefined; if 1, there is one*/
+    unsigned phys_x; /*pixels per unit in x direction*/
+    unsigned phys_y; /*pixels per unit in y direction*/
+    unsigned phys_unit; /*may be 0 (unknown unit) or 1 (metre)*/
+
+    /*
+    unknown chunks
+    There are 3 buffers, one for each position in the PNG where unknown chunks can appear.
+    Each buffer contains all unknown chunks for that position consecutively.
+    The 3 buffers are the unknown chunks between certain critical chunks:
+    0: IHDR-PLTE, 1: PLTE-IDAT, 2: IDAT-IEND
+    Do not allocate or traverse this data yourself. Use the chunk traversing functions declared
+    later, such as lodepng_chunk_next and lodepng_chunk_append, to read/write this struct.
+    */
+    unsigned char* unknown_chunks_data[3];
+    size_t unknown_chunks_size[3]; /*size in bytes of the unknown chunks, given for protection*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGInfo;
+
+/*init, cleanup and copy functions to use with LodePNGInfo*/
+void lodepng_info_init(LodePNGInfo* info);
+void lodepng_info_cleanup(LodePNGInfo* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+void lodepng_clear_text(LodePNGInfo* info); /*use this to clear the texts again after you filled them in*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str); /*push back both texts (keyword and string) at once*/
+
+void lodepng_clear_itext(LodePNGInfo* info); /*use this to clear the itexts again after you filled them in*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+    const char* transkey, const char* str); /*push back the 4 texts (key, langtag, transkey, string) of 1 chunk at once*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Converts raw buffer from one color type to another color type, based on
+LodePNGColorMode structs to describe the input and output color type.
+See the reference manual at the end of this header file to see which color conversions are supported.
+The out buffer must have size (w * h * bpp + 7) / 8, where bpp is the bits per pixel
+of the output color type (lodepng_get_bpp).
+For < 8 bpp images, there should not be padding bits at the end of scanlines.
+For 16-bit per channel colors, uses big endian format like PNG does.
+Return value: LodePNG error code (0 if all went ok, an error if the conversion isn't supported).
+*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+    const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+    unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Settings for the decoder. This contains settings for the PNG and the Zlib
+decoder, but not the Info settings from the Info structs.
+*/
+typedef struct LodePNGDecoderSettings
+{
+    LodePNGDecompressSettings zlibsettings; /*in here is the setting to ignore Adler32 checksums*/
+
+    unsigned ignore_crc; /*ignore CRC checksums*/
+
+    unsigned color_convert; /*whether to convert the PNG to the color type you want. Default: yes*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    unsigned read_text_chunks; /*if false but remember_unknown_chunks is true, the text chunks are stored in the unknown chunks*/
+    /*store all bytes from unknown chunks in the LodePNGInfo (off by default, useful for a png editor)*/
+    unsigned remember_unknown_chunks;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGDecoderSettings;
+
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings); /*NOTE(review): presumably fills *settings with the default values - confirm in the implementation*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Strategies for choosing the filter type of each scanline when encoding.
+Selected through the filter_strategy field of LodePNGEncoderSettings.*/
+typedef enum LodePNGFilterStrategy
+{
+    /*every filter at zero*/
+    LFS_ZERO,
+    /*Use filter that gives minimum sum, as described in the official PNG filter heuristic.*/
+    LFS_MINSUM,
+    /*Use the filter type that gives smallest Shannon entropy for this scanline. Depending
+    on the image, this is better or worse than minsum.*/
+    LFS_ENTROPY,
+    /*
+    Brute-force-search PNG filters by compressing each filter for each scanline.
+    Experimental, very slow, and only rarely gives better compression than MINSUM.
+    */
+    LFS_BRUTE_FORCE,
+    /*use predefined_filters buffer: you specify the filter type for each scanline*/
+    LFS_PREDEFINED
+} LodePNGFilterStrategy;
+
+/*Gives characteristics about the colors of the image, which helps decide which color model to use for encoding.
+Used internally by default if "auto_convert" is enabled. Public because it's useful for custom algorithms.
+Filled in by lodepng_get_color_profile.*/
+typedef struct LodePNGColorProfile
+{
+    unsigned colored; /*not greyscale*/
+    unsigned key; /*image is not opaque and color key is possible instead of full alpha*/
+    unsigned short key_r; /*key values, always as 16-bit, in 8-bit case the byte is duplicated, e.g. 65535 means 255*/
+    unsigned short key_g;
+    unsigned short key_b;
+    unsigned alpha; /*image is not opaque and alpha channel or alpha palette required*/
+    unsigned numcolors; /*amount of colors, up to 257. Not valid if bits == 16.*/
+    unsigned char palette[1024]; /*Remembers up to the first 256 RGBA colors, in no particular order*/
+    unsigned bits; /*bits per channel (not for palette). 1,2 or 4 for greyscale only. 16 if 16-bit per channel required.*/
+} LodePNGColorProfile;
+
+void lodepng_color_profile_init(LodePNGColorProfile* profile); /*NOTE(review): presumably resets *profile to its initial state - confirm in the implementation*/
+
+/*Get a LodePNGColorProfile of the image.
+image, w and h give the raw pixel data and its dimensions; mode_in is the color mode of that raw data.
+NOTE(review): the unsigned return value is presumably a LodePNG error code (0 means no error) - confirm.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in);
+/*The function LodePNG uses internally to decide the PNG color with auto_convert.
+Chooses an optimal color model, e.g. grey if only grey pixels, palette if < 256 colors, ...
+The chosen color model is written to mode_out.*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in);
+
+/*Settings for the encoder.*/
+typedef struct LodePNGEncoderSettings
+{
+    LodePNGCompressSettings zlibsettings; /*settings for the zlib encoder, such as window size, ...*/
+
+    unsigned auto_convert; /*automatically choose output PNG color type. Default: true*/
+
+    /*If true, follows the official PNG heuristic: if the PNG uses a palette or lower than
+    8 bit depth, set all filters to zero. Otherwise use the filter_strategy. Note that to
+    completely follow the official PNG heuristic, filter_palette_zero must be true and
+    filter_strategy must be LFS_MINSUM*/
+    unsigned filter_palette_zero;
+    /*Which filter strategy to use when not using zeroes due to filter_palette_zero.
+    Set filter_palette_zero to 0 to ensure always using your chosen strategy. Default: LFS_MINSUM*/
+    LodePNGFilterStrategy filter_strategy;
+    /*used if filter_strategy is LFS_PREDEFINED. In that case, this must point to a buffer with
+    the same length as the amount of scanlines in the image, and each value must be <= 5. You
+    have to cleanup this buffer, LodePNG will never free it. Don't forget that filter_palette_zero
+    must be set to 0 to ensure this is also used on palette or low bitdepth images.*/
+    const unsigned char* predefined_filters;
+
+    /*force creating a PLTE chunk if colortype is 2 or 6 (= a suggested palette).
+    If colortype is 3, PLTE is _always_ created.*/
+    unsigned force_palette;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*add LodePNG identifier and version as a text chunk, for debugging*/
+    unsigned add_id;
+    /*encode text chunks as zTXt chunks instead of tEXt chunks, and use compression in iTXt chunks*/
+    unsigned text_compression;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGEncoderSettings;
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings); /*NOTE(review): presumably fills *settings with the default values - confirm in the implementation*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+/*The settings, state and information for extended encoding and decoding.
+The decoder and encoder members only exist when LODEPNG_COMPILE_DECODER and
+LODEPNG_COMPILE_ENCODER respectively are defined.*/
+typedef struct LodePNGState
+{
+#ifdef LODEPNG_COMPILE_DECODER
+    LodePNGDecoderSettings decoder; /*the decoding settings*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+    LodePNGEncoderSettings encoder; /*the encoding settings*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+    LodePNGColorMode info_raw; /*specifies the format in which you would like to get the raw pixel buffer*/
+    LodePNGInfo info_png; /*info of the PNG image obtained after decoding*/
+    unsigned error;
+#ifdef LODEPNG_COMPILE_CPP
+    /* For the lodepng::State subclass. Only compiled when LODEPNG_COMPILE_CPP
+    is defined, so the struct has a virtual destructor only in that configuration. */
+    virtual ~LodePNGState() {}
+#endif
+} LodePNGState;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_state_init(LodePNGState* state);
+void lodepng_state_cleanup(LodePNGState* state);
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source);
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Same as lodepng_decode_memory, but uses a LodePNGState to allow custom settings and
+getting much more information about the PNG image and color mode.
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize);
+
+/*
+Read the PNG header, but not the actual data. This returns only the information
+that is in the header chunk of the PNG, such as width, height and color type. The
+information is placed in the info_png field of the LodePNGState.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*This function allocates the out buffer with standard malloc and stores the size in *outsize.*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGState* state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*
+The lodepng_chunk functions are normally not needed, except to traverse the
+unknown chunks stored in the LodePNGInfo struct, or add new ones to it.
+It also allows traversing the chunks of an encoded PNG file yourself.
+
+PNG standard chunk naming conventions:
+First byte: uppercase = critical, lowercase = ancillary
+Second byte: uppercase = public, lowercase = private
+Third byte: must be uppercase
+Fourth byte: uppercase = unsafe to copy, lowercase = safe to copy
+*/
+
+/*
+Gets the length of the data of the chunk. Total chunk length has 12 bytes more.
+There must be at least 4 bytes to read from. If the result value is too large,
+it may be corrupt data.
+*/
+unsigned lodepng_chunk_length(const unsigned char* chunk);
+
+/*puts the 4-byte type in null terminated string*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk);
+
+/*check if the type is the given type*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type);
+
+/*0: it's one of the critical chunk types, 1: it's an ancillary chunk (see PNG standard)*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk);
+
+/*0: public, 1: private (see PNG standard)*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk);
+
+/*0: the chunk is unsafe to copy, 1: the chunk is safe to copy (see PNG standard)*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk);
+
+/*get pointer to the data of the chunk, where the input points to the header of the chunk*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk);
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk);
+
+/*returns 0 if the crc is correct, 1 if it's incorrect (0 for OK as usual!)*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk);
+
+/*generates the correct CRC from the data and puts it in the last 4 bytes of the chunk*/
+void lodepng_chunk_generate_crc(unsigned char* chunk);
+
+/*iterate to next chunks. don't use on IEND chunk, as there is no next chunk then*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk);
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk);
+
+/*
+Appends chunk to the data in out. The given chunk should already have its chunk header.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk);
+
+/*
+Appends new chunk to out. The chunk to append is given by giving its length, type
+and data separately. The type is a 4-letter string.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+    const char* type, const unsigned char* data);
+
+
+/*Calculate CRC32 of buffer*/
+unsigned lodepng_crc32(const unsigned char* buf, size_t len);
+#endif /*LODEPNG_COMPILE_PNG*/
+
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*
+This zlib part can be used independently to zlib compress and decompress a
+buffer. It cannot be used to create gzip files however, and it only supports the
+part of zlib that is required for PNG, it does not support dictionaries.
+*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Inflate a buffer. Inflate is the decompression step of deflate. Out buffer must be freed after use.*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings);
+
+/*
+Decompresses Zlib data. Reallocates the out buffer and appends the data. The
+data must be according to the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Compresses data with Zlib. Reallocates the out buffer and appends the data.
+Zlib adds a small header and trailer around the deflate data.
+The data is output in the format of the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings);
+
+/*
+Find length-limited Huffman code for given frequencies. This function is in the
+public interface only for tests, it's used internally by lodepng_deflate.
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+    size_t numcodes, unsigned maxbitlen);
+
+/*Compress a buffer with deflate. See RFC 1951. Out buffer must be freed after use.*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings);
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load a file from disk into buffer. The function allocates the out buffer, and
+after usage you should free it.
+out: output parameter, contains pointer to loaded buffer.
+outsize: output parameter, size of the allocated out buffer
+filename: the path to the file to load
+return value: error code (0 means ok)
+*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename);
+
+/*
+Save a file from buffer to disk. Warning, if it exists, this function overwrites
+the file without warning!
+buffer: the buffer to write
+buffersize: size of the buffer to write
+filename: the path to the file to save to
+return value: error code (0 means ok)
+*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+
+#ifdef LODEPNG_COMPILE_CPP
+/* The LodePNG C++ wrapper: these declarations take std::vector buffers (and std::string filenames) instead of manually allocated memory buffers. */
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_PNG
+    class State : public LodePNGState /* a LodePNGState that declares its own constructor, copy operations and destructor */
+    {
+    public:
+        State();
+        State(const State& other);
+        virtual ~State();
+        State& operator=(const State& other);
+    };
+
+#ifdef LODEPNG_COMPILE_DECODER
+    /* Same as other lodepng::decode, but using a State for more settings and information. Two overloads: the PNG input is given as pointer + size, or as a std::vector. */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const unsigned char* in, size_t insize);
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const std::vector<unsigned char>& in);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /* Same as other lodepng::encode, but using a State for more settings and information. Two overloads: the raw image input is given as a pointer, or as a std::vector. */
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        State& state);
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        State& state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Load the file at filename from disk into buffer (an std::vector).
+    return value: error code (0 means ok)
+    */
+    unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename);
+
+    /*
+    Save the binary data in an std::vector to the file at filename on disk. The file is overwritten
+    without warning. NOTE(review): the return value is presumably an error code (0 means ok), as for load_file - confirm.
+    */
+    unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_PNG */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+    /* Zlib-decompress an unsigned char buffer into out; settings defaults to lodepng_default_decompress_settings */
+    unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+
+    /* Zlib-decompress an std::vector into out; settings defaults to lodepng_default_decompress_settings */
+    unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /* Zlib-compress an unsigned char buffer into out; settings defaults to lodepng_default_compress_settings */
+    unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+
+    /* Zlib-compress an std::vector into out; settings defaults to lodepng_default_compress_settings */
+    unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+
+  /*
+  TODO:
+  [.] test if there are no memory leaks or security exploits - done a lot but needs to be checked often
+  [.] check compatibility with various compilers  - done but needs to be redone for every newer version
+  [X] converting color to 16-bit per channel types
+  [ ] read all public PNG chunk types (but never let the color profile and gamma ones touch RGB values)
+  [ ] make sure encoder generates no chunks with size > (2^31)-1
+  [ ] partial decoding (stream processing)
+  [X] let the "isFullyOpaque" function check color keys and transparent palettes too
+  [X] better name for the variables "codes", "codesD", "codelengthcodes", "clcl" and "lldl"
+  [ ] don't stop decoding on errors like 69, 57, 58 (make warnings)
+  [ ] make warnings like: oob palette, checksum fail, data after iend, wrong/unknown crit chunk, no null terminator in text, ...
+  [ ] let the C++ wrapper catch exceptions coming from the standard library and return LodePNG error codes
+  [ ] allow user to provide custom color conversion functions, e.g. for premultiplied alpha, padding bits or not, ...
+  [ ] allow user to give data (void*) to custom allocator
+  */
+
+#endif /*LODEPNG_H inclusion guard*/
+
+  /*
+  LodePNG Documentation
+  ---------------------
+
+  0. table of contents
+  --------------------
+
+  1. about
+  1.1. supported features
+  1.2. features not supported
+  2. C and C++ version
+  3. security
+  4. decoding
+  5. encoding
+  6. color conversions
+  6.1. PNG color types
+  6.2. color conversions
+  6.3. padding bits
+  6.4. A note about 16-bits per channel and endianness
+  7. error values
+  8. chunks and PNG editing
+  9. compiler support
+  10. examples
+  10.1. decoder C++ example
+  10.2. decoder C example
+  11. state settings reference
+  12. changes
+  13. contact information
+
+
+  1. about
+  --------
+
+  PNG is a file format to store raster images losslessly with good compression,
+  supporting different color types and alpha channel.
+
+  LodePNG is a PNG codec according to the Portable Network Graphics (PNG)
+  Specification (Second Edition) - W3C Recommendation 10 November 2003.
+
+  The specifications used are:
+
+  *) Portable Network Graphics (PNG) Specification (Second Edition):
+  http://www.w3.org/TR/2003/REC-PNG-20031110
+  *) RFC 1950 ZLIB Compressed Data Format version 3.3:
+  http://www.gzip.org/zlib/rfc-zlib.html
+  *) RFC 1951 DEFLATE Compressed Data Format Specification ver 1.3:
+  http://www.gzip.org/zlib/rfc-deflate.html
+
+  The most recent version of LodePNG can currently be found at
+  http://lodev.org/lodepng/
+
+  LodePNG works both in C (ISO C90) and C++, with a C++ wrapper that adds
+  extra functionality.
+
+  LodePNG consists of two files:
+  -lodepng.h: the header file for both C and C++
+  -lodepng.c(pp): give it the name lodepng.c or lodepng.cpp (or .cc) depending on your usage
+
+  If you want to start using LodePNG right away without reading this doc, get the
+  examples from the LodePNG website to see how to use it in code, or check the
+  smaller examples in chapter 10 here.
+
+  LodePNG is simple but only supports the basic requirements. To achieve
+  simplicity, the following design choices were made: There are no dependencies
+  on any external library. There are functions to decode and encode a PNG with
+  a single function call, and extended versions of these functions taking a
+  LodePNGState struct allowing to specify or get more information. By default
+  the colors of the raw image are always RGB or RGBA, no matter what color type
+  the PNG file uses. To read and write files, there are simple functions to
+  convert the files to/from buffers in memory.
+
+  This all makes LodePNG suitable for loading textures in games, demos and small
+  programs, ... It's less suitable for full fledged image editors, loading PNGs
+  over network (it requires all the image data to be available before decoding can
+  begin), life-critical systems, ...
+
+  1.1. supported features
+  -----------------------
+
+  The following features are supported by LodePNG:
+
+  *) decoding of PNGs with any color type, bit depth and interlace mode, to a 24- or 32-bit color raw image,
+  or the same color type as the PNG
+  *) encoding of PNGs, from any raw image to 24- or 32-bit color, or the same color type as the raw image
+  *) Adam7 interlace and deinterlace for any color type
+  *) loading the image from harddisk or decoding it from a buffer from other sources than harddisk
+  *) support for alpha channels, including RGBA color model, translucent palettes and color keying
+  *) zlib decompression (inflate)
+  *) zlib compression (deflate)
+  *) CRC32 and ADLER32 checksums
+  *) handling of unknown chunks, allowing making a PNG editor that stores custom and unknown chunks.
+  *) the following chunks are supported (generated/interpreted) by both encoder and decoder:
+  IHDR: header information
+  PLTE: color palette
+  IDAT: pixel data
+  IEND: the final chunk
+  tRNS: transparency for palettized images
+  tEXt: textual information
+  zTXt: compressed textual information
+  iTXt: international textual information
+  bKGD: suggested background color
+  pHYs: physical dimensions
+  tIME: modification time
+
+  1.2. features not supported
+  ---------------------------
+
+  The following features are _not_ supported:
+
+  *) some features needed to make a conformant PNG-Editor might be still missing.
+  *) partial loading/stream processing. All data must be available and is processed in one call.
+  *) The following public chunks are not supported but treated as unknown chunks by LodePNG
+  cHRM, gAMA, iCCP, sRGB, sBIT, hIST, sPLT
+  Some of these are not supported on purpose: LodePNG wants to provide the RGB values
+  stored in the pixels, not values modified by system dependent gamma or color models.
+
+
+  2. C and C++ version
+  --------------------
+
+  The C version uses buffers allocated with alloc that you need to free()
+  yourself. You need to use init and cleanup functions for each struct whenever
+  using a struct from the C version to avoid exploits and memory leaks.
+
+  The C++ version has extra functions with std::vectors in the interface and the
+  lodepng::State class which is a LodePNGState with constructor and destructor.
+
+  These files work without modification for both C and C++ compilers because all
+  the additional C++ code is in "#ifdef __cplusplus" blocks that make C-compilers
+  ignore it, and the C code is made to compile both with strict ISO C90 and C++.
+
+  To use the C++ version, you need to rename the source file to lodepng.cpp
+  (instead of lodepng.c), and compile it with a C++ compiler.
+
+  To use the C version, you need to rename the source file to lodepng.c (instead
+  of lodepng.cpp), and compile it with a C compiler.
+
+
+  3. Security
+  -----------
+
+  Even if carefully designed, it's always possible that LodePNG contains possible
+  exploits. If you discover one, please let me know, and it will be fixed.
+
+  When using LodePNG, care has to be taken with the C version of LodePNG, as well
+  as the C-style structs when working with C++. The following conventions are used
+  for all C-style structs:
+
+  -if a struct has a corresponding init function, always call the init function when making a new one
+  -if a struct has a corresponding cleanup function, call it before the struct disappears to avoid memory leaks
+  -if a struct has a corresponding copy function, use the copy function instead of "=".
+  The destination must also be inited already.
+
+
+  4. Decoding
+  -----------
+
+  Decoding converts a PNG compressed image to a raw pixel buffer.
+
+  Most documentation on using the decoder is at its declarations in the header
+  above. For C, simple decoding can be done with functions such as
+  lodepng_decode32, and more advanced decoding can be done with the struct
+  LodePNGState and lodepng_decode. For C++, all decoding can be done with the
+  various lodepng::decode functions, and lodepng::State can be used for advanced
+  features.
+
+  When using the LodePNGState, it uses the following fields for decoding:
+  *) LodePNGInfo info_png: it stores extra information about the PNG (the input) in here
+  *) LodePNGColorMode info_raw: here you can say what color mode of the raw image (the output) you want to get
+  *) LodePNGDecoderSettings decoder: you can specify a few extra settings for the decoder to use
+
+  LodePNGInfo info_png
+  --------------------
+
+  After decoding, this contains extra information of the PNG image, except the actual
+  pixels, width and height because these are already gotten directly from the decoder
+  functions.
+
+  It contains for example the original color type of the PNG image, text comments,
+  suggested background color, etc... More details about the LodePNGInfo struct are
+  at its declaration documentation.
+
+  LodePNGColorMode info_raw
+  -------------------------
+
+  When decoding, here you can specify which color type you want
+  the resulting raw image to be. If this is different from the colortype of the
+  PNG, then the decoder will automatically convert the result. This conversion
+  always works, except if you want it to convert a color PNG to greyscale or to
+  a palette with missing colors.
+
+  By default, 32-bit color is used for the result.
+
+  LodePNGDecoderSettings decoder
+  ------------------------------
+
+  The settings can be used to ignore the errors created by invalid CRC and Adler32
+  checksums, and to disable the decoding of tEXt chunks.
+
+  There's also a setting color_convert, true by default. If false, no conversion
+  is done, the resulting data will be as it was in the PNG (after decompression)
+  and you'll have to puzzle the colors of the pixels together yourself using the
+  color type information in the LodePNGInfo.
+
+
+  5. Encoding
+  -----------
+
+  Encoding converts a raw pixel buffer to a PNG compressed image.
+
+  Most documentation on using the encoder is at its declarations in the header
+  above. For C, simple encoding can be done with functions such as
+  lodepng_encode32, and more advanced encoding can be done with the struct
+  LodePNGState and lodepng_encode. For C++, all encoding can be done with the
+  various lodepng::encode functions, and lodepng::State can be used for advanced
+  features.
+
+  Like the decoder, the encoder can also give errors. However, it gives fewer errors
+  since the encoder input is trusted, the decoder input (a PNG image that could
+  be forged by anyone) is not trusted.
+
+  When using the LodePNGState, it uses the following fields for encoding:
+  *) LodePNGInfo info_png: here you specify how you want the PNG (the output) to be.
+  *) LodePNGColorMode info_raw: here you say what color type the raw image (the input) has
+  *) LodePNGEncoderSettings encoder: you can specify a few settings for the encoder to use
+
+  LodePNGInfo info_png
+  --------------------
+
+  When encoding, you use this the opposite way as when decoding: for encoding,
+  you fill in the values you want the PNG to have before encoding. By default it's
+  not needed to specify a color type for the PNG since it's automatically chosen,
+  but it's possible to choose it yourself given the right settings.
+
+  The encoder will not always exactly match the LodePNGInfo struct you give,
+  it tries as close as possible. Some things are ignored by the encoder. The
+  encoder uses, for example, the following settings from it when applicable:
+  colortype and bitdepth, text chunks, time chunk, the color key, the palette, the
+  background color, the interlace method, unknown chunks, ...
+
+  When encoding to a PNG with colortype 3, the encoder will generate a PLTE chunk.
+  If the palette contains any colors for which the alpha channel is not 255 (so
+  there are translucent colors in the palette), it'll add a tRNS chunk.
+
+  LodePNGColorMode info_raw
+  -------------------------
+
+  You specify the color type of the raw image that you give to the input here,
+  including a possible transparent color key and palette you happen to be using in
+  your raw image data.
+
+  By default, 32-bit color is assumed, meaning your input has to be in RGBA
+  format with 4 bytes (unsigned chars) per pixel.
+
+  LodePNGEncoderSettings encoder
+  ------------------------------
+
+  The following settings are supported (some are in sub-structs):
+  *) auto_convert: when this option is enabled, the encoder will
+  automatically choose the smallest possible color mode (including color key) that
+  can encode the colors of all pixels without information loss.
+  *) btype: the block type for LZ77. 0 = uncompressed, 1 = fixed huffman tree,
+  2 = dynamic huffman tree (best compression). Should be 2 for proper
+  compression.
+  *) use_lz77: whether or not to use LZ77 for compressed block types. Should be
+  true for proper compression.
+  *) windowsize: the window size used by the LZ77 encoder (1 - 32768). Has value
+  2048 by default, but can be set to 32768 for better, but slow, compression.
+  *) force_palette: if colortype is 2 or 6, you can make the encoder write a PLTE
+  chunk if force_palette is true. This can be used as a suggested palette to convert
+  to by viewers that don't support more than 256 colors (if those still exist)
+  *) add_id: add text chunk "Encoder: LodePNG <version>" to the image.
+  *) text_compression: default 1. If 1, it'll store texts as zTXt instead of tEXt chunks.
+  zTXt chunks use zlib compression on the text. This gives a smaller result on
+  large texts but a larger result on small texts (such as a single program name).
+  It's all tEXt or all zTXt though, there's no separate setting per text yet.
+
+
+  6. color conversions
+  --------------------
+
+  An important thing to note about LodePNG, is that the color type of the PNG, and
+  the color type of the raw image, are completely independent. By default, when
+  you decode a PNG, you get the result as a raw image in the color type you want,
+  no matter whether the PNG was encoded with a palette, greyscale or RGBA color.
+  And if you encode an image, by default LodePNG will automatically choose the PNG
+  color type that gives good compression based on the values of colors and amount
+  of colors in the image. It can be configured to let you control it instead as
+  well, though.
+
+  To be able to do this, LodePNG does conversions from one color mode to another.
+  It can convert from almost any color type to any other color type, except the
+  following conversions: RGB to greyscale is not supported, and converting to a
+  palette when the palette doesn't have a required color is not supported. This is
+  not supported on purpose: this is information loss which requires a color
+  reduction algorithm that is beyond the scope of a PNG encoder (yes, RGB to grey
+  is easy, but there are multiple ways if you want to give some channels more
+  weight).
+
+  By default, when decoding, you get the raw image in 32-bit RGBA or 24-bit RGB
+  color, no matter what color type the PNG has. And by default when encoding,
+  LodePNG automatically picks the best color model for the output PNG, and expects
+  the input image to be 32-bit RGBA or 24-bit RGB. So, unless you want to control
+  the color format of the images yourself, you can skip this chapter.
+
+  6.1. PNG color types
+  --------------------
+
+  A PNG image can have many color types, ranging from 1-bit color to 64-bit color,
+  as well as palettized color modes. After the zlib decompression and unfiltering
+  in the PNG image is done, the raw pixel data will have that color type and thus
+  a certain amount of bits per pixel. If you want the output raw image after
+  decoding to have another color type, a conversion is done by LodePNG.
+
+  The PNG specification gives the following color types:
+
+  0: greyscale, bit depths 1, 2, 4, 8, 16
+  2: RGB, bit depths 8 and 16
+  3: palette, bit depths 1, 2, 4 and 8
+  4: greyscale with alpha, bit depths 8 and 16
+  6: RGBA, bit depths 8 and 16
+
+  Bit depth is the amount of bits per pixel per color channel. So the total amount
+  of bits per pixel is: amount of channels * bitdepth.
+
+  6.2. color conversions
+  ----------------------
+
+  As explained in the sections about the encoder and decoder, you can specify
+  color types and bit depths in info_png and info_raw to change the default
+  behaviour.
+
+  If, when decoding, you want the raw image to be something else than the default,
+  you need to set the color type and bit depth you want in the LodePNGColorMode,
+  or the parameters colortype and bitdepth of the simple decoding function.
+
+  If, when encoding, you use another color type than the default in the raw input
+  image, you need to specify its color type and bit depth in the LodePNGColorMode
+  of the raw image, or use the parameters colortype and bitdepth of the simple
+  encoding function.
+
+  If, when encoding, you don't want LodePNG to choose the output PNG color type
+  but control it yourself, you need to set auto_convert in the encoder settings
+  to false, and specify the color type you want in the LodePNGInfo of the
+  encoder (including palette: it can generate a palette if auto_convert is true,
+  otherwise not).
+
+  If the input and output color type differ (whether user chosen or auto chosen),
+  LodePNG will do a color conversion, which follows the rules below, and may
+  sometimes result in an error.
+
+  To avoid some confusion:
+  -the decoder converts from PNG to raw image
+  -the encoder converts from raw image to PNG
+  -the colortype and bitdepth in LodePNGColorMode info_raw, are those of the raw image
+  -the colortype and bitdepth in the color field of LodePNGInfo info_png, are those of the PNG
+  -when encoding, the color type in LodePNGInfo is ignored if auto_convert
+  is enabled, it is automatically generated instead
+  -when decoding, the color type in LodePNGInfo is set by the decoder to that of the original
+  PNG image, but it can be ignored since the raw image has the color type you requested instead
+  -if the color type of the LodePNGColorMode and PNG image aren't the same, a conversion
+  between the color types is done if the color types are supported. If it is not
+  supported, an error is returned. If the types are the same, no conversion is done.
+  -even though some conversions aren't supported, LodePNG supports loading PNGs from any
+  colortype and saving PNGs to any colortype, sometimes it just requires preparing
+  the raw image correctly before encoding.
+  -both encoder and decoder use the same color converter.
+
+  Non supported color conversions:
+  -color to greyscale: no error is thrown, but the result will look ugly because
+  only the red channel is taken
+  -anything to palette when that palette does not have that color in it: in this
+  case an error is thrown
+
+  Supported color conversions:
+  -anything to 8-bit RGB, 8-bit RGBA, 16-bit RGB, 16-bit RGBA
+  -any grey or grey+alpha, to grey or grey+alpha
+  -anything to a palette, as long as the palette has the requested colors in it
+  -removing alpha channel
+  -higher to smaller bitdepth, and vice versa
+
+  If you want no color conversion to be done (e.g. for speed or control):
+  -In the encoder, you can make it save a PNG with any color type by giving the
+  raw color mode and LodePNGInfo the same color mode, and setting auto_convert to
+  false.
+  -In the decoder, you can make it store the pixel data in the same color type
+  as the PNG has, by setting the color_convert setting to false. Settings in
+  info_raw are then ignored.
+
+  The function lodepng_convert does the color conversion. It is available in the
+  interface but normally isn't needed since the encoder and decoder already call
+  it.
+
+  6.3. padding bits
+  -----------------
+
+  In the PNG file format, if a less than 8-bit per pixel color type is used and the scanlines
+  have a bit amount that isn't a multiple of 8, then padding bits are used so that each
+  scanline starts at a fresh byte. But that is NOT true for the LodePNG raw input and output.
+  The raw input image you give to the encoder, and the raw output image you get from the decoder
+  will NOT have these padding bits, e.g. in the case of a 1-bit image with a width
+  of 7 pixels, the first pixel of the second scanline will be the 8th bit of the first byte,
+  not the first bit of a new byte.
+
+  6.4. A note about 16-bits per channel and endianness
+  ----------------------------------------------------
+
+  LodePNG uses unsigned char arrays for 16-bit per channel colors too, just like
+  for any other color format. The 16-bit values are stored in big endian (most
+  significant byte first) in these arrays. This is the opposite order of the
+  little endian used by x86 CPUs.
+
+  LodePNG always uses big endian because the PNG file format does so internally.
+  Conversions to formats other than those PNG uses internally are not supported by
+  LodePNG on purpose, there are myriads of formats, including endianness of 16-bit
+  colors, the order in which you store R, G, B and A, and so on. Supporting and
+  converting to/from all that is outside the scope of LodePNG.
+
+  This may mean that, depending on your use case, you may want to convert the big
+  endian output of LodePNG to little endian with a for loop. This is certainly not
+  always needed, many applications and libraries support big endian 16-bit colors
+  anyway, but it means you cannot simply cast the unsigned char* buffer to an
+  unsigned short* buffer on x86 CPUs.
+
+
+  7. error values
+  ---------------
+
+  All functions in LodePNG that return an error code, return 0 if everything went
+  OK, or a non-zero code if there was an error.
+
+  The meaning of the LodePNG error values can be retrieved with the function
+  lodepng_error_text: given the numerical error code, it returns a description
+  of the error in English as a string.
+
+  Check the implementation of lodepng_error_text to see the meaning of each code.
+
+
+  8. chunks and PNG editing
+  -------------------------
+
+  If you want to add extra chunks to a PNG you encode, or use LodePNG for a PNG
+  editor that should follow the rules about handling of unknown chunks, or if your
+  program is able to read other types of chunks than the ones handled by LodePNG,
+  then that's possible with the chunk functions of LodePNG.
+
+  A PNG chunk has the following layout:
+
+  4 bytes length
+  4 bytes type name
+  length bytes data
+  4 bytes CRC
+
+  8.1. iterating through chunks
+  -----------------------------
+
+  If you have a buffer containing the PNG image data, then the first chunk (the
+  IHDR chunk) starts at byte number 8 of that buffer. The first 8 bytes are the
+  signature of the PNG and are not part of a chunk. But if you start at byte 8
+  then you have a chunk, and can check the following things of it.
+
+  NOTE: none of these functions check for memory buffer boundaries. To avoid
+  exploits, always make sure the buffer contains all the data of the chunks.
+  When using lodepng_chunk_next, make sure the returned value is within the
+  allocated memory.
+
+  unsigned lodepng_chunk_length(const unsigned char* chunk):
+
+  Get the length of the chunk's data. The total chunk length is this length + 12.
+
+  void lodepng_chunk_type(char type[5], const unsigned char* chunk):
+  unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type):
+
+  Get the type of the chunk or compare if it's a certain type
+
+  unsigned char lodepng_chunk_critical(const unsigned char* chunk):
+  unsigned char lodepng_chunk_private(const unsigned char* chunk):
+  unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk):
+
+  Check if the chunk is critical in the PNG standard (only IHDR, PLTE, IDAT and IEND are).
+  Check if the chunk is private (public chunks are part of the standard, private ones not).
+  Check if the chunk is safe to copy. If it's not, then, when modifying data in a critical
+  chunk, unsafe to copy chunks of the old image may NOT be saved in the new one if your
+  program doesn't handle that type of unknown chunk.
+
+  unsigned char* lodepng_chunk_data(unsigned char* chunk):
+  const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk):
+
+  Get a pointer to the start of the data of the chunk.
+
+  unsigned lodepng_chunk_check_crc(const unsigned char* chunk):
+  void lodepng_chunk_generate_crc(unsigned char* chunk):
+
+  Check if the crc is correct or generate a correct one.
+
+  unsigned char* lodepng_chunk_next(unsigned char* chunk):
+  const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk):
+
+  Iterate to the next chunk. This works if you have a buffer with consecutive chunks. Note that these
+  functions do no boundary checking of the allocated data whatsoever, so make sure there is enough
+  data available in the buffer to be able to go to the next chunk.
+
+  unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk):
+  unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+  const char* type, const unsigned char* data):
+
+  These functions are used to create new chunks that are appended to the data in *out that has
+  length *outlength. The append function appends an existing chunk to the new data. The create
+  function creates a new chunk with the given parameters and appends it. Type is the 4-letter
+  name of the chunk.
+
+  8.2. chunks in info_png
+  -----------------------
+
+  The LodePNGInfo struct contains fields with the unknown chunk in it. It has 3
+  buffers (each with size) to contain 3 types of unknown chunks:
+  the ones that come before the PLTE chunk, the ones that come between the PLTE
+  and the IDAT chunks, and the ones that come after the IDAT chunks.
+  It's necessary to make the distinction between these 3 cases because the PNG
+  standard forces to keep the ordering of unknown chunks compared to the critical
+  chunks, but does not force any other ordering rules.
+
+  info_png.unknown_chunks_data[0] is the chunks before PLTE
+  info_png.unknown_chunks_data[1] is the chunks after PLTE, before IDAT
+  info_png.unknown_chunks_data[2] is the chunks after IDAT
+
+  The chunks in these 3 buffers can be iterated through and read by using the same
+  way described in the previous subchapter.
+
+  When using the decoder to decode a PNG, you can make it store all unknown chunks
+  if you set the option settings.remember_unknown_chunks to 1. By default, this
+  option is off (0).
+
+  The encoder will always encode unknown chunks that are stored in the info_png.
+  If you need it to add a particular chunk that isn't known by LodePNG, you can
+  use lodepng_chunk_append or lodepng_chunk_create to append it to the chunk data in
+  info_png.unknown_chunks_data[x].
+
+  Chunks that are known by LodePNG should not be added in that way. E.g. to make
+  LodePNG add a bKGD chunk, set background_defined to true and add the correct
+  parameters there instead.
+
+
+  9. compiler support
+  -------------------
+
+  No libraries other than the current standard C library are needed to compile
+  LodePNG. For the C++ version, only the standard C++ library is needed on top.
+  Add the files lodepng.c(pp) and lodepng.h to your project, include
+  lodepng.h where needed, and your program can read/write PNG files.
+
+  It is compatible with C90 and up, and C++03 and up.
+
+  If performance is important, use optimization when compiling! For both the
+  encoder and decoder, this makes a large difference.
+
+  Make sure that LodePNG is compiled with the same compiler of the same version
+  and with the same settings as the rest of the program, or the interfaces with
+  std::vectors and std::strings in C++ can be incompatible.
+
+  CHAR_BIT must be 8 or higher, because LodePNG uses unsigned chars for octets.
+
+  *) gcc and g++
+
+  LodePNG is developed in gcc so this compiler is natively supported. It gives no
+  warnings with compiler options "-Wall -Wextra -pedantic -ansi", with gcc and g++
+  version 4.7.1 on Linux, 32-bit and 64-bit.
+
+  *) Clang
+
+  Fully supported and warning-free.
+
+  *) Mingw
+
+  The Mingw compiler (a port of gcc for Windows) should be fully supported by
+  LodePNG.
+
+  *) Visual Studio and Visual C++ Express Edition
+
+  LodePNG should be warning-free with warning level W4. Two warnings were disabled
+  with pragmas though: warning 4244 about implicit conversions, and warning 4996
+  where it wants to use a non-standard function fopen_s instead of the standard C
+  fopen.
+
+  Visual Studio may want "stdafx.h" files to be included in each source file and
+  give an error "unexpected end of file while looking for precompiled header".
+  This is not standard C++ and will not be added to the stock LodePNG. You can
+  disable it for lodepng.cpp only by right clicking it, Properties, C/C++,
+  Precompiled Headers, and set it to Not Using Precompiled Headers there.
+
+  NOTE: Modern versions of VS should be fully supported, but old versions, e.g.
+  VS6, are not guaranteed to work.
+
+  *) Compilers on Macintosh
+
+  LodePNG has been reported to work both with gcc and LLVM for Macintosh, both for
+  C and C++.
+
+  *) Other Compilers
+
+  If you encounter problems on any compilers, feel free to let me know and I may
+  try to fix it if the compiler is modern and standards compliant.
+
+
+  10. examples
+  ------------
+
+  This decoder example shows the most basic usage of LodePNG. More complex
+  examples can be found on the LodePNG website.
+
+  10.1. decoder C++ example
+  -------------------------
+
+  #include "lodepng.h"
+  #include <iostream>
+
+  int main(int argc, char *argv[])
+  {
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  //load and decode
+  std::vector<unsigned char> image;
+  unsigned width, height;
+  unsigned error = lodepng::decode(image, width, height, filename);
+
+  //if there's an error, display it
+  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
+
+  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
+  }
+
+  10.2. decoder C example
+  -----------------------
+
+  #include "lodepng.h"
+
+  int main(int argc, char *argv[])
+  {
+  unsigned error;
+  unsigned char* image;
+  unsigned width, height;
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  error = lodepng_decode32_file(&image, &width, &height, filename);
+
+  if(error) printf("decoder error %u: %s\n", error, lodepng_error_text(error));
+
+  / * use image here * /
+
+  free(image);
+  return 0;
+  }
+
+  11. state settings reference
+  ----------------------------
+
+  A quick reference of some settings to set on the LodePNGState
+
+  For decoding:
+
+  state.decoder.zlibsettings.ignore_adler32: ignore ADLER32 checksums
+  state.decoder.zlibsettings.custom_...: use custom inflate function
+  state.decoder.ignore_crc: ignore CRC checksums
+  state.decoder.color_convert: convert internal PNG color to chosen one
+  state.decoder.read_text_chunks: whether to read in text metadata chunks
+  state.decoder.remember_unknown_chunks: whether to read in unknown chunks
+  state.info_raw.colortype: desired color type for decoded image
+  state.info_raw.bitdepth: desired bit depth for decoded image
+  state.info_raw....: more color settings, see struct LodePNGColorMode
+  state.info_png....: no settings for decoder but output, see struct LodePNGInfo
+
+  For encoding:
+
+  state.encoder.zlibsettings.btype: disable compression by setting it to 0
+  state.encoder.zlibsettings.use_lz77: use LZ77 in compression
+  state.encoder.zlibsettings.windowsize: tweak LZ77 windowsize
+  state.encoder.zlibsettings.minmatch: tweak min LZ77 length to match
+  state.encoder.zlibsettings.nicematch: tweak LZ77 match where to stop searching
+  state.encoder.zlibsettings.lazymatching: try one more LZ77 matching
+  state.encoder.zlibsettings.custom_...: use custom deflate function
+  state.encoder.auto_convert: choose optimal PNG color type, if 0 uses info_png
+  state.encoder.filter_palette_zero: PNG filter strategy for palette
+  state.encoder.filter_strategy: PNG filter strategy to encode with
+  state.encoder.force_palette: add palette even if not encoding to one
+  state.encoder.add_id: add LodePNG identifier and version as a text chunk
+  state.encoder.text_compression: use compressed text chunks for metadata
+  state.info_raw.colortype: color type of raw input image you provide
+  state.info_raw.bitdepth: bit depth of raw input image you provide
+  state.info_raw: more color settings, see struct LodePNGColorMode
+  state.info_png.color.colortype: desired color type if auto_convert is false
+  state.info_png.color.bitdepth: desired bit depth if auto_convert is false
+  state.info_png.color....: more color settings, see struct LodePNGColorMode
+  state.info_png....: more PNG related settings, see struct LodePNGInfo
+
+
+  12. changes
+  -----------
+
+  The version number of LodePNG is the date of the change given in the format
+  yyyymmdd.
+
+  Some changes aren't backwards compatible. Those are indicated with a (!)
+  symbol.
+
+  *) 17 sep 2017: fix memory leak for some encoder input error cases
+  *) 27 nov 2016: grey+alpha auto color model detection bugfix
+  *) 18 apr 2016: Changed qsort to custom stable sort (for platforms w/o qsort).
+  *) 09 apr 2016: Fixed colorkey usage detection, and better file loading (within
+  the limits of pure C90).
+  *) 08 dec 2015: Made load_file function return error if file can't be opened.
+  *) 24 okt 2015: Bugfix with decoding to palette output.
+  *) 18 apr 2015: Boundary PM instead of just package-merge for faster encoding.
+  *) 23 aug 2014: Reduced needless memory usage of decoder.
+  *) 28 jun 2014: Removed fix_png setting, always support palette OOB for
+  simplicity. Made ColorProfile public.
+  *) 09 jun 2014: Faster encoder by fixing hash bug and more zeros optimization.
+  *) 22 dec 2013: Power of two windowsize required for optimization.
+  *) 15 apr 2013: Fixed bug with LAC_ALPHA and color key.
+  *) 25 mar 2013: Added an optional feature to ignore some PNG errors (fix_png).
+  *) 11 mar 2013 (!): Bugfix with custom free. Changed from "my" to "lodepng_"
+  prefix for the custom allocators and made it possible with a new #define to
+  use custom ones in your project without needing to change lodepng's code.
+  *) 28 jan 2013: Bugfix with color key.
+  *) 27 okt 2012: Tweaks in text chunk keyword length error handling.
+  *) 8 okt 2012 (!): Added new filter strategy (entropy) and new auto color mode.
+  (no palette). Better deflate tree encoding. New compression tweak settings.
+  Faster color conversions while decoding. Some internal cleanups.
+  *) 23 sep 2012: Reduced warnings in Visual Studio a little bit.
+  *) 1 sep 2012 (!): Removed #define's for giving custom (de)compression functions
+  and made it work with function pointers instead.
+  *) 23 jun 2012: Added more filter strategies. Made it easier to use custom alloc
+  and free functions and toggle #defines from compiler flags. Small fixes.
+  *) 6 may 2012 (!): Made plugging in custom zlib/deflate functions more flexible.
+  *) 22 apr 2012 (!): Made interface more consistent, renaming a lot. Removed
+  redundant C++ codec classes. Reduced amount of structs. Everything changed,
+  but it is cleaner now imho and functionality remains the same. Also fixed
+  several bugs and shrunk the implementation code. Made new samples.
+  *) 6 nov 2011 (!): By default, the encoder now automatically chooses the best
+  PNG color model and bit depth, based on the amount and type of colors of the
+  raw image. For this, autoLeaveOutAlphaChannel replaced by auto_choose_color.
+  *) 9 okt 2011: simpler hash chain implementation for the encoder.
+  *) 8 sep 2011: lz77 encoder lazy matching instead of greedy matching.
+  *) 23 aug 2011: tweaked the zlib compression parameters after benchmarking.
+  A bug with the PNG filtertype heuristic was fixed, so that it chooses much
+  better ones (it's quite significant). A setting to do an experimental, slow,
+  brute force search for PNG filter types is added.
+  *) 17 aug 2011 (!): changed some C zlib related function names.
+  *) 16 aug 2011: made the code less wide (max 120 characters per line).
+  *) 17 apr 2011: code cleanup. Bugfixes. Convert low to 16-bit per sample colors.
+  *) 21 feb 2011: fixed compiling for C90. Fixed compiling with sections disabled.
+  *) 11 dec 2010: encoding is made faster, based on suggestion by Peter Eastman
+  to optimize long sequences of zeros.
+  *) 13 nov 2010: added LodePNG_InfoColor_hasPaletteAlpha and
+  LodePNG_InfoColor_canHaveAlpha functions for convenience.
+  *) 7 nov 2010: added LodePNG_error_text function to get error code description.
+  *) 30 okt 2010: made decoding slightly faster
+  *) 26 okt 2010: (!) changed some C function and struct names (more consistent).
+  Reorganized the documentation and the declaration order in the header.
+  *) 08 aug 2010: only changed some comments and external samples.
+  *) 05 jul 2010: fixed bug thanks to warnings in the new gcc version.
+  *) 14 mar 2010: fixed bug where too much memory was allocated for char buffers.
+  *) 02 sep 2008: fixed bug where it could create empty tree that linux apps could
+  read by ignoring the problem but windows apps couldn't.
+  *) 06 jun 2008: added more error checks for out of memory cases.
+  *) 26 apr 2008: added a few more checks here and there to ensure more safety.
+  *) 06 mar 2008: crash with encoding of strings fixed
+  *) 02 feb 2008: support for international text chunks added (iTXt)
+  *) 23 jan 2008: small cleanups, and #defines to divide code in sections
+  *) 20 jan 2008: support for unknown chunks allowing using LodePNG for an editor.
+  *) 18 jan 2008: support for tIME and pHYs chunks added to encoder and decoder.
+  *) 17 jan 2008: ability to encode and decode compressed zTXt chunks added
+  Also various fixes, such as in the deflate and the padding bits code.
+  *) 13 jan 2008: Added ability to encode Adam7-interlaced images. Improved
+  filtering code of encoder.
+  *) 07 jan 2008: (!) changed LodePNG to use ISO C90 instead of C++. A
+  C++ wrapper around this provides an interface almost identical to before.
+  Having LodePNG be pure ISO C90 makes it more portable. The C and C++ code
+  are together in these files but it works both for C and C++ compilers.
+  *) 29 dec 2007: (!) changed most integer types to unsigned int + other tweaks
+  *) 30 aug 2007: bug fixed which makes this Borland C++ compatible
+  *) 09 aug 2007: some VS2005 warnings removed again
+  *) 21 jul 2007: deflate code placed in new namespace separate from zlib code
+  *) 08 jun 2007: fixed bug with 2- and 4-bit color, and small interlaced images
+  *) 04 jun 2007: improved support for Visual Studio 2005: crash with accessing
+  invalid std::vector element [0] fixed, and level 3 and 4 warnings removed
+  *) 02 jun 2007: made the encoder add a tag with version by default
+  *) 27 may 2007: zlib and png code separated (but still in the same file),
+  simple encoder/decoder functions added for more simple usage cases
+  *) 19 may 2007: minor fixes, some code cleaning, new error added (error 69),
+  moved some examples from here to lodepng_examples.cpp
+  *) 12 may 2007: palette decoding bug fixed
+  *) 24 apr 2007: changed the license from BSD to the zlib license
+  *) 11 mar 2007: very simple addition: ability to encode bKGD chunks.
+  *) 04 mar 2007: (!) tEXt chunk related fixes, and support for encoding
+  palettized PNG images. Plus little interface change with palette and texts.
+  *) 03 mar 2007: Made it encode dynamic Huffman shorter with repeat codes.
+  Fixed a bug where the end code of a block had length 0 in the Huffman tree.
+  *) 26 feb 2007: Huffman compression with dynamic trees (BTYPE 2) now implemented
+  and supported by the encoder, resulting in smaller PNGs at the output.
+  *) 27 jan 2007: Made the Adler-32 test faster so that a timewaste is gone.
+  *) 24 jan 2007: gave encoder an error interface. Added color conversion from any
+  greyscale type to 8-bit greyscale with or without alpha.
+  *) 21 jan 2007: (!) Totally changed the interface. It allows more color types
+  to convert to and is more uniform. See the manual for how it works now.
+  *) 07 jan 2007: Some cleanup & fixes, and a few changes over the last days:
+  encode/decode custom tEXt chunks, separate classes for zlib & deflate, and
+  at last made the decoder give errors for incorrect Adler32 or Crc.
+  *) 01 jan 2007: Fixed bug with encoding PNGs with less than 8 bits per channel.
+  *) 29 dec 2006: Added support for encoding images without alpha channel, and
+  cleaned out code as well as making certain parts faster.
+  *) 28 dec 2006: Added "Settings" to the encoder.
+  *) 26 dec 2006: The encoder now does LZ77 encoding and produces much smaller files now.
+  Removed some code duplication in the decoder. Fixed little bug in an example.
+  *) 09 dec 2006: (!) Placed output parameters of public functions as first parameter.
+  Fixed a bug of the decoder with 16-bit per color.
+  *) 15 okt 2006: Changed documentation structure
+  *) 09 okt 2006: Encoder class added. It encodes a valid PNG image from the
+  given image buffer, however for now it's not compressed.
+  *) 08 sep 2006: (!) Changed to interface with a Decoder class
+  *) 30 jul 2006: (!) LodePNG_InfoPng , width and height are now retrieved in different
+  way. Renamed decodePNG to decodePNGGeneric.
+  *) 29 jul 2006: (!) Changed the interface: image info is now returned as a
+  struct of type LodePNG::LodePNG_Info, instead of a vector, which was a bit clumsy.
+  *) 28 jul 2006: Cleaned the code and added new error checks.
+  Corrected terminology "deflate" into "inflate".
+  *) 23 jun 2006: Added SDL example in the documentation in the header, this
+  example allows easy debugging by displaying the PNG and its transparency.
+  *) 22 jun 2006: (!) Changed way to obtain error value. Added
+  loadFile function for convenience. Made decodePNG32 faster.
+  *) 21 jun 2006: (!) Changed type of info vector to unsigned.
+  Changed position of palette in info vector. Fixed an important bug that
+  happened on PNGs with an uncompressed block.
+  *) 16 jun 2006: Internally changed unsigned into unsigned where
+  needed, and performed some optimizations.
+  *) 07 jun 2006: (!) Renamed functions to decodePNG and placed them
+  in LodePNG namespace. Changed the order of the parameters. Rewrote the
+  documentation in the header. Renamed files to lodepng.cpp and lodepng.h
+  *) 22 apr 2006: Optimized and improved some code
+  *) 07 sep 2005: (!) Changed to std::vector interface
+  *) 12 aug 2005: Initial release (C++, decoder only)
+
+
+  13. contact information
+  -----------------------
+
+  Feel free to contact me with suggestions, problems, comments, ... concerning
+  LodePNG. If you encounter a PNG image that doesn't work properly with this
+  decoder, feel free to send it and I'll use it to find and fix the problem.
+
+  My email address is (puzzle the account and domain together with an @ symbol):
+  Domain: gmail dot com.
+  Account: lode dot vandevenne.
+
+
+  Copyright (c) 2005-2017 Lode Vandevenne
+  */
\ No newline at end of file
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/main.cpp b/c_cxx/ort_tutorial/10_ep-device-selection/main.cpp
new file mode 100644
index 000000000..12783597e
--- /dev/null
+++ b/c_cxx/ort_tutorial/10_ep-device-selection/main.cpp
@@ -0,0 +1,198 @@
+#include <onnxruntime/core/graph/constants.h>
+#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
+#include <onnxruntime/core/session/onnxruntime_run_options_config_keys.h>
+#include <onnxruntime/core/session/onnxruntime_session_options_config_keys.h>
+
+#include <cstdint>
+#include <cstdlib>
+#include <exception>
+#include <filesystem>
+#include <iostream>
+#include <ostream>
+#include <regex>
+#include <vector>
+
+#include "argparsing.h"
+#include "lodepng/lodepng.h"
+#include "utils.h"
+
+#if ORT_API_VERSION < 23
+#error "Onnx runtime header too old. Version >=1.23.0 assumed"
+#endif
+
+using OrtFileString = std::basic_string<ORTCHAR_T>;
+
+// Converts `path` to the ORTCHAR_T-based string that the ORT calls in this file take for file paths.
+static OrtFileString toOrtFileString(const std::filesystem::path& path) {
+  return path.string<ORTCHAR_T>();  // transcodes; widening string() char by char mangled non-ASCII paths
+}
+
+// Pairs an execution provider's registration name with the file name of its provider library.
+#define PROVIDER_LIB_PAIR(NAME) std::pair { NAME, DLL_NAME("onnxruntime_providers_" NAME) }
+
+// Registers each listed provider library that sits next to the executable; missing or failing ones are skipped.
+static void register_execution_providers(Ort::Env& env) {
+  // clang-format off
+  const std::array provider_libraries{
+      PROVIDER_LIB_PAIR("nv_tensorrt_rtx"),
+      PROVIDER_LIB_PAIR("cuda"),
+      PROVIDER_LIB_PAIR("openvino"),
+      PROVIDER_LIB_PAIR("qnn"),
+      PROVIDER_LIB_PAIR("cann"),
+  };
+  // clang-format on
+  for (const auto& [registration_name, dll] : provider_libraries) {
+    const auto providers_library = get_executable_path().parent_path() / dll;
+    if (!std::filesystem::is_regular_file(providers_library)) {
+      LOG("{} does not exist! Skipping execution provider", providers_library.string());
+      continue;
+    }
+    try {
+      env.RegisterExecutionProviderLibrary(registration_name, toOrtFileString(providers_library));
+    } catch (const std::exception& ex) {
+      LOG("Failed to register {} ({})! Skipping execution provider", providers_library.string(), ex.what());
+    }
+  }
+}
+
+// Picks a memory info that an output of `input_session` and an input of `output_session` have in common,
+// preferring default non-CPU device memory over anything else. Throws if the sessions share none.
+Ort::ConstMemoryInfo match_common_memory_info(const Ort::Session& input_session, const Ort::Session& output_session) {
+  auto producer_infos = input_session.GetMemoryInfoForOutputs();
+  auto consumer_infos = output_session.GetMemoryInfoForInputs();
+
+  // Pass 1 (device_only) accepts only shared default non-CPU device memory.
+  // Pass 2 falls back to any shared memory info, CPU included.
+  for (const bool device_only : {true, false}) {
+    for (auto& produced : producer_infos) {
+      for (auto& consumed : consumer_infos) {
+        if (!(produced == consumed)) {
+          continue;
+        }
+        if (!device_only || (produced.GetDeviceType() != OrtMemoryInfoDeviceType_CPU &&
+                             produced.GetDeviceMemoryType() == OrtDeviceMemoryType_DEFAULT)) {
+          return produced;
+        }
+      }
+    }
+  }
+  THROW_ERROR("Could not find a common allocator");
+}
+
+// Builds the session options for `opts`: EP devices whose hardware vendor or EP name equals
+// opts.select_vendor / opts.select_ep (ignoring case) are added explicitly, then the EP selection policy is set.
+static Ort::SessionOptions create_session_options(Ort::Env& env, const Opts& opts) {
+  const auto wanted_vendor = to_uppercase(opts.select_vendor);
+  const auto wanted_ep = to_uppercase(opts.select_ep);
+  std::vector<Ort::ConstEpDevice> selected_devices;
+  auto ep_devices = env.GetEpDevices();
+  LOG("{} devices found", ep_devices.size());
+  for (auto& device : ep_devices) {
+    auto metadata = device.Device().Metadata();
+    // On Windows the LUID lets an application match EpDevices to the IDXGIAdapter it already selected.
+    auto luid = metadata.GetValue("LUID");
+    LOG("Vendor: {}, EpName: {}, DeviceId: 0x{:x}, LUID: {}", device.EpVendor(), device.EpName(),
+        device.Device().DeviceId(), luid ? luid : "<unavailable>");
+    // Normalise both sides so case really is ignored; `||` adds a device once even if both filters match.
+    if (wanted_vendor == to_uppercase(device.Device().Vendor()) || wanted_ep == to_uppercase(device.EpName())) {
+      selected_devices.push_back(device);
+    }
+  }
+
+  Ort::SessionOptions so;
+  if (!selected_devices.empty()) {
+    Ort::KeyValuePairs ep_options;
+    // Select EP for manually selected devices
+    so.AppendExecutionProvider_V2(env, selected_devices, ep_options);
+  }
+
+  so.SetEpSelectionPolicy(opts.ep_device_policy);
+  return so;
+}
+
+// Opens the model at `model_file` as an Ort::Session; throws when the path is not a regular file.
+static Ort::Session create_session(Ort::Env& env, std::filesystem::path& model_file,
+                                   const Ort::SessionOptions& session_options) {
+  const bool model_exists = std::filesystem::is_regular_file(model_file);
+  if (!model_exists) {
+    THROW_ERROR("Model file \"{}\" does not exist!", model_file.string());
+  }
+  return Ort::Session(env, toOrtFileString(model_file).c_str(), session_options);
+}
+
+// Sample entry point: selects execution provider devices from the command line options, runs the model once
+// on the input PNG and writes the "depth" output as an 8-bit greyscale PNG. Returns EXIT_FAILURE on any error.
+auto main(int argc, char** argv) -> int {
+  try {
+    Opts opts = parse_args(argc, argv);
+    const auto& api = Ort::GetApi();  // bind by reference: plain `auto` would copy the whole OrtApi table
+    LOG("Hello from ONNX runtime version: {} (build info {})\n", Ort::GetVersionString(), api.GetBuildInfoString());
+
+    // Setup ORT environment
+    auto env = Ort::Env(ORT_LOGGING_LEVEL_WARNING);
+    register_execution_providers(env);
+    // Create session options for ORT environment according to command line parameters
+    auto session_options = create_session_options(env, opts);
+
+    // Locate the ONNX model; a "<model>_ctx.onnx" file next to the executable is preferred when it exists
+    std::string model_file = MODEL_FILE;
+    auto model_path = get_executable_path().parent_path() / MODEL_FILE;  // defined via CMAKE
+    auto model_context_file = std::regex_replace(model_file, std::regex("\\.onnx$"), "_ctx.onnx");  // literal dot
+    auto model_context_path = get_executable_path().parent_path() / model_context_file;
+    bool use_model_context = std::filesystem::is_regular_file(model_context_path);
+    auto load_path = use_model_context ? model_context_path : model_path;
+
+    // Prepare inputs
+    uint8_t* image{};
+    DEFER(image, free(image));
+    uint32_t width{};
+    uint32_t height{};
+    auto error = lodepng_decode32_file(&image, &width, &height, opts.input_image.c_str());
+    if (error) {
+      LOG("Failed to load image \"{}\": {}", opts.input_image, lodepng_error_text(error));
+      return EXIT_FAILURE;
+    }
+    LOG("Loaded image \"{}\" with size {}x{}", opts.input_image, width, height);
+
+    CHECK_ORT(api.AddFreeDimensionOverrideByName(session_options, "N", 1));
+    CHECK_ORT(api.AddFreeDimensionOverrideByName(session_options, "W", width));
+    CHECK_ORT(api.AddFreeDimensionOverrideByName(session_options, "H", height));
+    if (!use_model_context) {
+      session_options.AddConfigEntry(kOrtSessionOptionEpContextEnable, opts.enableEpContext ? "1" : "0");
+    }
+
+    auto infer_session = create_session(env, load_path, session_options);
+
+    Ort::AllocatorWithDefaultOptions cpu_allocator;
+    std::array input_shape{int64_t(1), int64_t(height), int64_t(width), int64_t(4)};
+    auto output_shape = infer_session.GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
+
+    // This allocates input and output on CPU. This is probably not what you want when doing multiple inferences on a
+    // GPU. See the sample 20_devicetensors-datatransfer for how to use device memory and synchronization streams.
+    Ort::Value input_value = Ort::Value::CreateTensor<uint8_t>(
+        cpu_allocator.GetInfo(), image, size_t(width) * height * 4, input_shape.data(), input_shape.size());
+    Ort::Value output_value =
+        Ort::Value::CreateTensor<uint8_t>(cpu_allocator, output_shape.data(), output_shape.size());
+
+    Ort::IoBinding inference_binding(infer_session);
+    inference_binding.BindInput("input", input_value);
+    inference_binding.BindOutput("depth", output_value);
+
+    Ort::RunOptions run_options;
+    run_options.AddConfigEntry(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "1");
+    infer_session.Run(run_options, inference_binding);
+    inference_binding.SynchronizeOutputs();
+
+    error = lodepng_encode_file(opts.output_image.c_str(), output_value.GetTensorData<uint8_t>(), output_shape[2],
+                                output_shape[1], LCT_GREY, 8);
+    if (error) {
+      LOG("Failed to write image \"{}\": {}", opts.output_image, lodepng_error_text(error));
+      return EXIT_FAILURE;
+    }
+  } catch (const std::exception& err) {  // Ort::Exception derives from std::exception, not std::runtime_error
+    std::cerr << err.what() << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/utils.cpp b/c_cxx/ort_tutorial/10_ep-device-selection/utils.cpp
new file mode 100644
index 000000000..1988bcc8b
--- /dev/null
+++ b/c_cxx/ort_tutorial/10_ep-device-selection/utils.cpp
@@ -0,0 +1,50 @@
+#include "utils.h"
+#include <filesystem>
+#include <vector>
+
+#ifdef _WIN32
+#include <windows.h> // For GetModuleFileNameW
+#elif __APPLE__
+#include <limits.h>      // For PATH_MAX or similar
+#include <mach-o/dyld.h> // For _NSGetExecutablePath
+#elif __linux__
+#include <limits.h> // For PATH_MAX
+#include <unistd.h> // For readlink
+#endif
+
+std::filesystem::path get_executable_path() {
+#ifdef _WIN32
+  // Windows: Use GetModuleFileNameW for wide characters
+  std::vector<wchar_t> pathBuf(MAX_PATH);
+  DWORD length = GetModuleFileNameW(NULL, pathBuf.data(), pathBuf.size());
+
+  while (length == pathBuf.size()) {
+    pathBuf.resize(pathBuf.size() * 2);
+    length = GetModuleFileNameW(NULL, pathBuf.data(), pathBuf.size());
+  }
+
+  if (length == 0) {
+    std::cerr << "Error: GetModuleFileNameW failed with error "
+              << GetLastError() << std::endl;
+    return {};
+  }
+  return std::filesystem::path(pathBuf.data());
+
+#elif __APPLE__
+  // macOS: Use _NSGetExecutablePath
+  std::vector<char> pathBuf(PATH_MAX);
+  uint32_t length = pathBuf.size();
+  if (_NSGetExecutablePath(pathBuf.data(), &length) != 0) {
+    // Buffer was too small, resize and try again
+    pathBuf.resize(length + 1); // +1 for null terminator
+    _NSGetExecutablePath(pathBuf.data(), &length);
+  }
+  return std::filesystem::canonical(
+      pathBuf.data()); // canonical to resolve symlinks
+
+#elif __linux__
+  // Linux: Use /proc/self/exe symlink
+  return std::filesystem::canonical(
+      std::filesystem::read_symlink("/proc/self/exe"));
+#endif
+}
diff --git a/c_cxx/ort_tutorial/10_ep-device-selection/utils.h b/c_cxx/ort_tutorial/10_ep-device-selection/utils.h
new file mode 100644
index 000000000..6e3da2727
--- /dev/null
+++ b/c_cxx/ort_tutorial/10_ep-device-selection/utils.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <filesystem>
+#include <format>
+#include <iostream>
+
+#ifndef __cpp_lib_format
+#error                                                                         \
+    "__cpp_lib_format is not defined! This samples requires a C++ 20 compiler"
+#endif
+
+std::filesystem::path get_executable_path();
+
+// LOG prints one std::format-ted line to stdout; THROW_ERROR logs, then throws.
+#define LOG(...) std::cout << std::format(__VA_ARGS__) << "\n"
+#define THROW_ERROR(...) /* braced: stays a single statement under `if` */     \
+  { LOG(__VA_ARGS__); throw std::runtime_error(std::format(__VA_ARGS__)); }
+
+#define DEFER(resource, x) /* runs `x` when resource_finalizer is destroyed */ \
+  std::shared_ptr<void> resource##_finalizer(nullptr, [&](...) { x; })
+
+// CHECK_ORT: evaluates `call` once; a non-null status is reported via THROW_ERROR.
+#define CHECK_ORT(call)                                                        \
+  {                                                                            \
+    if (auto status = (call); status != nullptr) {                             \
+      THROW_ERROR("{}", Ort::GetApi().GetErrorMessage(status));                \
+    }                                                                          \
+  }
+
+// Returns a copy of `s` with each byte upper-cased via std::toupper.
+inline static std::string to_uppercase(const std::string &s) {
+  std::string rtn(s); // transformed in place below
+  std::transform(rtn.begin(), rtn.end(), rtn.begin(), // unsigned char: no UB
+                 [](unsigned char c) { return std::toupper(c); });
+  return rtn;
+}
+
+#define DLL_NAME(name) (DLL_PREFIX name DLL_SUFFIX) // adjacent string literals
+#ifdef _WIN32
+#define DLL_PREFIX ""
+#define DLL_SUFFIX ".dll"
+#else
+#define DLL_PREFIX "lib"
+#define DLL_SUFFIX ".so"
+#endif
+#define PROVIDER_DLL_NAME(X) DLL_NAME("onnxruntime_providers_" X)
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/CMakeLists.txt b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/CMakeLists.txt
new file mode 100644
index 000000000..93ce8259c
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/CMakeLists.txt
@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.20)
+project(winai-samples)
+
+# Make ../cmake visible so that include(onnxruntimesetup) resolves.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
+include(onnxruntimesetup)
+
+add_executable(devicetensors-datatransfer
+    main.cpp
+    lodepng/lodepng.cpp
+    utils.cpp
+    )
+
+set_target_properties(devicetensors-datatransfer PROPERTIES
+    CXX_STANDARD 20
+    CXX_EXTENSIONS OFF
+    )
+target_link_libraries(devicetensors-datatransfer PRIVATE
+  onnxruntime_interface
+)
+target_include_directories(devicetensors-datatransfer PRIVATE
+    lodepng # same directory as the lodepng/lodepng.cpp source listed above
+)
+
+set(ONNX "candy.onnx")
+
+copy_file_to_bin_dir(${ONNX})
+copy_file_to_bin_dir(Input.png)
+
+set_target_properties(devicetensors-datatransfer
+    PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+    LIBRARY_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+    RUNTIME_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+)
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/Input.png b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/Input.png
new file mode 100644
index 000000000..3d64ee0f2
Binary files /dev/null and b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/Input.png differ
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/README.md b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/README.md
new file mode 100644
index 000000000..caaa4b077
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/README.md
@@ -0,0 +1,209 @@
+# EP agnostic IO binding
+
+Binding inputs and outputs directly to device memory greatly reduces the need for repeated data transfers between CPU and GPU. It allows precise control over where tensors are allocated (CPU, GPU, or pinned memory), making it possible to leverage fast device memory for intermediate results and avoid blocking operations. By eliminating unnecessary CPU-GPU communication, IO Binding also helps achieve faster inference.
+
+Efficient IO binding in ONNX Runtime (ORT) has historically required developers to tailor their applications to specific execution providers (EPs) and their respective vendor APIs, such as CUDA. Only DirectML allowed for vendor-agnostic device tensors. This technical deep dive explores emerging practices that enable hardware-agnostic memory allocation and IO binding — simplifying application development and maximizing hardware compatibility.
+
+Previously, creating and preparing tensors for inference in ORT involved code similar to below:
+
+```c
+Ort::SessionOptions opts;
+auto cuda_options =  std::make_unique<OrtCUDAProviderOptions>();
+opts.AppendExecutionProvider_CUDA(*cuda_options);
+Ort::Session session(*ort_env,"model.onnx",opts);
+
+Ort::MemoryInfo mem_info("CUDA", OrtArenaAllocator, 0, OrtMemTypeDefault);
+Ort::Allocator alloc(session, mem_info);
+std::vector<int64_t> shape{3, 4};
+std::vector<float> data(12, 1.0f);
+auto value = Ort::Value::CreateTensor(alloc,shape.data(), shape.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
+cudaMemcpy(value.GetTensorMutableData<float>(), data.data(), data.size() * sizeof(float), cudaMemcpyHostToDevice);
+
+```
+
+The workflow shown in the code above forces developers to:
+
+* Know EP details in advance.  
+* Maintain distinct code paths for each hardware target when creating device tensors  
+* Use vendor specific APIs to copy data from CPU to the inference device and back
+
+Such EP dependency increases code complexity and maintenance effort, and limits portability. Since ONNX Runtime 1.23, new cross-vendor APIs remove this dependency; previously, this was only possible on Windows through DirectX tensors and the DirectML execution provider.
+
+# Hardware-Agnostic Memory Info Creation
+
+The latest ONNX Runtime revision introduces the CreateMemoryInfo\_V2 API, which enables developers to create memory info objects without specifying EP-specific strings:
+
+|  `ORT_API2_STATUS(CreateMemoryInfo_V2,     _In_ const char* name,     _In_ enum OrtMemoryInfoDeviceType device_type,     _In_ uint32_t vendor_id,     _In_ int32_t device_id,     _In_ enum OrtDeviceMemoryType mem_type,     _In_ size_t alignment,     enum OrtAllocatorType allocator_type,     _Outptr_ OrtMemoryInfo** out);`  |
+| :---- |
+
+Key parameters:
+
+* **vendor\_id**: The sole value identifying the hardware vendor (e.g., NVIDIA, AMD). Use 0 for generic devices such as WebGPU.  
+* **name**: Arbitrary string, no longer tied to EP.  
+* **device\_type, device\_id, mem\_type**: Specify the device class and memory characteristics, independent of EP.  
+* **alignment, allocator\_type**: Fine-tune memory allocation specifics.
+
+This design dramatically reduces hardware coupling, allowing single code paths to serve multiple EPs.
+
+# Determining Vendor ID with ONNX Runtime APIs
+
+To keep the creation workflow hardware-agnostic, you can programmatically discover the vendor ID at runtime using the following session API:
+
+|  `// Obtain EP device for the input(s) ortApi.SessionGetEpDeviceForInputs(session, &epDevices, num_epDevices); // Get the associated hardware device const OrtHardwareDevice* hw_device = ortApi.EpDevice_Device(epDevices); // Extract the vendor ID UINT vendorID = ortApi.HardwareDevice_VendorId(hw_device);`  |
+| :---- |
+
+* Register all anticipated EPs at application start.  
+* Set EP selection policy to prefer GPU.  
+* Delegate device selection to ORT, which matches inputs to suitable EPs based on the selection policy set above.  
+* The vendor ID, retrieved at runtime, informs memory info creation—removing the need for hardcoding or prior knowledge of hardware.
+
+This method ensures tensor creation and IO binding remain agnostic to specific IHV implementations.
+
+## Alternative Vendor ID Discovery
+
+Developers can enumerate graphics adapters using the DXCore library instead of ONNX Runtime’s built-in APIs. This approach provides system-wide hardware awareness, especially useful in multi-GPU environments or custom device management scenarios.
+
+An example of EP-agnostic IO binding in practice is available in this sample's [main.cpp](./main.cpp).
+
+# Technical Advantages
+
+* **Portability**: Enables deployment across diverse hardware from various IHVs without modifying source code.  
+* **Developer Velocity**: Removes EP-specific barriers, streamlining development and maintenance.  
+* **Hardware Optimization**: Leverages ORT’s automatic EP selection and device management for optimal hardware utilization. With this it is possible to optimize for device specific memory instead of leveraging the common denominator which is usually a CPU tensor.
+
+|  `// Register all supported EPs RegisterAvailableEPs(session); sessionOptions.SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy_PREFER_GPU);   // Use SessionGetEpDeviceForInputs to determine the runtime EP device and vendor ID ortApi.SessionGetInputCount(session, &num_epDevices); ortApi.SessionGetEpDeviceForInputs(session, &epDevices, 1); const OrtHardwareDevice* hw_device = ortApi.EpDevice_Device(epDevices); UINT vendorID = ortApi.HardwareDevice_VendorId(hw_device);   // Use CreateMemoryInfo_V2 for hardware-agnostic tensor allocation OrtMemoryInfo* memory_info_agnostic_io = nullptr; ortApi.CreateMemoryInfo_V2("IHV_Agnostic_Memory", OrtMemoryInfoDeviceType_GPU, /*vendor_id*/vendorID, /*device_id*/0, OrtDeviceMemoryType_DEFAULT, /*default alignment*/0, OrtArenaAllocator, &memory_info_agnostic_io);   // Create input/output tensors with runtime-discovered memory_info OrtValue* input_tensor = CreateTensor(memory_info_agnostic_io, ...); OrtValue* output_tensor = CreateTensor(memory_info_agnostic_io, ...);   // Bind input/output using regular IO binding routines ioBinding.BindInput(InputTensorName.get(), input_tensor); ioBinding.BindOutput(OutputTensorName.get(), output_tensor);`  |
+| :---- |
+
+# Efficient CPU–GPU Data Movement and Inference with ONNX Runtime
+
+ONNX Runtime has also introduced an EP-agnostic mechanism for managing data transfers between host (CPU) and device (GPU). This feature allows application developers to upload data onto device memory, run inference, and avoid repeated data transfers — all without having to manage EP-specific complexities.
+
+The latest ONNX Runtime exposes the CopyTensors API, which abstracts away EP-specific implementations of data movement. While backend-specific handling is executed by the respective EP (CUDA, ROCm, DirectML, etc.), developers interact with a single generic API.
+
+This opaque design simplifies application-level development: once an EP is selected, the runtime transparently manages memory copies between CPU and GPU without requiring code changes for different hardware backends.
+
+## **Step 1: Registering Execution Providers**
+
+Before running inference, 
+
+* All available EPs are registered  
+* A GPU-preferred EP selection policy is chosen.   
+* EP configuration options — such as a user-supplied stream (`user_compute_stream`) — are assigned to fine-tune behavior and synchronization.
+
+## **Step 2: Creating CPU Input Tensors**
+
+We begin by constructing a CPU tensor to hold the input data:
+
+```c
+Ort::Value input_value = Ort::Value::CreateTensor<float>(
+    cpu_allocator.GetInfo(),
+    input_data.data(), input_data.size(),
+    input_shape.data(), input_shape.size()
+);
+cpu_input_tensors.push_back(std::move(input_value));
+```
+
+This serves as the staging location for host-resident data.
+
+## **Step 3: Allocating Device (GPU) Input Tensors**
+
+To create a device tensor in an EP-agnostic manner, we construct an intermediate `OrtMemoryInfo` object and use it to obtain a shared allocator for device memory:
+
+```c
+OrtMemoryInfo* input_memory_info_agnostic = nullptr;
+ortApi.CreateMemoryInfo_V2(
+    "Input_Agnostic", 
+    OrtMemoryInfoDeviceType_GPU, 
+    /*vendor_id*/ 0x10de, 
+    /*device_id*/ 0, 
+    OrtDeviceMemoryType_DEFAULT, 
+    /*default alignment*/ 0, 
+    OrtArenaAllocator, 
+    &input_memory_info_agnostic
+);
+
+OrtAllocator* allocator = nullptr;
+ortApi.GetSharedAllocator(ortEnvironment, input_memory_info_agnostic, &allocator);
+
+auto src_shape = cpu_input_tensors[idx].GetTensorTypeAndShapeInfo().GetShape();
+Ort::Value device_input_value = Ort::Value::CreateTensor<float>(
+    allocator, src_shape.data(), src_shape.size()
+);
+```
+
+The resulting tensor resides in GPU memory and can now be used directly as an input for inference.
+
+## **Step 4: Direct CPU Inference vs. IOBinding**
+
+When performing inference in a loop, there are two possible approaches:
+
+## **a) Direct CPU Pointer Inference**
+
+It is possible to skip explicit GPU allocation and rely on ONNX Runtime to handle data transfers implicitly.
+
+```c
+input_tensors.push_back(std::move(cpu_input_tensors[idx]));
+output_tensors.push_back(std::move(cpu_output_tensors[idx]));
+for (int i = 0; i < 100; i++) {
+    session.Run(Ort::RunOptions{}, input_names.data(), input_tensors.data(), input_tensors.size(), output_names.data(), output_tensors.data(), output_tensors.size());
+}
+```
+
+In this mode, every iteration performs:
+
+1. Upload of CPU → GPU input data  
+2. Execution of the model  
+3. Download of results back to CPU
+
+Although simpler, this incurs repetitive transfer overhead.
+
+## **b) Optimized Inference with IOBinding**
+
+To avoid redundant transfers, ONNX Runtime allows IOBinding, where data is bound once to device memory and subsequently reused across runs.
+
+```c
+src_tensor_ptrs.push_back(cpu_input_tensors[idx]);
+dst_tensor_ptrs.push_back(device_input_value);
+input_tensors.push_back(std::move(device_input_value));
+
+auto dst_shape = cpu_output_tensors[idx].GetTensorTypeAndShapeInfo().GetShape();
+Ort::Value device_output_value = Ort::Value::CreateTensor<float>(allocator, dst_shape.data(), dst_shape.size());
+output_tensors.push_back(std::move(device_output_value));
+
+ortApi.CopyTensors(ortEnvironment, src_tensor_ptrs.data(), dst_tensor_ptrs.data(), stream, src_tensor_ptrs.size());
+
+Ort::IoBinding iobinding(session);
+iobinding.BindInput(InputTensorName.get(), input_tensors[0]);
+iobinding.BindOutput(OutputTensorName.get(), output_tensors[0]);
+
+for (int i = 0; i < 100; i++) {
+    session.Run(Ort::RunOptions{}, iobinding);
+}
+```
+
+With IOBinding:
+
+* Data transfer (CopyTensors) happens only once before the loop.  
+* The input tensor stays resident on device memory, eliminating CPU↔GPU synchronization at every inference step.
+
+This significantly reduces latency in scenarios where repeated inference is performed on the same input size or preprocessed batches.
+
+The following are Nsight traces depicting the performance overhead due to repetitive copies between host to device and device to host in case of no IO binding:
+
+![](image1.png)
+
+For every inference run, the host-to-device copy of the input takes around 2.1 milliseconds and the device-to-host copy of the output takes around 2.1 milliseconds. For a 100-iteration loop, this adds up to around 420 milliseconds of copy time.
+
+On the other hand, when the input and output tensors are bound with IOBinding, we can perform the host-to-device copy of the input just once before the inference loop and the device-to-host copy of the output just once after the loop. The following Nsight trace depicts multiple inference runs in the loop without any copy operations in between:  
+![](image2.png)
+
+In this case the one time host to device copy takes 4.2 milliseconds and the one time device to host copy takes 1.3 milliseconds, making the total copy time only 5.5 milliseconds irrespective of number of iterations in the inference loop.
+
+By leveraging CopyTensors \+ IOBinding, ONNX Runtime enables developers to perform EP-agnostic, GPU-accelerated inference with minimal runtime data transfer overhead — leading to improved performance and cleaner code design.
+
+See the next sample [../30_syncstreams_cuda/README.md](../30_syncstreams_cuda/README.md) on how to use explicit synchronization 
+using SyncStreams to optimize data transfers further.
+
+## Dependencies
+
+This sample vendors a copy of https://github.com/lvandeve/lodepng (Zlib license)
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/convert_to_fp16.py b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/convert_to_fp16.py
new file mode 100644
index 000000000..ca2e28aa2
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/convert_to_fp16.py
@@ -0,0 +1,6 @@
+import onnx
+
+from onnxconverter_common import float16
+model = onnx.load("candy.onnx")
+model_fp16 = float16.convert_float_to_float16(model)
+onnx.save(model_fp16, "candy_fp16.onnx")
\ No newline at end of file
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/half.hpp b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/half.hpp
new file mode 100644
index 000000000..d0a882dd6
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/half.hpp
@@ -0,0 +1,4601 @@
+// half - IEEE 754-based half-precision floating-point library.
+//
+// Copyright (c) 2012-2021 Christian Rau <rauy@users.sourceforge.net>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+// Version 2.2.0
+
+/// \file
+/// Main header file for half-precision functionality.
+
+#ifndef HALF_HALF_HPP
+#define HALF_HALF_HPP
+
+#define HALF_GCC_VERSION (__GNUC__*100+__GNUC_MINOR__)
+
+#if defined(__INTEL_COMPILER)
+	#define HALF_ICC_VERSION __INTEL_COMPILER
+#elif defined(__ICC)
+	#define HALF_ICC_VERSION __ICC
+#elif defined(__ICL)
+	#define HALF_ICC_VERSION __ICL
+#else
+	#define HALF_ICC_VERSION 0
+#endif
+
+// check C++11 language features
+#if defined(__clang__)										// clang
+	#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if __has_feature(cxx_thread_local) && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+#elif HALF_ICC_VERSION && defined(__INTEL_CXX11_MODE__)		// Intel C++
+	#if HALF_ICC_VERSION >= 1500 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if HALF_ICC_VERSION >= 1500 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if HALF_ICC_VERSION >= 1400 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if HALF_ICC_VERSION >= 1400 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if HALF_ICC_VERSION >= 1110 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if HALF_ICC_VERSION >= 1110 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+#elif defined(__GNUC__)										// gcc
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+		#if HALF_GCC_VERSION >= 408 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+			#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+		#endif
+		#if HALF_GCC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+			#define HALF_ENABLE_CPP11_USER_LITERALS 1
+		#endif
+		#if HALF_GCC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+			#define HALF_ENABLE_CPP11_CONSTEXPR 1
+		#endif
+		#if HALF_GCC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+			#define HALF_ENABLE_CPP11_NOEXCEPT 1
+		#endif
+		#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+			#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+		#endif
+		#if !defined(HALF_ENABLE_CPP11_LONG_LONG)
+			#define HALF_ENABLE_CPP11_LONG_LONG 1
+		#endif
+	#endif
+	#define HALF_TWOS_COMPLEMENT_INT 1
+#elif defined(_MSC_VER)										// Visual C++
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+	#define HALF_TWOS_COMPLEMENT_INT 1
+	#define HALF_POP_WARNINGS 1
+	#pragma warning(push)
+	#pragma warning(disable : 4099 4127 4146)	//struct vs class, constant in if, negative unsigned
+#endif
+
+// check C++11 library features
+#include <utility>
+#if defined(_LIBCPP_VERSION)								// libc++
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+		#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
+			#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CSTDINT
+			#define HALF_ENABLE_CPP11_CSTDINT 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CMATH
+			#define HALF_ENABLE_CPP11_CMATH 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_HASH
+			#define HALF_ENABLE_CPP11_HASH 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CFENV
+			#define HALF_ENABLE_CPP11_CFENV 1
+		#endif
+	#endif
+#elif defined(__GLIBCXX__)									// libstdc++
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+		#ifdef __clang__
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+				#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+				#define HALF_ENABLE_CPP11_CSTDINT 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH)
+				#define HALF_ENABLE_CPP11_CMATH 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH)
+				#define HALF_ENABLE_CPP11_HASH 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CFENV)
+				#define HALF_ENABLE_CPP11_CFENV 1
+			#endif
+		#else
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+				#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+				#define HALF_ENABLE_CPP11_CSTDINT 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH)
+				#define HALF_ENABLE_CPP11_CMATH 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH)
+				#define HALF_ENABLE_CPP11_HASH 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CFENV)
+				#define HALF_ENABLE_CPP11_CFENV 1
+			#endif
+		#endif
+	#endif
+#elif defined(_CPPLIB_VER)									// Dinkumware/Visual C++
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+		#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+	#endif
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+			#define HALF_ENABLE_CPP11_CSTDINT 1
+	#endif
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_HASH)
+		#define HALF_ENABLE_CPP11_HASH 1
+	#endif
+	#if _CPPLIB_VER >= 610 && !defined(HALF_ENABLE_CPP11_CMATH)
+		#define HALF_ENABLE_CPP11_CMATH 1
+	#endif
+	#if _CPPLIB_VER >= 610 && !defined(HALF_ENABLE_CPP11_CFENV)
+		#define HALF_ENABLE_CPP11_CFENV 1
+	#endif
+#endif
+#undef HALF_GCC_VERSION
+#undef HALF_ICC_VERSION
+
+// any error throwing C++ exceptions?
+#if defined(HALF_ERRHANDLING_THROW_INVALID) || defined(HALF_ERRHANDLING_THROW_DIVBYZERO) || defined(HALF_ERRHANDLING_THROW_OVERFLOW) || defined(HALF_ERRHANDLING_THROW_UNDERFLOW) || defined(HALF_ERRHANDLING_THROW_INEXACT)
+#define HALF_ERRHANDLING_THROWS 1
+#endif
+
+// any error handling enabled?
+#define HALF_ERRHANDLING	(HALF_ERRHANDLING_FLAGS||HALF_ERRHANDLING_ERRNO||HALF_ERRHANDLING_FENV||HALF_ERRHANDLING_THROWS)
+
+#if HALF_ERRHANDLING
+	#define HALF_UNUSED_NOERR(name) name
+#else
+	#define HALF_UNUSED_NOERR(name)
+#endif
+
+// support constexpr
+#if HALF_ENABLE_CPP11_CONSTEXPR
+	#define HALF_CONSTEXPR				constexpr
+	#define HALF_CONSTEXPR_CONST		constexpr
+	#if HALF_ERRHANDLING
+		#define HALF_CONSTEXPR_NOERR
+	#else
+		#define HALF_CONSTEXPR_NOERR	constexpr
+	#endif
+#else
+	#define HALF_CONSTEXPR
+	#define HALF_CONSTEXPR_CONST		const
+	#define HALF_CONSTEXPR_NOERR
+#endif
+
+// support noexcept
+#if HALF_ENABLE_CPP11_NOEXCEPT
+	#define HALF_NOEXCEPT	noexcept
+	#define HALF_NOTHROW	noexcept
+#else
+	#define HALF_NOEXCEPT
+	#define HALF_NOTHROW	throw()
+#endif
+
+// support thread storage
+#if HALF_ENABLE_CPP11_THREAD_LOCAL
+	#define HALF_THREAD_LOCAL	thread_local
+#else
+	#define HALF_THREAD_LOCAL	static
+#endif
+
+#include <utility>
+#include <algorithm>
+#include <istream>
+#include <ostream>
+#include <limits>
+#include <stdexcept>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <cstdlib>
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+	#include <type_traits>
+#endif
+#if HALF_ENABLE_CPP11_CSTDINT
+	#include <cstdint>
+#endif
+#if HALF_ERRHANDLING_ERRNO
+	#include <cerrno>
+#endif
+#if HALF_ENABLE_CPP11_CFENV
+	#include <cfenv>
+#endif
+#if HALF_ENABLE_CPP11_HASH
+	#include <functional>
+#endif
+
+
+#ifndef HALF_ENABLE_F16C_INTRINSICS
+	/// Enable F16C instruction set intrinsics.
+	/// Defining this to 1 enables the use of [F16C compiler intrinsics](https://en.wikipedia.org/wiki/F16C) for converting between 
+	/// half-precision and single-precision values which may result in improved performance. This will not perform additional checks 
+	/// for support of the F16C instruction set, so an appropriate target platform is required when enabling this feature.
+	///
+	/// Unless predefined it will be enabled automatically when the `__F16C__` symbol is defined, which some compilers do on supporting platforms.
+	#define HALF_ENABLE_F16C_INTRINSICS __F16C__
+#endif
+#if HALF_ENABLE_F16C_INTRINSICS
+	#include <immintrin.h>
+#endif
+
+#ifdef HALF_DOXYGEN_ONLY
+/// Type for internal floating-point computations.
+/// This can be predefined to a built-in floating-point type (`float`, `double` or `long double`) to override the internal 
+/// half-precision implementation to use this type for computing arithmetic operations and mathematical function (if available). 
+/// This can result in improved performance for arithmetic operators and mathematical functions but might cause results to 
+/// deviate from the specified half-precision rounding mode and inhibits proper detection of half-precision exceptions.
+#define HALF_ARITHMETIC_TYPE (undefined)
+
+/// Enable internal exception flags.
+/// Defining this to 1 causes operations on half-precision values to raise internal floating-point exception flags according to 
+/// the IEEE 754 standard. These can then be cleared and checked with clearexcept(), testexcept().
+#define HALF_ERRHANDLING_FLAGS	0
+
+/// Enable exception propagation to `errno`.
+/// Defining this to 1 causes operations on half-precision values to propagate floating-point exceptions to 
+/// [errno](https://en.cppreference.com/w/cpp/error/errno) from `<cerrno>`. Specifically this will propagate domain errors as 
+/// [EDOM](https://en.cppreference.com/w/cpp/error/errno_macros) and pole, overflow and underflow errors as 
+/// [ERANGE](https://en.cppreference.com/w/cpp/error/errno_macros). Inexact errors won't be propagated.
+#define HALF_ERRHANDLING_ERRNO	0
+
+/// Enable exception propagation to built-in floating-point platform.
+/// Defining this to 1 causes operations on half-precision values to propagate floating-point exceptions to the built-in 
+/// single- and double-precision implementation's exception flags using the 
+/// [C++11 floating-point environment control](https://en.cppreference.com/w/cpp/numeric/fenv) from `<cfenv>`. However, this 
+/// does not work in reverse and single- or double-precision exceptions will not raise the corresponding half-precision 
+/// exception flags, nor will explicitly clearing flags clear the corresponding built-in flags.
+#define HALF_ERRHANDLING_FENV	0
+
+/// Throw C++ exception on domain errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified message on domain errors.
+#define HALF_ERRHANDLING_THROW_INVALID		(undefined)
+
+/// Throw C++ exception on pole errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified message on pole errors.
+#define HALF_ERRHANDLING_THROW_DIVBYZERO	(undefined)
+
+/// Throw C++ exception on overflow errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::overflow_error](https://en.cppreference.com/w/cpp/error/overflow_error) with the specified message on overflows.
+#define HALF_ERRHANDLING_THROW_OVERFLOW		(undefined)
+
+/// Throw C++ exception on underflow errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::underflow_error](https://en.cppreference.com/w/cpp/error/underflow_error) with the specified message on underflows.
+#define HALF_ERRHANDLING_THROW_UNDERFLOW	(undefined)
+
+/// Throw C++ exception on rounding errors.
+/// Defining this to 1 causes operations on half-precision values to throw a 
+/// [std::range_error](https://en.cppreference.com/w/cpp/error/range_error) with the specified message on general rounding errors.
+#define HALF_ERRHANDLING_THROW_INEXACT		(undefined)
+#endif
+
+#ifndef HALF_ERRHANDLING_OVERFLOW_TO_INEXACT
+/// Raise INEXACT exception on overflow.
+/// Defining this to 1 (default) causes overflow errors to automatically raise inexact exceptions in addition.
+/// These will be raised after any possible handling of the overflow exception.
+#define HALF_ERRHANDLING_OVERFLOW_TO_INEXACT	1
+#endif
+
+#ifndef HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+/// Raise INEXACT exception on underflow.
+/// Defining this to 1 (default) causes underflow errors to automatically raise inexact exceptions in addition.
+/// These will be raised after any possible handling of the underflow exception.
+///
+/// **Note:** This will actually cause underflow (and the accompanying inexact) exceptions to be raised *only* when the result 
+/// is inexact, while if disabled bare underflow errors will be raised for *any* (possibly exact) subnormal result.
+#define HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT	1
+#endif
+
+/// Default rounding mode.
+/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and more precise types 
+/// (unless using half_cast() and specifying the rounding mode directly) as well as in arithmetic operations and mathematical 
+/// functions. It can be redefined (before including half.hpp) to one of the standard rounding modes using their respective 
+/// constants or the equivalent values of 
+/// [std::float_round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/float_round_style):
+///
+/// `std::float_round_style`         | value | rounding
+/// ---------------------------------|-------|-------------------------
+/// `std::round_indeterminate`       | -1    | fastest
+/// `std::round_toward_zero`         | 0     | toward zero
+/// `std::round_to_nearest`          | 1     | to nearest (default)
+/// `std::round_toward_infinity`     | 2     | toward positive infinity
+/// `std::round_toward_neg_infinity` | 3     | toward negative infinity
+///
+/// By default this is set to `1` (`std::round_to_nearest`), which rounds results to the nearest representable value. It can even 
+/// be set to [std::numeric_limits<float>::round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/round_style) to synchronize 
+/// the rounding mode with that of the built-in single-precision implementation (which is likely `std::round_to_nearest`, though).
+#ifndef HALF_ROUND_STYLE
+	#define HALF_ROUND_STYLE	1		// = std::round_to_nearest
+#endif
+
+/// Value signaling overflow.
+/// In correspondence with `HUGE_VAL[F|L]` from `<cmath>` this symbol expands to a positive value signaling the overflow of an 
+/// operation, in particular it just evaluates to positive infinity.
+///
+/// **See also:** Documentation for [HUGE_VAL](https://en.cppreference.com/w/cpp/numeric/math/HUGE_VAL)
+#define HUGE_VALH	std::numeric_limits<half_float::half>::infinity()
+
+/// Fast half-precision fma function.
+/// This symbol is defined if the fma() function generally executes as fast as, or faster than, a separate 
+/// half-precision multiplication followed by an addition, which is always the case.
+///
+/// **See also:** Documentation for [FP_FAST_FMA](https://en.cppreference.com/w/cpp/numeric/math/fma)
+#define FP_FAST_FMAH	1
+
+/// Half rounding mode.
+/// In correspondence with `FLT_ROUNDS` from `<cfloat>` this symbol expands to the rounding mode used for 
+/// half-precision operations. It is an alias for [HALF_ROUND_STYLE](\ref HALF_ROUND_STYLE).
+///
+/// **See also:** Documentation for [FLT_ROUNDS](https://en.cppreference.com/w/cpp/types/climits/FLT_ROUNDS)
+#define HLF_ROUNDS	HALF_ROUND_STYLE
+
+// Fallback definitions for the floating-point classification constants.
+// Each one is only defined when no definition is visible at this point, so
+// values already supplied by an included header take precedence.
+#ifndef FP_ILOGB0
+	#define FP_ILOGB0		INT_MIN
+#endif
+#ifndef FP_ILOGBNAN
+	#define FP_ILOGBNAN		INT_MAX
+#endif
+#ifndef FP_SUBNORMAL
+	#define FP_SUBNORMAL	0
+#endif
+#ifndef FP_ZERO
+	#define FP_ZERO			1
+#endif
+#ifndef FP_NAN
+	#define FP_NAN			2
+#endif
+#ifndef FP_INFINITE
+	#define FP_INFINITE		3
+#endif
+#ifndef FP_NORMAL
+	#define FP_NORMAL		4
+#endif
+
+// Fallback floating-point exception flag bits, defined only when
+// HALF_ENABLE_CPP11_CFENV is off and FE_ALL_EXCEPT is not already defined.
+// Each flag occupies its own bit so that flags can be combined with `|`.
+#if !HALF_ENABLE_CPP11_CFENV && !defined(FE_ALL_EXCEPT)
+	#define FE_INVALID		0x10
+	#define FE_DIVBYZERO	0x08
+	#define FE_OVERFLOW		0x04
+	#define FE_UNDERFLOW	0x02
+	#define FE_INEXACT		0x01
+	#define FE_ALL_EXCEPT	(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT)
+#endif
+
+
+/// Main namespace for half-precision functionality.
+/// This namespace contains all the functionality provided by the library.
+namespace half_float
+{
+	class half;
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+	/// Library-defined half-precision literals.
+	/// Import this namespace to enable half-precision floating-point literals:
+	/// ~~~~{.cpp}
+	/// using namespace half_float::literal;
+	/// half_float::half h = 4.2_h;
+	/// ~~~~
+	namespace literal
+	{
+		// Forward declaration only; the definition is not part of this section.
+		half operator "" _h(long double);
+	}
+#endif
+
+	/// \internal
+	/// \brief Implementation details.
+	namespace detail
+	{
+	// Minimal type-trait toolkit used for tag dispatching below. With
+	// HALF_ENABLE_CPP11_TYPE_TRAITS the helpers wrap their standard-library
+	// counterparts; otherwise equivalent hand-written versions are provided.
+	#if HALF_ENABLE_CPP11_TYPE_TRAITS
+		/// Conditional type.
+		/// Selects \a T if \a B is `true` and \a F otherwise.
+		template<bool B,typename T,typename F> struct conditional : std::conditional<B,T,F> {};
+
+		/// Helper for tag dispatching.
+		template<bool B> struct bool_type : std::integral_constant<bool,B> {};
+		using std::true_type;
+		using std::false_type;
+
+		/// Type traits for floating-point types.
+		template<typename T> struct is_float : std::is_floating_point<T> {};
+	#else
+		/// Conditional type.
+		/// Selects \a T if the condition is `true` (primary template) and \a F otherwise (specialization for `false`).
+		template<bool,typename T,typename> struct conditional { typedef T type; };
+		template<typename T,typename F> struct conditional<false,T,F> { typedef F type; };
+
+		/// Helper for tag dispatching.
+		template<bool> struct bool_type {};
+		typedef bool_type<true> true_type;
+		typedef bool_type<false> false_type;
+
+		/// Type traits for floating-point types.
+		/// Only `float`, `double` and `long double` (with any cv-qualification) are reported as floating-point.
+		template<typename> struct is_float : false_type {};
+		template<typename T> struct is_float<const T> : is_float<T> {};
+		template<typename T> struct is_float<volatile T> : is_float<T> {};
+		template<typename T> struct is_float<const volatile T> : is_float<T> {};
+		template<> struct is_float<float> : true_type {};
+		template<> struct is_float<double> : true_type {};
+		template<> struct is_float<long double> : true_type {};
+	#endif
+
+		/// Type traits for floating-point bits.
+		/// Maps a floating-point type to an unsigned integer type able to hold its bit pattern. The primary template 
+		/// yields `unsigned char`; cv-qualified types forward to the trait of the unqualified type. float2half() compares 
+		/// `sizeof(bits<T>::type)` with `sizeof(T)` when choosing between the bit-level and the generic conversion.
+		template<typename T> struct bits { typedef unsigned char type; };
+		template<typename T> struct bits<const T> : bits<T> {};
+		template<typename T> struct bits<volatile T> : bits<T> {};
+		template<typename T> struct bits<const volatile T> : bits<T> {};
+
+	// Fixed-minimum-width integer aliases and the bits<> specializations for
+	// float and double. With HALF_ENABLE_CPP11_CSTDINT the <cstdint> types are
+	// used; otherwise builtin integer types are chosen instead.
+	#if HALF_ENABLE_CPP11_CSTDINT
+		/// Unsigned integer of (at least) 16 bits width.
+		typedef std::uint_least16_t uint16;
+
+		/// Fastest unsigned integer of (at least) 32 bits width.
+		typedef std::uint_fast32_t uint32;
+
+		/// Fastest signed integer of (at least) 32 bits width.
+		typedef std::int_fast32_t int32;
+
+		/// Unsigned integer of (at least) 32 bits width.
+		template<> struct bits<float> { typedef std::uint_least32_t type; };
+
+		/// Unsigned integer of (at least) 64 bits width.
+		template<> struct bits<double> { typedef std::uint_least64_t type; };
+	#else
+		/// Unsigned integer of (at least) 16 bits width.
+		typedef unsigned short uint16;
+
+		/// Fastest unsigned integer of (at least) 32 bits width.
+		typedef unsigned long uint32;
+
+		/// Fastest signed integer of (at least) 32 bits width.
+		typedef long int32;
+
+		/// Unsigned integer of (at least) 32 bits width.
+		/// Uses `unsigned int` when it has at least 32 value bits, `unsigned long` otherwise.
+		template<> struct bits<float> : conditional<std::numeric_limits<unsigned int>::digits>=32,unsigned int,unsigned long> {};
+
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			/// Unsigned integer of (at least) 64 bits width.
+			/// Uses `unsigned long` when it has at least 64 value bits, `unsigned long long` otherwise.
+			template<> struct bits<double> : conditional<std::numeric_limits<unsigned long>::digits>=64,unsigned long,unsigned long long> {};
+		#else
+			/// Widest available unsigned integer when `long long` is unavailable.
+			/// NOTE(review): `unsigned long` is not guaranteed to be 64 bits wide here; float2half() checks the size 
+			/// before taking the bit-level double path.
+			template<> struct bits<double> { typedef unsigned long type; };
+		#endif
+	#endif
+
+	#ifdef HALF_ARITHMETIC_TYPE
+		/// Type to use for arithmetic computations and mathematic functions internally.
+		/// Only declared when the user defines `HALF_ARITHMETIC_TYPE`.
+		typedef HALF_ARITHMETIC_TYPE internal_t;
+	#endif
+
+		/// Tag type for binary construction.
+		/// Empty type used purely for overload selection.
+		struct binary_t {};
+
+		/// Tag for binary construction.
+		/// Constant instance of binary_t to pass wherever the tag is expected.
+		HALF_CONSTEXPR_CONST binary_t binary = binary_t();
+
+		/// \name Implementation defined classification and arithmetic
+		/// \{
+
+		/// Check for infinity.
+		/// Uses the best facility available at compile time: `std::isinf`, the MSVC runtime helpers, or a portable 
+		/// comparison against both infinities.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if infinity
+		/// \retval false else
+		template<typename T> bool builtin_isinf(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::isinf(arg);
+		#elif defined(_MSC_VER)
+			// Neither finite nor NaN leaves only the infinities.
+			return !::_finite(static_cast<double>(arg)) && !::_isnan(static_cast<double>(arg));
+		#else
+			return arg == std::numeric_limits<T>::infinity() || arg == -std::numeric_limits<T>::infinity();
+		#endif
+		}
+
+		/// Check for NaN.
+		/// Uses `std::isnan` when available, the MSVC runtime helper otherwise, and a self-comparison as last resort.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if not a number
+		/// \retval false else
+		template<typename T> bool builtin_isnan(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::isnan(arg);
+		#elif defined(_MSC_VER)
+			return ::_isnan(static_cast<double>(arg)) != 0;
+		#else
+			// NaN is the only value that compares unequal to itself.
+			return arg != arg;
+		#endif
+		}
+
+		/// Check sign.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if signbit set
+		/// \retval false else
+		template<typename T> bool builtin_signbit(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::signbit(arg);
+		#else
+			// Negative values are detected directly; for zero the sign is recovered from the sign of 1/arg,
+			// which is negative only for negative zero.
+			return arg < T() || (arg == T() && T(1)/arg < T());
+		#endif
+		}
+
+		/// Platform-independent sign mask.
+		/// Replicates the most significant bit of \a arg across the whole word.
+		/// \param arg integer value in two's complement
+		/// \retval -1 if \a arg negative
+		/// \retval 0 if \a arg positive
+		inline uint32 sign_mask(uint32 arg)
+		{
+			// N is the index of the most significant bit of uint32.
+			static const int N = std::numeric_limits<uint32>::digits - 1;
+		#if HALF_TWOS_COMPLEMENT_INT
+			// Signed right shift by N; presumably the macro indicates that this sign-extends on the platform.
+			return static_cast<int32>(arg) >> N;
+		#else
+			// Portable form: isolate the top bit and negate it in unsigned arithmetic (0 stays 0, 1 becomes all ones).
+			return -((arg>>N)&1);
+		#endif
+		}
+
+		/// Platform-independent arithmetic right shift.
+		/// \param arg integer value in two's complement
+		/// \param i shift amount (at most 31)
+		/// \return \a arg right shifted for \a i bits with possible sign extension
+		inline uint32 arithmetic_shift(uint32 arg, int i)
+		{
+		#if HALF_TWOS_COMPLEMENT_INT
+			return static_cast<int32>(arg) >> i;
+		#else
+			// Emulates the shift by a truncating division by 2^i and then subtracts the sign bit to round
+			// negative quotients toward negative infinity.
+			// NOTE(review): for negative values that are exact multiples of 2^i the division is already exact, so
+			// the subtraction appears to yield one less than a true arithmetic shift (e.g. -2 with i=1 gives -2,
+			// not -1) - confirm whether callers can hit this on platforms that take this branch.
+			return static_cast<int32>(arg)/(static_cast<int32>(1)<<i) - ((arg>>(std::numeric_limits<uint32>::digits-1))&1);
+		#endif
+		}
+
+		/// \}
+		/// \name Error handling
+		/// \{
+
+		/// Internal exception flags.
+		/// The flag word lives in a function-local variable that starts at 0; its storage duration is whatever 
+		/// `HALF_THREAD_LOCAL` expands to (defined outside this section - presumably `thread_local` or `static`).
+		/// \return reference to global exception flags
+		inline int& errflags() { HALF_THREAD_LOCAL int flags = 0; return flags; }
+
+		/// Raise floating-point exception.
+		/// Reports \a flags through every error-handling channel enabled at compile time, in this order: the internal 
+		/// flag word, `errno`, `std::feraiseexcept`, and finally C++ exceptions. The body is empty when 
+		/// `HALF_ERRHANDLING` is off.
+		/// \param flags exceptions to raise
+		/// \param cond condition to raise exceptions for
+		inline void raise(int HALF_UNUSED_NOERR(flags), bool HALF_UNUSED_NOERR(cond) = true)
+		{
+		#if HALF_ERRHANDLING
+			// Nothing to do when the caller's condition does not hold.
+			if(!cond)
+				return;
+		#if HALF_ERRHANDLING_FLAGS
+			// Accumulate into the library's own flag word.
+			errflags() |= flags;
+		#endif
+		#if HALF_ERRHANDLING_ERRNO
+			// Domain errors map to EDOM; pole and range errors map to ERANGE. A bare FE_INEXACT leaves errno alone.
+			if(flags & FE_INVALID)
+				errno = EDOM;
+			else if(flags & (FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW))
+				errno = ERANGE;
+		#endif
+		#if HALF_ERRHANDLING_FENV && HALF_ENABLE_CPP11_CFENV
+			std::feraiseexcept(flags);
+		#endif
+			// Each throw below uses the corresponding macro's value as the exception message. The first matching
+			// throw leaves the function, so later checks and the INEXACT chaining are skipped in that case.
+		#ifdef HALF_ERRHANDLING_THROW_INVALID
+			if(flags & FE_INVALID)
+				throw std::domain_error(HALF_ERRHANDLING_THROW_INVALID);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_DIVBYZERO
+			if(flags & FE_DIVBYZERO)
+				throw std::domain_error(HALF_ERRHANDLING_THROW_DIVBYZERO);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_OVERFLOW
+			if(flags & FE_OVERFLOW)
+				throw std::overflow_error(HALF_ERRHANDLING_THROW_OVERFLOW);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_UNDERFLOW
+			if(flags & FE_UNDERFLOW)
+				throw std::underflow_error(HALF_ERRHANDLING_THROW_UNDERFLOW);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_INEXACT
+			if(flags & FE_INEXACT)
+				throw std::range_error(HALF_ERRHANDLING_THROW_INEXACT);
+		#endif
+			// Underflow and overflow optionally imply INEXACT; it is raised through a recursive call so that it
+			// passes through the same channels, unless the caller already included FE_INEXACT.
+		#if HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+			if((flags & FE_UNDERFLOW) && !(flags & FE_INEXACT))
+				raise(FE_INEXACT);
+		#endif
+		#if HALF_ERRHANDLING_OVERFLOW_TO_INEXACT
+			if((flags & FE_OVERFLOW) && !(flags & FE_INEXACT))
+				raise(FE_INEXACT);
+		#endif
+		#endif
+		}
+
+		/// Check and signal for any NaN.
+		/// A value is NaN when its bits without the sign (`&0x7FFF`) are greater than the infinity pattern `0x7C00`.
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \retval true if either \a x or \a y is NaN
+		/// \retval false else
+		/// \exception FE_INVALID if \a x or \a y is NaN
+		inline HALF_CONSTEXPR_NOERR bool compsignal(unsigned int x, unsigned int y)
+		{
+		#if HALF_ERRHANDLING
+			// Any NaN (quiet or signaling) raises FE_INVALID here.
+			raise(FE_INVALID, (x&0x7FFF)>0x7C00 || (y&0x7FFF)>0x7C00);
+		#endif
+			return (x&0x7FFF) > 0x7C00 || (y&0x7FFF) > 0x7C00;
+		}
+
+		/// Signal and silence signaling NaN.
+		/// Bit `0x200` (the most significant mantissa bit) is treated as the quiet bit: a NaN without it is signaling.
+		/// \param nan half-precision NaN value
+		/// \return quiet NaN
+		/// \exception FE_INVALID if \a nan is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int nan)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_INVALID, !(nan&0x200));
+		#endif
+			// Setting the quiet bit keeps sign and payload of the input.
+			return nan | 0x200;
+		}
+
+		/// Signal and silence signaling NaNs.
+		/// Returns the quieted \a x if \a x is NaN, otherwise the quieted \a y (which is not itself checked for being 
+		/// NaN on the return path, so the caller is expected to pass at least one NaN).
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \return quiet NaN
+		/// \exception FE_INVALID if \a x or \a y is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y)
+		{
+		#if HALF_ERRHANDLING
+			// Signaling means: NaN (above 0x7C00 without sign) and quiet bit 0x200 clear.
+			raise(FE_INVALID, ((x&0x7FFF)>0x7C00 && !(x&0x200)) || ((y&0x7FFF)>0x7C00 && !(y&0x200)));
+		#endif
+			return ((x&0x7FFF)>0x7C00) ? (x|0x200) : (y|0x200);
+		}
+
+		/// Signal and silence signaling NaNs.
+		/// Returns the first NaN among \a x and \a y with its quiet bit set; if neither is NaN, the quieted \a z is 
+		/// returned without further checks (the caller is expected to pass at least one NaN).
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \param z third half-precision value to check
+		/// \return quiet NaN
+		/// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y, unsigned int z)
+		{
+		#if HALF_ERRHANDLING
+			// Signaling means: NaN (above 0x7C00 without sign) and quiet bit 0x200 clear.
+			raise(FE_INVALID, ((x&0x7FFF)>0x7C00 && !(x&0x200)) || ((y&0x7FFF)>0x7C00 && !(y&0x200)) || ((z&0x7FFF)>0x7C00 && !(z&0x200)));
+		#endif
+			return ((x&0x7FFF)>0x7C00) ? (x|0x200) : ((y&0x7FFF)>0x7C00) ? (y|0x200) : (z|0x200);
+		}
+
+		/// Select value or signaling NaN.
+		/// With error handling enabled a signaling NaN in \a y takes precedence and is returned quieted; without error 
+		/// handling \a y is ignored entirely.
+		/// \param x preferred half-precision value
+		/// \param y ignored half-precision value except for signaling NaN
+		/// \return quieted \a y if signaling NaN (and error handling is enabled), \a x otherwise
+		/// \exception FE_INVALID if \a y is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int select(unsigned int x, unsigned int HALF_UNUSED_NOERR(y))
+		{
+		#if HALF_ERRHANDLING
+			// signal() both raises FE_INVALID and sets the quiet bit.
+			return (((y&0x7FFF)>0x7C00) && !(y&0x200)) ? signal(y) : x;
+		#else
+			return x;
+		#endif
+		}
+
+		/// Raise domain error and return NaN.
+		/// \return quiet NaN (bit pattern `0x7FFF`)
+		/// \exception FE_INVALID
+		inline HALF_CONSTEXPR_NOERR unsigned int invalid()
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_INVALID);
+		#endif
+			return 0x7FFF;
+		}
+
+		/// Raise pole error and return infinity.
+		/// \param sign half-precision value with sign bit only
+		/// \return half-precision infinity with sign of \a sign
+		/// \exception FE_DIVBYZERO
+		inline HALF_CONSTEXPR_NOERR unsigned int pole(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_DIVBYZERO);
+		#endif
+			// 0x7C00 is the infinity bit pattern (exponent all ones, zero mantissa).
+			return sign | 0x7C00;
+		}
+
+		/// Check value for underflow.
+		/// The check is only compiled in when error handling is enabled and `HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT` is 
+		/// disabled; otherwise the function just passes \a arg through.
+		/// \param arg non-zero half-precision value to check
+		/// \return \a arg
+		/// \exception FE_UNDERFLOW if arg is subnormal
+		inline HALF_CONSTEXPR_NOERR unsigned int check_underflow(unsigned int arg)
+		{
+		#if HALF_ERRHANDLING && !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+			// A zero exponent field (mask 0x7C00) identifies a subnormal, given that arg is non-zero.
+			raise(FE_UNDERFLOW, !(arg&0x7C00));
+		#endif
+			return arg;
+		}
+
+		/// \}
+		/// \name Conversion and rounding
+		/// \{
+
+		/// Half-precision overflow.
+		/// Produces either infinity (`0x7C00`) or the largest finite magnitude (`0x7BFF`) with the given sign, 
+		/// depending on which of the two the rounding mode selects for a value beyond the finite range.
+		/// \tparam R rounding mode to use
+		/// \param sign half-precision value with sign bit only
+		/// \return rounded overflowing half-precision value
+		/// \exception FE_OVERFLOW
+		template<std::float_round_style R> HALF_CONSTEXPR_NOERR unsigned int overflow(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_OVERFLOW);
+		#endif
+			// sign>>15 is 1 for negative and 0 for positive, and nudges the result by one ulp:
+			//  toward +inf: positive -> 0x7C00 (+inf), negative -> 0xFBFF (largest finite negative)
+			//  toward -inf: positive -> 0x7BFF (largest finite), negative -> 0xFC00 (-inf)
+			//  toward zero: largest finite magnitude; any other mode: infinity
+			return	(R==std::round_toward_infinity) ? (sign+0x7C00-(sign>>15)) :
+					(R==std::round_toward_neg_infinity) ? (sign+0x7BFF+(sign>>15)) :
+					(R==std::round_toward_zero) ? (sign|0x7BFF) :
+					(sign|0x7C00);
+		}
+
+		/// Half-precision underflow.
+		/// Produces either a signed zero or the smallest subnormal magnitude (mantissa 1) with the given sign, 
+		/// depending on the rounding mode.
+		/// \tparam R rounding mode to use
+		/// \param sign half-precision value with sign bit only
+		/// \return rounded underflowing half-precision value
+		/// \exception FE_UNDERFLOW
+		template<std::float_round_style R> HALF_CONSTEXPR_NOERR unsigned int underflow(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_UNDERFLOW);
+		#endif
+			// sign>>15 is 1 for negative and 0 for positive:
+			//  toward +inf: positive -> 0x0001 (smallest subnormal), negative -> 0x8000 (-0)
+			//  toward -inf: positive -> 0x0000 (+0), negative -> 0x8001 (smallest negative subnormal)
+			//  any other mode: signed zero
+			return	(R==std::round_toward_infinity) ? (sign+1-(sign>>15)) :
+					(R==std::round_toward_neg_infinity) ? (sign+(sign>>15)) :
+					sign;
+		}
+
+		/// Round half-precision number.
+		/// Applies the rounding increment derived from the guard and sticky bits to an already truncated value. The 
+		/// increment is added to the whole bit pattern, so a mantissa carry propagates into the exponent field.
+		/// \tparam R rounding mode to use
+		/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results
+		/// \param value finite half-precision number to round
+		/// \param g guard bit (most significant discarded bit)
+		/// \param s sticky bit (or of all but the most significant discarded bits)
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded or \a I is `true`
+		template<std::float_round_style R,bool I> HALF_CONSTEXPR_NOERR unsigned int rounded(unsigned int value, int g, int s)
+		{
+		#if HALF_ERRHANDLING
+			// Rounding increment (0 or 1):
+			//  to nearest: guard set and (sticky set or result odd), i.e. ties go to even
+			//  toward +inf: any discarded bits and value positive
+			//  toward -inf: any discarded bits and value negative
+			value +=	(R==std::round_to_nearest) ? (g&(s|value)) :
+						(R==std::round_toward_infinity) ? (~(value>>15)&(g|s)) :
+						(R==std::round_toward_neg_infinity) ? ((value>>15)&(g|s)) : 0;
+			// Classify the rounded result by its exponent field: all ones means the result became infinite,
+			// non-zero means normal, zero means subnormal or zero.
+			if((value&0x7C00) == 0x7C00)
+				raise(FE_OVERFLOW);
+			else if(value & 0x7C00)
+				raise(FE_INEXACT, I || (g|s)!=0);
+			else
+				raise(FE_UNDERFLOW, !(HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT) || I || (g|s)!=0);
+			return value;
+		#else
+			// Same increments as above, written as a single expression.
+			return	(R==std::round_to_nearest) ? (value+(g&(s|value))) :
+					(R==std::round_toward_infinity) ? (value+(~(value>>15)&(g|s))) :
+					(R==std::round_toward_neg_infinity) ? (value+((value>>15)&(g|s))) :
+					value;
+		#endif
+		}
+
+		/// Round half-precision number to nearest integer value.
+		/// \tparam R rounding mode to use
+		/// \tparam E `true` for round to even, `false` for round away from zero
+		/// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it
+		/// \param value half-precision value to round
+		/// \return half-precision bits for nearest integral value
+		/// \exception FE_INVALID for signaling NaN
+		/// \exception FE_INEXACT if value had to be rounded and \a I is `true`
+		template<std::float_round_style R,bool E,bool I> unsigned int integral(unsigned int value)
+		{
+			unsigned int abs = value & 0x7FFF;
+			// Magnitude below 1.0 (0x3C00): the result is a signed zero or +-1.0. Each branch builds an all-ones
+			// or all-zero mask from its condition and uses it to select 0x3C00.
+			if(abs < 0x3C00)
+			{
+				// NOTE(review): zero also takes this branch, so FE_INEXACT is raised for +-0 when I is true even
+				// though no rounding occurs - confirm this is intended.
+				raise(FE_INEXACT, I);
+				// To nearest: 1.0 for magnitudes of at least 0.5 (0x3800), or strictly above 0.5 when ties go to even.
+				return ((R==std::round_to_nearest) ? (0x3C00&-static_cast<unsigned>(abs>=(0x3800+E))) :
+						(R==std::round_toward_infinity) ? (0x3C00&-(~(value>>15)&(abs!=0))) :
+						(R==std::round_toward_neg_infinity) ? (0x3C00&-static_cast<unsigned>(value>0x8000)) :
+						0) | (value&0x8000);
+			}
+			// From 0x6400 (1024.0) upwards every finite value is already integral; infinities pass through and
+			// NaNs are quieted (signaling ones raise FE_INVALID inside signal()).
+			if(abs >= 0x6400)
+				return (abs>0x7C00) ? signal(value) : value;
+			// exp is the number of fractional mantissa bits, mask selects exactly those bits.
+			unsigned int exp = 25 - (abs>>10), mask = (1<<exp) - 1;
+			raise(FE_INEXACT, I && (value&mask));
+			// Add a mode-dependent bias below the integer position, then clear the fractional bits.
+			return ((	(R==std::round_to_nearest) ? ((1<<(exp-1))-(~(value>>exp)&E)) :
+						(R==std::round_toward_infinity) ? (mask&((value>>15)-1)) :
+						(R==std::round_toward_neg_infinity) ? (mask&-(value>>15)) :
+						0) + value) & ~mask;
+		}
+
+		/// Convert fixed point to half-precision floating-point.
+		/// \tparam R rounding mode to use
+		/// \tparam F number of fractional bits in [11,31]
+		/// \tparam S `true` for signed, `false` for unsigned
+		/// \tparam N `true` for additional normalization step, `false` if already normalized to 1.F
+		/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results
+		/// \param m mantissa in Q1.F fixed point format
+		/// \param exp biased exponent - 1
+		/// \param sign half-precision value with sign bit only
+		/// \param s sticky bit (or of all but the most significant already discarded bits)
+		/// \return value converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded or \a I is `true`
+		template<std::float_round_style R,unsigned int F,bool S,bool N,bool I> unsigned int fixed2half(uint32 m, int exp = 14, unsigned int sign = 0, int s = 0)
+		{
+			if(S)
+			{
+				// Signed input: take the absolute value via the sign mask ((m^mask)-mask negates when the mask is
+				// all ones) and derive the half sign bit from it, overriding the sign argument.
+				uint32 msign = sign_mask(m);
+				m = (m^msign) - msign;
+				sign = msign & 0x8000;
+			}
+			if(N)
+				// Normalize: shift left until the integer bit (bit F) is set or the exponent reaches zero.
+				for(; m<(static_cast<uint32>(1)<<F) && exp; m<<=1,--exp) ;
+			else if(exp < 0)
+				// Negative exponent: shift the mantissa right by the extra -exp bits without adding an exponent
+				// field; guard and sticky are taken from the correspondingly larger discarded part.
+				return rounded<R,I>(sign+(m>>(F-10-exp)), (m>>(F-11-exp))&1, s|((m&((static_cast<uint32>(1)<<(F-11-exp))-1))!=0));
+			// Keep the top 11 bits (integer bit plus 10 fraction bits); the integer bit adds into the exponent
+			// field, which is why exp is passed as the biased exponent minus one.
+			return rounded<R,I>(sign+(exp<<10)+(m>>(F-10)), (m>>(F-11))&1, s|((m&((static_cast<uint32>(1)<<(F-11))-1))!=0));
+		}
+
+		/// Convert IEEE single-precision to half-precision.
+		/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+		/// Uses the F16C intrinsics when `HALF_ENABLE_F16C_INTRINSICS` is set; otherwise works on the raw bit pattern.
+		/// \tparam R rounding mode to use
+		/// \param value single-precision value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int float2half_impl(float value, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			// Map the rounding mode to the intrinsic's rounding control; any other mode uses the current direction.
+			return _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(value),
+				(R==std::round_to_nearest) ? _MM_FROUND_TO_NEAREST_INT :
+				(R==std::round_toward_zero) ? _MM_FROUND_TO_ZERO :
+				(R==std::round_toward_infinity) ? _MM_FROUND_TO_POS_INF :
+				(R==std::round_toward_neg_infinity) ? _MM_FROUND_TO_NEG_INF :
+				_MM_FROUND_CUR_DIRECTION));
+		#else
+			// Copy the bit pattern out of the float.
+			bits<float>::type fbits;
+			std::memcpy(&fbits, &value, sizeof(float));
+		#if 1
+			// Move the float sign (bit 31) to the half sign position (bit 15), then work on the magnitude.
+			unsigned int sign = (fbits>>16) & 0x8000;
+			fbits &= 0x7FFFFFFF;
+			// Infinity or NaN: NaNs keep the top 10 mantissa bits as payload and get the quiet bit 0x200 set.
+			if(fbits >= 0x7F800000)
+				return sign | 0x7C00 | ((fbits>0x7F800000) ? (0x200|((fbits>>13)&0x3FF)) : 0);
+			// 2^16 and above lies beyond the half range.
+			if(fbits >= 0x47800000)
+				return overflow<R>(sign);
+			// 2^-14 and above: normal half. Re-bias the exponent (127 -> 15, i.e. subtract 112) and keep the
+			// top 10 mantissa bits; bit 12 is the guard bit, the bits below form the sticky bit.
+			if(fbits >= 0x38800000)
+				return rounded<R,false>(sign|(((fbits>>23)-112)<<10)|((fbits>>13)&0x3FF), (fbits>>12)&1, (fbits&0xFFF)!=0);
+			// 2^-25 and above: subnormal half. Restore the implicit leading one and shift the mantissa right by
+			// an exponent-dependent amount.
+			if(fbits >= 0x33000000)
+			{
+				int i = 125 - (fbits>>23);
+				fbits = (fbits&0x7FFFFF) | 0x800000;
+				return rounded<R,false>(sign|(fbits>>(i+1)), (fbits>>i)&1, (fbits&((static_cast<uint32>(1)<<i)-1))!=0);
+			}
+			// Non-zero but below 2^-25: underflows to zero or the smallest subnormal depending on R.
+			if(fbits != 0)
+				return underflow<R>(sign);
+			return sign;
+		#else
+			// Alternative table-driven implementation; not compiled because of the `#if 1` above.
+			static const uint16 base_table[512] = {
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 
+				0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 
+				0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7C00, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 
+				0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 
+				0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFC00 };
+			static const unsigned char shift_table[256] = {
+				24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
+				13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };
+			// sexp is sign plus exponent (index into base_table), exp the exponent alone (index into shift_table).
+			int sexp = fbits >> 23, exp = sexp & 0xFF, i = shift_table[exp];
+			fbits &= 0x7FFFFF;
+			// m is the mantissa with the implicit bit restored for non-zero exponents, zeroed for exponent 0xFF.
+			uint32 m = (fbits|((exp!=0)<<23)) & -static_cast<uint32>(exp!=0xFF);
+			return rounded<R,false>(base_table[sexp]+(fbits>>i), (m>>(i-1))&1, (((static_cast<uint32>(1)<<(i-1))-1)&m)!=0);
+		#endif
+		#endif
+		}
+
+		/// Convert IEEE double-precision to half-precision.
+		/// Works on the raw bit pattern split into its upper (`hi`) and lower (`lo`) 32 bits; all range decisions are 
+		/// made on `hi`, while `lo` only contributes to the sticky bit and the zero test.
+		/// \tparam R rounding mode to use
+		/// \param value double-precision value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int float2half_impl(double value, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			// The intrinsic path goes through single precision first, so it is only taken when the rounding mode
+			// is indeterminate.
+			if(R == std::round_indeterminate)
+				return _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_cvtpd_ps(_mm_set_sd(value)), _MM_FROUND_CUR_DIRECTION));
+		#endif
+			bits<double>::type dbits;
+			std::memcpy(&dbits, &value, sizeof(double));
+			uint32 hi = dbits >> 32, lo = dbits & 0xFFFFFFFF;
+			// Move the double sign (bit 31 of hi) to the half sign position (bit 15), then work on the magnitude.
+			unsigned int sign = (hi>>16) & 0x8000;
+			hi &= 0x7FFFFFFF;
+			// Infinity or NaN: NaNs (non-zero 52-bit mantissa) keep the top 10 mantissa bits as payload and get
+			// the quiet bit 0x200 set.
+			if(hi >= 0x7FF00000)
+				return sign | 0x7C00 | ((dbits&0xFFFFFFFFFFFFF) ? (0x200|((hi>>10)&0x3FF)) : 0);
+			// 2^16 and above lies beyond the half range.
+			if(hi >= 0x40F00000)
+				return overflow<R>(sign);
+			// 2^-14 and above: normal half. Re-bias the exponent (1023 -> 15, i.e. subtract 1008) and keep the
+			// top 10 mantissa bits; bit 9 of hi is the guard bit, everything below is sticky.
+			if(hi >= 0x3F100000)
+				return rounded<R,false>(sign|(((hi>>20)-1008)<<10)|((hi>>10)&0x3FF), (hi>>9)&1, ((hi&0x1FF)|lo)!=0);
+			// 2^-25 and above: subnormal half. Restore the implicit leading one and shift the mantissa right by
+			// an exponent-dependent amount.
+			if(hi >= 0x3E600000)
+			{
+				int i = 1018 - (hi>>20);
+				hi = (hi&0xFFFFF) | 0x100000;
+				return rounded<R,false>(sign|(hi>>(i+1)), (hi>>i)&1, ((hi&((static_cast<uint32>(1)<<i)-1))|lo)!=0);
+			}
+			// Non-zero but below 2^-25: underflows to zero or the smallest subnormal depending on R.
+			if((hi|lo) != 0)
+				return underflow<R>(sign);
+			return sign;
+		}
+
+		/// Convert non-IEEE floating-point to half-precision.
+		/// Generic fallback that uses only value-level operations (`frexp`, `ldexp`, `modf`) and never inspects the 
+		/// bit pattern of \a value. float2half() selects it whenever the bit-level overloads do not apply.
+		/// \tparam R rounding mode to use
+		/// \tparam T source type (builtin floating-point type)
+		/// \param value floating-point value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int float2half_impl(T value, ...)
+		{
+			// Start with the sign bit only and handle the special values first.
+			unsigned int hbits = static_cast<unsigned>(builtin_signbit(value)) << 15;
+			if(value == T())
+				return hbits;
+			if(builtin_isnan(value))
+				return hbits | 0x7FFF;
+			if(builtin_isinf(value))
+				return hbits | 0x7C00;
+			// Only the exponent is needed here: |value| lies in [2^(exp-1), 2^exp).
+			int exp;
+			std::frexp(value, &exp);
+			if(exp > 16)
+				return overflow<R>(hbits);
+			if(exp < -13)
+				// Subnormal half range: scale so that the integer part holds the subnormal mantissa plus one
+				// extra low bit for the guard.
+				value = std::ldexp(value, 25);
+			else
+			{
+				// Normal range: scale to a 12-bit integer part (implicit one, 10 mantissa bits, guard bit). The
+				// stored exponent is one too small because the implicit one is added into it below.
+				value = std::ldexp(value, 12-exp);
+				hbits |= ((exp+13)<<10);
+			}
+			// The integer part provides mantissa and guard bit, a non-zero fraction the sticky bit.
+			T ival, frac = std::modf(value, &ival);
+			int m = std::abs(static_cast<int>(ival));
+			return rounded<R,false>(hbits+(m>>1), m&1, frac!=T());
+		}
+
+		/// Convert floating-point to half-precision.
+		/// Dispatches by tag: when \a T is an IEC 559 type and `bits<T>::type` has exactly the size of \a T, the tag 
+		/// is the `true` one and matches the bit-level overloads for `float` and `double`; in every other case only 
+		/// the generic overload with the `...` parameter is viable.
+		/// \tparam R rounding mode to use
+		/// \tparam T source type (builtin floating-point type)
+		/// \param value floating-point value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int float2half(T value)
+		{
+			return float2half_impl<R>(value, bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)>());
+		}
+
+		/// Convert integer to half-precision floating-point.
+		/// \tparam R rounding mode to use
+		/// \tparam T type to convert (builtin integer type)
+		/// \param value integral value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int int2half(T value)
+		{
+			// The local `bits` starts out as the sign bit only.
+			unsigned int bits = static_cast<unsigned>(value<0) << 15;
+			if(!value)
+				return bits;
+			// Continue with the magnitude.
+			// NOTE(review): negating the most negative value of a signed T overflows - confirm callers avoid it.
+			if(bits)
+				value = -value;
+			// Magnitudes above 0xFFFF are treated as overflow right away; the remaining ones fit into 16 bits.
+			if(value > 0xFFFF)
+				return overflow<R>(bits);
+			// Normalize the mantissa into [0x400,0x7FF] (11 bits including the leading one), adjusting exp to
+			// match; exp==24 corresponds to a magnitude that already was in that range.
+			unsigned int m = static_cast<unsigned int>(value), exp = 24;
+			for(; m<0x400; m<<=1,--exp) ;
+			for(; m>0x7FF; m>>=1,++exp) ;
+			// The leading one of m (0x400) adds into the exponent field.
+			bits |= (exp<<10) + m;
+			// Bits were only shifted out when exp grew beyond 24; in that case round using the highest discarded
+			// bit as guard and the rest as sticky.
+			return (exp>24) ? rounded<R,false>(bits, (value>>(exp-25))&1, (((1<<(exp-25))-1)&value)!=0) : bits;
+		}
+
+		/// Convert half-precision to IEEE single-precision.
+		/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+		/// One of three code paths is selected at preprocessing time: intrinsics when
+		/// `HALF_ENABLE_F16C_INTRINSICS` is set, otherwise a table lookup (the bit-manipulation
+		/// variant under `#if 0` is compiled out).
+		/// The table path indexes the 64-entry tables with `value>>10`, so \a value is expected to
+		/// fit in 16 bits (NOTE(review): not checked here - confirm callers guarantee it).
+		/// \param value half-precision value to convert
+		/// \return single-precision value
+		inline float half2float_impl(unsigned int value, float, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			// intrinsic path: move the bits into a vector register, convert, extract the first lane
+			return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(value)));
+		#else
+		#if 0
+			// disabled alternative: builds the single-precision pattern with shifts instead of tables
+			bits<float>::type fbits = static_cast<bits<float>::type>(value&0x8000) << 16;
+			int abs = value & 0x7FFF;
+			if(abs)
+			{
+				fbits |= 0x38000000 << static_cast<unsigned>(abs>=0x7C00);
+				for(; abs<0x400; abs<<=1,fbits-=0x800000) ;
+				fbits += static_cast<bits<float>::type>(abs) << 13;
+			}
+		#else
+			// mantissa_table: indexed by offset_table[value>>10] + (value&0x3FF).
+			// Entries 0..1023 are used when offset_table yields 0, entries 1024..2047 when it yields 1024.
+			static const bits<float>::type mantissa_table[2048] = {
+				0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, 
+				0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 
+				0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 
+				0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 
+				0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, 
+				0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, 
+				0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 
+				0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 
+				0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, 
+				0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, 
+				0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 
+				0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 
+				0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 
+				0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, 
+				0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, 
+				0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 
+				0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 
+				0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, 
+				0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, 
+				0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 
+				0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 
+				0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 
+				0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, 
+				0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, 
+				0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 
+				0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 
+				0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, 
+				0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, 
+				0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 
+				0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 
+				0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 
+				0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, 
+				0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, 
+				0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, 
+				0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 
+				0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, 
+				0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, 
+				0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 
+				0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 
+				0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 
+				0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, 
+				0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, 
+				0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 
+				0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 
+				0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, 
+				0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, 
+				0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 
+				0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 
+				0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 
+				0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, 
+				0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, 
+				0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 
+				0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 
+				0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, 
+				0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, 
+				0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 
+				0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 
+				0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 
+				0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, 
+				0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, 
+				0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 
+				0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 
+				0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, 
+				0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, 
+				0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 
+				0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 
+				0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 
+				0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, 
+				0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, 
+				0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 
+				0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 
+				0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, 
+				0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, 
+				0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 
+				0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 
+				0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 
+				0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, 
+				0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, 
+				0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 
+				0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 
+				0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, 
+				0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, 
+				0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 
+				0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 
+				0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 
+				0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, 
+				0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, 
+				0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 
+				0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 
+				0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, 
+				0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, 
+				0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 
+				0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 
+				0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 
+				0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, 
+				0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, 
+				0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 
+				0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 
+				0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, 
+				0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, 
+				0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 
+				0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 
+				0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 
+				0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, 
+				0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, 
+				0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 
+				0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 
+				0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, 
+				0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, 
+				0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 
+				0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 
+				0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 
+				0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, 
+				0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, 
+				0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 
+				0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 
+				0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, 
+				0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, 
+				0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 
+				0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 
+				0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 
+				0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, 
+				0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, 
+				0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 
+				0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 
+				0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, 
+				0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, 
+				0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };
+			// exponent_table: indexed by value>>10; entries 32..63 repeat entries 0..31 with the top bit set
+			static const bits<float>::type exponent_table[64] = {
+				0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, 
+				0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 
+				0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 
+				0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };
+			// offset_table: start index into mantissa_table for each value>>10; 0 for indices 0 and 32, 1024 otherwise
+			static const unsigned short offset_table[64] = {
+				0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 
+				0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };
+			// result bits = mantissa contribution (chosen via offset + low 10 bits) + contribution of the upper bits
+			bits<float>::type fbits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];
+		#endif
+			// copy the assembled bit pattern into a float object via memcpy
+			float out;
+			std::memcpy(&out, &fbits, sizeof(float));
+			return out;
+		#endif
+		}
+
+		/// Convert half-precision to IEEE double-precision.
+		/// Without F16C intrinsics the upper 32 bits of the double are assembled from sign,
+		/// re-biased exponent and mantissa; the lower 32 bits are always zero.
+		/// \param value half-precision value to convert
+		/// \return double-precision value
+		inline double half2float_impl(unsigned int value, double, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			return _mm_cvtsd_f64(_mm_cvtps_pd(_mm_cvtph_ps(_mm_cvtsi32_si128(value))));
+		#else
+			unsigned int magnitude = value & 0x7FFF;
+			// sign bit moves from bit 15 of the half to bit 31 of the upper word
+			uint32 upper = static_cast<uint32>(value&0x8000) << 16;
+			if(magnitude != 0)
+			{
+				// exponent offset; doubled when all half exponent bits are set (infinity/NaN patterns)
+				upper |= (magnitude >= 0x7C00) ? (0x3F000000 << 1) : 0x3F000000;
+				// shift small magnitudes up until bit 10 is set, lowering the exponent once per shift
+				while(magnitude < 0x400)
+				{
+					magnitude <<= 1;
+					upper -= 0x100000;
+				}
+				upper += static_cast<uint32>(magnitude) << 10;
+			}
+			double result;
+			bits<double>::type pattern = static_cast<bits<double>::type>(upper) << 32;
+			std::memcpy(&result, &pattern, sizeof(double));
+			return result;
+		#endif
+		}
+
+		/// Convert half-precision to non-IEEE floating-point.
+		/// Generic fallback that builds the result arithmetically with std::ldexp() instead of
+		/// manipulating the bit pattern of \a T.
+		/// NaN patterns map to a signaling NaN (when \a T has one and bit 0x200 is clear), else a quiet NaN,
+		/// else `T()`; infinity maps to `infinity()` or, if \a T has none, to `max()`.
+		/// \tparam T type to convert to (builtin floating-point type)
+		/// \param value half-precision value to convert
+		/// \return floating-point value
+		template<typename T> T half2float_impl(unsigned int value, T, ...)
+		{
+			T out;
+			unsigned int abs = value & 0x7FFF;
+			if(abs > 0x7C00)
+				out = (std::numeric_limits<T>::has_signaling_NaN && !(abs&0x200)) ? std::numeric_limits<T>::signaling_NaN() :
+					std::numeric_limits<T>::has_quiet_NaN ? std::numeric_limits<T>::quiet_NaN() : T();
+			else if(abs == 0x7C00)
+				out = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max();
+			else if(abs > 0x3FF)
+				// exponent is computed as a signed int: the unsigned form wraps around for biased exponents
+				// below 25 and would rely on an implementation-defined conversion back to int
+				out = std::ldexp(static_cast<T>((abs&0x3FF)|0x400), static_cast<int>(abs>>10)-25);
+			else
+				// no implicit leading bit: the 10 mantissa bits are scaled by 2^-24
+				out = std::ldexp(static_cast<T>(abs), -24);
+			return (value&0x8000) ? -out : out;
+		}
+
+		/// Convert half-precision bits to a floating-point value.
+		/// Forwards to the matching half2float_impl() overload, selected by a compile-time tag
+		/// that is true when \a T is an IEC 559 type whose bit-representation type has the same size as \a T.
+		/// \tparam T type to convert to (builtin floating-point type)
+		/// \param value half-precision value to convert
+		/// \return floating-point value
+		template<typename T> T half2float(unsigned int value)
+		{
+			typedef bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)> native_ieee_tag;
+			return half2float_impl(value, T(), native_ieee_tag());
+		}
+
+		/// Convert half-precision floating-point to integer.
+		/// Infinity and NaN raise FE_INVALID and saturate to the limits of \a T. Magnitudes below 0.5
+		/// (`abs < 0x3800`) are resolved directly from the rounding mode; everything else is
+		/// converted by shifting the 11-bit significand and adding a rounding-mode dependent bias.
+		/// \tparam R rounding mode to use
+		/// \tparam E `true` for round to even, `false` for round away from zero
+		/// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it
+		/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
+		/// \param value half-precision value to convert
+		/// \return rounded integer value
+		/// \exception FE_INVALID if value is not representable in type \a T
+		/// \exception FE_INEXACT if value had to be rounded and \a I is `true`
+		template<std::float_round_style R,bool E,bool I,typename T> T half2int(unsigned int value)
+		{
+			unsigned int abs = value & 0x7FFF;
+			if(abs >= 0x7C00)
+			{
+				raise(FE_INVALID);
+				return (value&0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
+			}
+			if(abs < 0x3800)
+			{
+				// +-0 converts exactly, so only non-zero magnitudes count as inexact
+				// NOTE(review): assumes the second argument of raise() is a "raise only if true" condition - confirm
+				raise(FE_INEXACT, I && abs!=0);
+				return	(R==std::round_toward_infinity) ? T(~(value>>15)&(abs!=0)) :
+						(R==std::round_toward_neg_infinity) ? -T(value>0x8000) :
+						T();
+			}
+			// signed arithmetic: biased exponents above 25 must yield a negative shift count here
+			int exp = 25 - static_cast<int>(abs>>10);
+			unsigned int m = (value&0x3FF) | 0x400;
+			// exp<=0: pure left shift; otherwise add the rounding bias for mode R before shifting right
+			int32 i = static_cast<int32>((exp<=0) ? (m<<-exp) : ((m+(
+				(R==std::round_to_nearest) ? ((1<<(exp-1))-(~(m>>exp)&E)) :
+				(R==std::round_toward_infinity) ? (((1<<exp)-1)&((value>>15)-1)) :
+				(R==std::round_toward_neg_infinity) ? (((1<<exp)-1)&-(value>>15)) : 0))>>exp));
+			if((!std::numeric_limits<T>::is_signed && (value&0x8000)) || (std::numeric_limits<T>::digits<16 &&
+				((value&0x8000) ? (-i<std::numeric_limits<T>::min()) : (i>std::numeric_limits<T>::max()))))
+				raise(FE_INVALID);
+			else if(I && exp > 0 && (m&((1<<exp)-1)))
+				raise(FE_INEXACT);
+			return static_cast<T>((value&0x8000) ? -i : i);
+		}
+
+		/// \}
+		/// \name Mathematics
+		/// \{
+
+		/// Upper half of a 32x32 bit multiplication, built from 16-bit partial products.
+		/// \tparam R rounding mode to use
+		/// \param x first factor
+		/// \param y second factor
+		/// \return upper 32 bit of \a x * \a y
+		template<std::float_round_style R> uint32 mulhi(uint32 x, uint32 y)
+		{
+			const uint32 xhi = x >> 16, xlo = x & 0xFFFF;
+			const uint32 yhi = y >> 16, ylo = y & 0xFFFF;
+			const uint32 hilo = xhi * ylo, lohi = xlo * yhi;
+			// low halves of the cross products plus the high half of the low product
+			const uint32 carry = (hilo&0xFFFF) + (lohi&0xFFFF) + ((xlo*ylo)>>16);
+			uint32 upper = xhi*yhi + (hilo>>16) + (lohi>>16) + (carry>>16);
+			if(R == std::round_to_nearest)
+				upper += (carry>>15) & 1;
+			else if(R == std::round_toward_infinity)
+				upper += (carry&0xFFFF) != 0;
+			return upper;
+		}
+
+		/// High word of a 32x32-bit product.
+		/// \param x,y the two factors
+		/// \return upper 32 bit of \a x * \a y rounded to nearest
+		inline uint32 multiply64(uint32 x, uint32 y)
+		{
+		#if !HALF_ENABLE_CPP11_LONG_LONG
+			return mulhi<std::round_to_nearest>(x, y);
+		#else
+			const unsigned long long biased = static_cast<unsigned long long>(x)*static_cast<unsigned long long>(y) + 0x80000000;
+			return static_cast<uint32>(biased>>32);
+		#endif
+		}
+
+		/// Division of a 32-bit value scaled up by 2^32.
+		/// \param x upper 32 bit of dividend
+		/// \param y divisor
+		/// \param s receives the sticky bit for rounding
+		/// \return (\a x << 32) / \a y
+		inline uint32 divide64(uint32 x, uint32 y, int &s)
+		{
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			const unsigned long long dividend = static_cast<unsigned long long>(x) << 32;
+			s = (dividend%y!=0);
+			return static_cast<uint32>(dividend/y);
+		#else
+			const uint32 halved = y >> 1;
+			uint32 rest = x, quot = 0;
+			for(unsigned int step=32; step; --step)
+			{
+				const bool fits = rest >= halved;
+				if(fits)
+					rest -= halved;
+				quot = (quot<<1) | static_cast<uint32>(fits);
+				rest <<= 1;
+			}
+			s = rest > 1;
+			return quot;
+		#endif
+		}
+
+		/// Half precision positive modulus, computed by long division on the packed operands.
+		/// \tparam Q `true` to compute full quotient, `false` else
+		/// \tparam R `true` to compute signed remainder, `false` for positive remainder
+		/// \param x first operand as positive finite half-precision value
+		/// \param y second operand as positive finite half-precision value
+		/// \param quo address to store quotient at, `nullptr` if \a Q `false`
+		/// \return modulus of \a x / \a y
+		template<bool Q,bool R> unsigned int mod(unsigned int x, unsigned int y, int *quo = NULL)
+		{
+			unsigned int q = 0;
+			if(x > y)	// the long division below only runs when x exceeds y
+			{
+				int absx = x, absy = y, expx = 0, expy = 0;
+				for(; absx<0x400; absx<<=1,--expx) ;	// normalize a subnormal x
+				for(; absy<0x400; absy<<=1,--expy) ;	// normalize a subnormal y
+				expx += absx >> 10;
+				expy += absy >> 10;
+				int mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400;	// 11-bit significands with explicit leading one
+				for(int d=expx-expy; d; --d)	// one shift-and-subtract step per unit of exponent difference
+				{
+					if(!Q && mx == my)	// exact multiple and no quotient requested: remainder is 0
+						return 0;
+					if(mx >= my)
+					{
+						mx -= my;
+						q += Q;
+					}
+					mx <<= 1;
+					q <<= static_cast<int>(Q);	// quotient only advances when Q is set
+				}
+				if(!Q && mx == my)
+					return 0;
+				if(mx >= my)	// final step at equal exponents
+				{
+					mx -= my;
+					++q;
+				}
+				if(Q)
+				{
+					q &= (1<<(std::numeric_limits<int>::digits-1)) - 1;	// keep only the low-order quotient bits
+					if(!mx)
+						return *quo = q, 0;	// exact division: store quotient, remainder is 0
+				}
+				for(; mx<0x400; mx<<=1,--expy) ;	// renormalize the remainder
+				x = (expy>0) ? ((expy<<10)|(mx&0x3FF)) : (mx>>(1-expy));	// repack as normal or subnormal half bits
+			}
+			if(R)	// signed remainder requested
+			{
+				unsigned int a, b;	// compared below as 2*x against y, or x against y/2
+				if(y < 0x800)
+				{
+					a = (x<0x400) ? (x<<1) : (x+0x400);	// double x: shift a subnormal, else bump the exponent
+					b = y;
+				}
+				else
+				{
+					a = x;
+					b = y - 0x400;	// halve y by decrementing its exponent
+				}
+				if(a > b || (a == b && (q&1)))	// remainder above half of y, or a tie with odd quotient
+				{
+					int exp = (y>>10) + (y<=0x3FF), d = exp - (x>>10) - (x<=0x3FF);
+					int m = (((y&0x3FF)|((y>0x3FF)<<10))<<1) - (((x&0x3FF)|((x>0x3FF)<<10))<<(1-d));	// significand of y - x with one extra bit
+					for(; m<0x800 && exp>1; m<<=1,--exp) ;
+					x = 0x8000 + ((exp-1)<<10) + (m>>1);	// result carries the sign bit
+					q += Q;
+				}
+			}
+			if(Q)
+				*quo = q;
+			return x;
+		}
+
+		/// Bitwise fixed point square root.
+		/// \tparam F number of fractional bits
+		/// \param r radicand in Q1.F fixed point format, left holding the unconsumed remainder
+		/// \param exp exponent, halved on return (an odd exponent is first folded into \a r)
+		/// \return square root as Q1.F/2
+		template<unsigned int F> uint32 sqrt(uint32 &r, int &exp)
+		{
+			const int odd = exp & 1;
+			exp = (exp-odd) / 2;
+			r <<= odd;
+			uint32 root = 0;
+			for(uint32 bit=static_cast<uint32>(1)<<F; bit; bit>>=2)
+			{
+				const uint32 trial = root + bit;
+				root >>= 1;
+				if(r >= trial)
+				{
+					r -= trial;
+					root += bit;
+				}
+			}
+			return root;
+		}
+
+		/// Fixed point binary exponential.
+		/// Implemented with the BKM algorithm in E-mode.
+		/// \param m exponent in [0,1) as Q0.31
+		/// \param n number of iterations (at most 32)
+		/// \return 2 ^ \a m as Q1.31
+		inline uint32 exp2(uint32 m, unsigned int n = 32)
+		{
+			static const uint32 logs[] = {
+				0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, 0x016FE50B,
+				0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, 0x0002E2A3, 0x00017153,
+				0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, 0x000005C5, 0x000002E3, 0x00000171,
+				0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, 0x0000000C, 0x00000006, 0x00000003, 0x00000001 };
+			uint32 result = 0x80000000, used = 0;
+			if(m == 0)
+				return result;
+			for(unsigned int k=1; k<n; ++k)
+			{
+				// take the factor (1 + 2^-k) only while the summed table entries stay within m
+				const uint32 trial = used + logs[k];
+				if(trial > m)
+					continue;
+				used = trial;
+				result += result >> k;
+			}
+			return result;
+		}
+
+		/// Fixed point binary logarithm.
+		/// Implemented with the BKM algorithm in L-mode.
+		/// \param m mantissa in [1,2) as Q1.30
+		/// \param n number of iterations (at most 32)
+		/// \return log2(\a m) as Q0.31
+		inline uint32 log2(uint32 m, unsigned int n = 32)
+		{
+			static const uint32 logs[] = {
+				0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, 0x016FE50B,
+				0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, 0x0002E2A3, 0x00017153,
+				0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, 0x000005C5, 0x000002E3, 0x00000171,
+				0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, 0x0000000C, 0x00000006, 0x00000003, 0x00000001 };
+			uint32 approx = 0x40000000, sum = 0;
+			if(m == approx)
+				return sum;
+			for(unsigned int k=1; k<n; ++k)
+			{
+				// scale the approximation by (1 + 2^-k) while it does not overshoot m
+				const uint32 trial = approx + (approx>>k);
+				if(trial > m)
+					continue;
+				approx = trial;
+				sum += logs[k];
+			}
+			return sum;
+		}
+
+		/// Fixed point sine and cosine.
+		/// Implemented with the CORDIC algorithm in rotation mode.
+		/// \param mz angle in [-pi/2,pi/2] as Q1.30
+		/// \param n number of iterations (at most 31)
+		/// \return sine and cosine of \a mz as Q1.30
+		inline std::pair<uint32,uint32> sincos(uint32 mz, unsigned int n = 31)
+		{
+			static const uint32 angles[] = {
+				0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, 0x007FFF55,
+				0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000,
+				0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100, 0x00000080,
+				0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001 };
+			uint32 cosine = 0x26DD3B6A, sine = 0;	// the x start value is about 0.60725 in Q1.30
+			for(unsigned int k=0; k<n; ++k)
+			{
+				const uint32 neg = sign_mask(mz);
+				const uint32 turned = cosine - (arithmetic_shift(sine, k)^neg) + neg;
+				sine = sine + (arithmetic_shift(cosine, k)^neg) - neg;
+				cosine = turned; mz -= (angles[k]^neg) - neg;
+			}
+			return std::make_pair(sine, cosine);
+		}
+
+		/// Fixed point arc tangent.
+		/// Implemented with the CORDIC algorithm in vectoring mode.
+		/// \param my y coordinate as Q0.30
+		/// \param mx x coordinate as Q0.30
+		/// \param n number of iterations (at most 31)
+		/// \return arc tangent of \a my / \a mx as Q1.30
+		inline uint32 atan2(uint32 my, uint32 mx, unsigned int n = 31)
+		{
+			static const uint32 angles[] = {
+				0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, 0x007FFF55,
+				0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000,
+				0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100, 0x00000080,
+				0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001 };
+			uint32 total = 0;	// accumulated rotation angle
+			for(unsigned int k=0; k<n; ++k)
+			{
+				const uint32 neg = sign_mask(my);
+				const uint32 turned = mx + (arithmetic_shift(my, k)^neg) - neg;
+				my = my - (arithmetic_shift(mx, k)^neg) + neg;
+				mx = turned; total += (angles[k]^neg) - neg;
+			}
+			return total;
+		}
+
+		/// Reduce argument for trigonometric functions.
+		/// \param abs half-precision floating-point value
+		/// \param k value to take quarter period
+		/// \return \a abs reduced to [-pi/4,pi/4] as Q0.30
+		inline uint32 angle_arg(unsigned int abs, int &k)
+		{
+			uint32 m = (abs&0x3FF) | ((abs>0x3FF)<<10);	// significand; the leading one is set only for normal numbers
+			int exp = (abs>>10) + (abs<=0x3FF) - 15;	// unbiased exponent (subnormals are treated as exponent field 1)
+			if(abs < 0x3A48)	// 0x3A48 encodes about 0.785, i.e. just below pi/4: no reduction needed
+				return k = 0, m << (exp+20);
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			unsigned long long y = m * 0xA2F9836E4E442, mask = (1ULL<<(62-exp)) - 1, yi = (y+(mask>>1)) & ~mask, f = y - yi;	// constant holds the leading hex digits of 2/pi; yi is y rounded to whole periods, f the signed rest
+			uint32 sign = -static_cast<uint32>(f>>63);	// all ones when the rest is negative
+			k = static_cast<int>(yi>>(62-exp));
+			return (multiply64(static_cast<uint32>((sign ? -f : f)>>(31-exp)), 0xC90FDAA2)^sign) - sign;	// scale |f| by the leading bits of pi, then reapply the sign
+		#else
+			uint32 yh = m*0xA2F98 + mulhi<std::round_toward_zero>(m, 0x36E4E442), yl = (m*0x36E4E442) & 0xFFFFFFFF;	// same product split into high and low 32-bit words
+			uint32 mask = (static_cast<uint32>(1)<<(30-exp)) - 1, yi = (yh+(mask>>1)) & ~mask, sign = -static_cast<uint32>(yi>yh);
+			k = static_cast<int>(yi>>(30-exp));
+			uint32 fh = (yh^sign) + (yi^~sign) - ~sign, fl = (yl^sign) - sign;	// high and low words of the absolute rest
+			return (multiply64((exp>-1) ? (((fh<<(1+exp))&0xFFFFFFFF)|((fl&0xFFFFFFFF)>>(31-exp))) : fh, 0xC90FDAA2)^sign) - sign;
+		#endif
+		}
+
+		/// Get arguments for atan2 function.
+		/// \param abs half-precision floating-point value
+		/// \return \a abs and sqrt(1 - \a abs^2) as Q0.30
+		inline std::pair<uint32,uint32> atan2_args(unsigned int abs)
+		{
+			int exp = -15;
+			for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal input
+			exp += abs >> 10;	// unbiased exponent
+			uint32 my = ((abs&0x3FF)|0x400) << 5, r = my * my;	// significand widened to 16 bits and its square
+			int rexp = 2 * exp;
+			r = 0x40000000 - ((rexp>-31) ? ((r>>-rexp)|((r&((static_cast<uint32>(1)<<-rexp)-1))!=0)) : 1);	// 1 - abs^2, shifted-out bits folded into a sticky bit
+			for(rexp=0; r<0x40000000; r<<=1,--rexp) ;	// renormalize the difference
+			uint32 mx = sqrt<30>(r, rexp);	// r is left holding the square root's remainder
+			int d = exp - rexp;	// exponent gap between the two results
+			if(d < 0)
+				return std::make_pair((d<-14) ? ((my>>(-d-14))+((my>>(-d-15))&1)) : (my<<(14+d)), (mx<<14)+(r<<13)/mx);	// remainder/root term refines the square root
+			if(d > 0)
+				return std::make_pair(my<<14, (d>14) ? ((mx>>(d-14))+((mx>>(d-15))&1)) : ((d==14) ? mx : ((mx<<(14-d))+(r<<(13-d))/mx)));
+			return std::make_pair(my<<13, (mx<<13)+(r<<12)/mx);
+		}
+
+		/// Get exponentials for hyperbolic computation.
+		/// \param abs half-precision floating-point value
+		/// \param exp variable to take unbiased exponent of larger result
+		/// \param n number of BKM iterations (at most 32)
+		/// \return exp(\a abs) and exp(-\a abs) as Q1.31 with same exponent
+		inline std::pair<uint32,uint32> hyperbolic_args(unsigned int abs, int &exp, unsigned int n = 32)
+		{
+			uint32 mx = detail::multiply64(static_cast<uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29), my;	// significand times 0xB8AA3B29 (about 1.4427 = log2(e) as Q1.31)
+			int e = (abs>>10) + (abs<=0x3FF);	// biased exponent, subnormals treated as 1
+			if(e < 14)
+			{
+				exp = 0;	// no integer part: everything goes into the fraction
+				mx >>= 14 - e;
+			}
+			else
+			{
+				exp = mx >> (45-e);	// integer part of the base-2 exponent
+				mx = (mx<<(e-14)) & 0x7FFFFFFF;	// remaining fraction as Q0.31
+			}
+			mx = exp2(mx, n);
+			int d = exp << 1, s;	// d: exponent gap between the two returned values
+			if(mx > 0x80000000)
+			{
+				my = divide64(0x80000000, mx, s);	// reciprocal of the mantissa
+				my |= s;	// fold the sticky bit into the lowest bit
+				++d;
+			}
+			else
+				my = mx;
+			return std::make_pair(mx, (d<31) ? ((my>>d)|((my&((static_cast<uint32>(1)<<d)-1))!=0)) : 1);	// align the smaller value, keeping lost bits as sticky
+		}
+
+		/// Postprocessing for binary exponential.
+		/// \tparam R rounding mode to use
+		/// \param m fractional part of exponent as Q0.31
+		/// \param exp absolute value of unbiased exponent
+		/// \param esign sign of actual exponent
+		/// \param sign sign bit of result
+		/// \param n number of BKM iterations (at most 32)
+		/// \return value converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int exp2_post(uint32 m, int exp, bool esign, unsigned int sign = 0, unsigned int n = 32)
+		{
+			if(esign)
+			{
+				exp = -exp - (m!=0);	// negative exponent: step one further down when a fraction is present
+				if(exp < -25)
+					return underflow<R>(sign);
+				else if(exp == -25)
+					return rounded<R,false>(sign, 1, m!=0);
+			}
+			else if(exp > 15)
+				return overflow<R>(sign);
+			if(!m)	// no fraction: the result is an exact power of two
+				return sign | (((exp+=15)>0) ? (exp<<10) : check_underflow(0x200>>-exp));
+			m = exp2(m, n);
+			int s = 0;
+			if(esign)
+				m = divide64(0x80000000, m, s);	// take the reciprocal for negative exponents, s receives the sticky bit
+			return fixed2half<R,31,false,false,true>(m, exp+14, sign, s);
+		}
+
+		/// Postprocessing for binary logarithm.
+		/// \tparam R rounding mode to use
+		/// \tparam L logarithm for base transformation as Q1.31
+		/// \param m fractional part of logarithm as Q0.31
+		/// \param ilog signed integer part of logarithm
+		/// \param exp biased exponent of result
+		/// \param sign sign bit of result
+		/// \return value base-transformed and converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,uint32 L> unsigned int log2_post(uint32 m, int ilog, int exp, unsigned int sign = 0)
+		{
+			uint32 msign = sign_mask(ilog);
+			m = (((static_cast<uint32>(ilog)<<27)+(m>>4))^msign) - msign;	// combine integer and fractional part, then take the magnitude
+			if(!m)
+				return 0;
+			for(; m<0x80000000; m<<=1,--exp) ;	// normalize so the top bit is set
+			int i = m >= L, s;
+			exp += i;
+			m >>= 1 + i;	// pre-shift so the quotient below fits
+			sign ^= msign & 0x8000;	// a negative logarithm flips the result sign
+			if(exp < -11)
+				return underflow<R>(sign);
+			m = divide64(m, L, s);	// base change by dividing through L
+			return fixed2half<R,30,false,false,true>(m, exp, sign, 1);	// sticky argument is the constant 1; s from divide64 is not forwarded
+		}
+
+		/// Hypotenuse square root and postprocessing.
+		/// \tparam R rounding mode to use
+		/// \param r mantissa as Q2.30
+		/// \param exp biased exponent
+		/// \return square root converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int hypot_post(uint32 r, int exp)
+		{
+			int i = r >> 31;	// 1 when the top bit of the mantissa is set
+			if((exp+=i) > 46)
+				return overflow<R>();
+			if(exp < -34)
+				return underflow<R>();
+			r = (r>>i) | (r&i);	// shift down by one if needed, keeping the dropped bit as sticky
+			uint32 m = sqrt<30>(r, exp+=15);	// r is left holding the remainder, exp is halved
+			return fixed2half<R,15,false,false,false>(m, exp-1, 0, r!=0);	// a non-zero remainder marks the result as inexact
+		}
+
+		/// Division and postprocessing for tangents.
+		/// \tparam R rounding mode to use
+		/// \param my dividend as Q1.31
+		/// \param mx divisor as Q1.31
+		/// \param exp biased exponent of result
+		/// \param sign sign bit of result
+		/// \return quotient converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R> unsigned int tangent_post(uint32 my, uint32 mx, int exp, unsigned int sign = 0)
+		{
+			int i = my >= mx, s;	// i is 1 when the quotient reaches at least 1
+			exp += i;
+			if(exp > 29)
+				return overflow<R>(sign);
+			if(exp < -11)
+				return underflow<R>(sign);
+			uint32 m = divide64(my>>(i+1), mx, s);	// pre-shift the dividend; s receives the sticky bit
+			return fixed2half<R,30,false,false,true>(m, exp, sign, s);
+		}
+
+		/// Area function and postprocessing.
+		/// This computes the value directly in Q2.30 using the representation `asinh|acosh(x) = log(x+sqrt(x^2+|-1))`.
+		/// \tparam R rounding mode to use
+		/// \tparam S `true` for asinh, `false` for acosh
+		/// \param arg half-precision argument
+		/// \return asinh|acosh(\a arg) converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,bool S> unsigned int area(unsigned int arg)
+		{
+			int abs = arg & 0x7FFF, expx = (abs>>10) + (abs<=0x3FF) - 15, expy = -15, ilog, i;	// expx: unbiased exponent of the argument
+			uint32 mx = static_cast<uint32>((abs&0x3FF)|((abs>0x3FF)<<10)) << 20, my, r;	// argument significand in fixed point
+			for(; abs<0x400; abs<<=1,--expy) ;	// normalize a subnormal argument
+			expy += abs >> 10;
+			r = ((abs&0x3FF)|0x400) << 5;
+			r *= r;	// squared significand
+			i = r >> 31;
+			expy = 2*expy + i;	// exponent of the square
+			r >>= i;
+			if(S)	// asinh: form x^2 + 1
+			{
+				if(expy < 0)
+				{
+					r = 0x40000000 + ((expy>-30) ? ((r>>-expy)|((r&((static_cast<uint32>(1)<<-expy)-1))!=0)) : 1);	// align x^2 to the 1, keeping lost bits as sticky
+					expy = 0;
+				}
+				else
+				{
+					r += 0x40000000 >> expy;	// align the 1 to x^2
+					i = r >> 31;
+					r = (r>>i) | (r&i);
+					expy += i;
+				}
+			}
+			else	// acosh: form x^2 - 1
+			{
+				r -= 0x40000000 >> expy;
+				for(; r<0x40000000; r<<=1,--expy) ;	// renormalize after the subtraction
+			}
+			my = sqrt<30>(r, expy);	// r is left holding the remainder, expy is halved
+			my = (my<<15) + (r<<14)/my;	// widen the root and refine it with the remainder
+			if(S)
+			{
+				mx >>= expy - expx;	// align x to the square root's exponent
+				ilog = expy;
+			}
+			else
+			{
+				my >>= expx - expy;	// align the square root to x's exponent
+				ilog = expx;
+			}
+			my += mx;	// x + sqrt(x^2 +|- 1)
+			i = my >> 31;
+			static const int G = S && (R==std::round_to_nearest);	// extra iteration and bias used only for asinh with round-to-nearest
+			return log2_post<R,0xB8AA3B2A>(log2(my>>i, 26+S+G)+(G<<3), ilog+i, 17, arg&(static_cast<unsigned>(S)<<15));	// asinh keeps the argument's sign, acosh is unsigned
+		}
+
+		/// Class for 1.31 unsigned floating-point computation: a 32-bit mantissa paired with an integer exponent.
+		struct f31
+		{
+			/// Constructor.
+			/// \param mant mantissa as 1.31
+			/// \param e exponent
+			HALF_CONSTEXPR f31(uint32 mant, int e) : m(mant), exp(e) {}
+
+			/// Constructor from packed half bits.
+			/// \param abs unsigned half-precision value
+			f31(unsigned int abs) : exp(-15)
+			{
+				for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal input
+				m = static_cast<uint32>((abs&0x3FF)|0x400) << 21;	// significand with leading one moved to bit 31
+				exp += (abs>>10);	// unbiased exponent
+			}
+
+			/// Addition operator.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a + \a b
+			friend f31 operator+(f31 a, f31 b)
+			{
+				if(b.exp > a.exp)
+					std::swap(a, b);	// make a the operand with the larger exponent
+				int d = a.exp - b.exp;
+				uint32 m = a.m + ((d<32) ? (b.m>>d) : 0);	// align b; it vanishes entirely for a gap of 32 or more
+				int i = (m&0xFFFFFFFF) < a.m;	// 1 when the 32-bit sum wrapped around
+				return f31(((m+i)>>i)|0x80000000, a.exp+i);	// shift the carry back in and restore the leading one
+			}
+
+			/// Subtraction operator.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a - \a b
+			friend f31 operator-(f31 a, f31 b)
+			{
+				int d = a.exp - b.exp, exp = a.exp;	// NOTE(review): no swap here, presumably callers pass a >= b - confirm
+				uint32 m = a.m - ((d<32) ? (b.m>>d) : 0);
+				if(!m)
+					return f31(0, -32);	// representation used for an exact zero result
+				for(; m<0x80000000; m<<=1,--exp) ;	// renormalize so the top bit is set
+				return f31(m, exp);
+			}
+
+			/// Multiplication operator.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a * \a b
+			friend f31 operator*(f31 a, f31 b)
+			{
+				uint32 m = multiply64(a.m, b.m);	// upper 32 bits of the mantissa product
+				int i = m >> 31;	// 1 when the product already has its top bit set
+				return f31(m<<(1-i), a.exp + b.exp + i);
+			}
+
+			/// Division operator.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a / \a b
+			friend f31 operator/(f31 a, f31 b)
+			{
+				int i = a.m >= b.m, s;	// s receives the sticky bit, which is not used afterwards
+				uint32 m = divide64((a.m+i)>>i, b.m, s);	// halve the dividend first when it is not smaller than the divisor
+				return f31(m, a.exp - b.exp + i - 1);
+			}
+
+			uint32 m;			///< mantissa as 1.31.
+			int exp;			///< exponent.
+		};
+
+		/// Error function and postprocessing.
+		/// This computes the value directly in Q1.31 using the approximations given
+		/// [here](https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions).
+		/// \tparam R rounding mode to use
+		/// \tparam C `true` for complementary error function, `false` else
+		/// \param arg half-precision function argument
+		/// \return approximated value of error function in half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,bool C> unsigned int erf(unsigned int arg)
+		{
+			unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000;
+			f31 x(abs), x2 = x * x * f31(0xB8AA3B29, 0), t = f31(0x80000000, 0) / (f31(0x80000000, 0)+f31(0xA7BA054A, -2)*x), t2 = t * t;	// x2 = x^2 * 0xB8AA3B29 (about log2(e)); t = 1 / (1 + p*x) with p about 0.3276
+			f31 e = ((f31(0x87DC2213, 0)*t2+f31(0xB5F0E2AE, 0))*t2+f31(0x82790637, -2)-(f31(0xBA00E2B8, 0)*t2+f31(0x91A98E62, -2))*t) * t /	// polynomial in t ...
+					((x2.exp<0) ? f31(exp2((x2.exp>-32) ? (x2.m>>-x2.exp) : 0, 30), 0) : f31(exp2((x2.m<<x2.exp)&0x7FFFFFFF, 22), x2.m>>(31-x2.exp)));	// ... divided by 2^x2
+			return (!C || sign) ? fixed2half<R,31,false,true,true>(0x80000000-(e.m>>(C-e.exp)), 14+C, sign&(C-1U)) :	// subtract e from a fixed-point one; the sign is kept only when C is false
+					(e.exp<-25) ? underflow<R>() : fixed2half<R,30,false,false,true>(e.m>>1, e.exp+14, 0, e.m&1);	// complementary function of a non-negative argument: e itself
+		}
+
+		/// Gamma function and postprocessing.
+		/// This approximates the value of either the gamma function or its logarithm directly in Q1.31.
+		/// \tparam R rounding mode to use
+		/// \tparam L `true` for logarithm of gamma function, `false` for gamma function
+		/// \param arg half-precision floating-point value
+		/// \return lgamma/tgamma(\a arg) in half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if \a arg is not a positive integer
+		template<std::float_round_style R,bool L> unsigned int gamma(unsigned int arg)
+		{
+/*			static const double p[] ={ 2.50662827563479526904, 225.525584619175212544, -268.295973841304927459, 80.9030806934622512966, -5.00757863970517583837, 0.0114684895434781459556 };
+			double t = arg + 4.65, s = p[0];
+			for(unsigned int i=0; i<5; ++i)
+				s += p[i+1] / (arg+i);
+			return std::log(s) + (arg-0.5)*std::log(t) - t;
+*/			static const f31 pi(0xC90FDAA2, 1), lbe(0xB8AA3B29, 0);	// pi and (about) log2(e); the commented-out block above is the floating-point reference formula
+			unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000;
+			bool bsign = sign != 0;
+			f31 z(abs), x = sign ? (z+f31(0x80000000, 0)) : z, t = x + f31(0x94CCCCCD, 2), s =	// negative arguments are evaluated at |arg| + 1; t = x + 4.65
+				f31(0xA06C9901, 1) + f31(0xBBE654E2, -7)/(x+f31(0x80000000, 2)) + f31(0xA1CE6098, 6)/(x+f31(0x80000000, 1))
+				+ f31(0xE1868CB7, 7)/x - f31(0x8625E279, 8)/(x+f31(0x80000000, 0)) - f31(0xA03E158F, 2)/(x+f31(0xC0000000, 1));	// series s = p0 + sum of p[i+1]/(x+i)
+			int i = (s.exp>=2) + (s.exp>=4) + (s.exp>=8) + (s.exp>=16);	// number of bits needed for the exponent of s
+			s = f31((static_cast<uint32>(s.exp)<<(31-i))+(log2(s.m>>1, 28)>>i), i) / lbe;	// log2(s) converted to natural logarithm
+			if(x.exp != -1 || x.m != 0x80000000)	// skipped when x is exactly 0.5, where the (x - 0.5) * log(t) term vanishes
+			{
+				i = (t.exp>=2) + (t.exp>=4) + (t.exp>=8);
+				f31 l = f31((static_cast<uint32>(t.exp)<<(31-i))+(log2(t.m>>1, 30)>>i), i) / lbe;	// natural logarithm of t
+				s = (x.exp<-1) ? (s-(f31(0x80000000, -1)-x)*l) : (s+(x-f31(0x80000000, -1))*l);	// add (x - 0.5) * log(t), ordered to keep f31 values unsigned
+			}
+			s = x.exp ? (s-t) : (t-s);	// subtract t; for x in [1,2) the operands are swapped and the sign is tracked separately
+			if(bsign)	// negative argument: prepare the sine term used with pi below
+			{
+				if(z.exp >= 0)
+				{
+					sign &= (L|((z.m>>(31-z.exp))&1)) - 1;	// result sign depends on the parity of the integer part (cleared for lgamma)
+					for(z=f31((z.m<<(1+z.exp))&0xFFFFFFFF, -1); z.m<0x80000000; z.m<<=1,--z.exp) ;	// keep only the fractional part of z
+				}
+				if(z.exp == -1)
+					z = f31(0x80000000, 0) - z;	// fold fractions of at least 0.5 to 1 - z
+				if(z.exp < -1)
+				{
+					z = z * pi;
+					z.m = sincos(z.m>>(1-z.exp), 30).first;	// sine of pi * z
+					for(z.exp=1; z.m<0x80000000; z.m<<=1,--z.exp) ;
+				}
+				else
+					z = f31(0x80000000, 0);
+			}
+			if(L)	// logarithm of gamma requested
+			{
+				if(bsign)
+				{
+					f31 l(0x92868247, 0);	// about 1.1447, i.e. log(pi)
+					if(z.exp < 0)
+					{
+						uint32 m = log2((z.m+1)>>1, 27);
+						z = f31(-((static_cast<uint32>(z.exp)<<26)+(m>>5)), 5);	// magnitude of log2 of the sine term
+						for(; z.m<0x80000000; z.m<<=1,--z.exp) ;
+						l = l + z / lbe;
+					}
+					sign = static_cast<unsigned>(x.exp&&(l.exp<s.exp||(l.exp==s.exp&&l.m<s.m))) << 15;	// negative when s outweighs l
+					s = sign ? (s-l) : x.exp ? (l-s) : (l+s);
+				}
+				else
+				{
+					sign = static_cast<unsigned>(x.exp==0) << 15;	// negative result for x in [1,2)
+					if(s.exp < -24)
+						return underflow<R>(sign);
+					if(s.exp > 15)
+						return overflow<R>(sign);
+				}
+			}
+			else	// gamma itself: exponentiate the logarithm
+			{
+				s = s * lbe;	// back to a base-2 exponent
+				uint32 m;
+				if(s.exp < 0)
+				{
+					m = s.m >> -s.exp;
+					s.exp = 0;
+				}
+				else
+				{
+					m = (s.m<<s.exp) & 0x7FFFFFFF;	// fractional part of the exponent
+					s.exp = (s.m>>(31-s.exp));	// integer part of the exponent
+				}
+				s.m = exp2(m, 27);
+				if(!x.exp)
+					s = f31(0x80000000, 0) / s;	// undo the swapped subtraction for x in [1,2)
+				if(bsign)
+				{
+					if(z.exp < 0)
+						s = s * z;
+					s = pi / s;	// divide pi by the product of the sine term and the value at |arg| + 1
+					if(s.exp < -24)
+						return underflow<R>(sign);
+				}
+				else if(z.exp > 0 && !(z.m&((1<<(31-z.exp))-1)))	// positive integer argument: pack the result directly, no rounding or FE_INEXACT
+					return ((s.exp+14)<<10) + (s.m>>21);
+				if(s.exp > 15)
+					return overflow<R>(sign);
+			}
+			return fixed2half<R,31,false,false,true>(s.m, s.exp+14, sign);
+		}
+		/// \}
+
+		template<typename,typename,std::float_round_style> struct half_caster;
+	}
+
+	/// Half-precision floating-point type.
+	/// This class implements an IEEE-conformant half-precision floating-point type with the usual arithmetic
+	/// operators and conversions. It is implicitly convertible to single-precision floating-point, which makes arithmetic
+	/// expressions and functions with mixed-type operands to be of the most precise operand type.
+	///
+	/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and
+	/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which
+	/// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the
+	/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be of
+	/// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most
+	/// probably not involve any additional "magic" or padding beyond the simple binary representation of the underlying 16-bit
+	/// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if
+	/// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on
+	/// nearly any reasonable platform.
+	///
+	/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable
+	/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation.
+	class half
+	{
+	public:
+		/// \name Construction and assignment
+		/// \{
+
+		/// Default constructor.
+		/// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
+		/// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
+		HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
+
+		/// Conversion constructor (explicit, so a float never turns into a half silently).
+		/// \param rhs float to convert
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		explicit half(float rhs) : data_(static_cast<detail::uint16>(detail::float2half<round_style>(rhs))) {}
+	
+		/// Conversion to single-precision (implicit).
+		/// \return single precision value representing expression value
+		operator float() const { return detail::half2float<float>(data_); }
+
+		/// Assignment operator.
+		/// \param rhs single-precision value to copy from
+		/// \return reference to this half
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		half& operator=(float rhs) { data_ = static_cast<detail::uint16>(detail::float2half<round_style>(rhs)); return *this; }
+
+		/// \}
+		/// \name Arithmetic updates
+		/// \{
+
+		/// Arithmetic assignment.
+		/// Equivalent to `*this = *this + rhs`.
+		/// \param rhs half expression to add
+		/// \return reference to this half
+		/// \exception FE_... according to operator+(half,half)
+		half& operator+=(half rhs) { return *this = *this + rhs; }
+
+		/// Arithmetic assignment.
+		/// Equivalent to `*this = *this - rhs`.
+		/// \param rhs half expression to subtract
+		/// \return reference to this half
+		/// \exception FE_... according to operator-(half,half)
+		half& operator-=(half rhs) { return *this = *this - rhs; }
+
+		/// Arithmetic assignment.
+		/// Equivalent to `*this = *this * rhs`.
+		/// \param rhs half expression to multiply with
+		/// \return reference to this half
+		/// \exception FE_... according to operator*(half,half)
+		half& operator*=(half rhs) { return *this = *this * rhs; }
+
+		/// Arithmetic assignment.
+		/// Equivalent to `*this = *this / rhs`.
+		/// \param rhs half expression to divide by
+		/// \return reference to this half
+		/// \exception FE_... according to operator/(half,half)
+		half& operator/=(half rhs) { return *this = *this / rhs; }
+
+		/// Arithmetic assignment; the result is stored through operator=(float).
+		/// \param rhs single-precision value to add
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator+=(float rhs) { return *this = *this + rhs; }
+
+		/// Arithmetic assignment; the result is stored through operator=(float).
+		/// \param rhs single-precision value to subtract
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator-=(float rhs) { return *this = *this - rhs; }
+
+		/// Arithmetic assignment; the result is stored through operator=(float).
+		/// \param rhs single-precision value to multiply with
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator*=(float rhs) { return *this = *this * rhs; }
+
+		/// Arithmetic assignment; the result is stored through operator=(float).
+		/// \param rhs single-precision value to divide by
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator/=(float rhs) { return *this = *this / rhs; }
+
+		/// \}
+		/// \name Increment and decrement
+		/// \{
+
+		/// Prefix increment.
+		/// \return incremented half value
+		/// \exception FE_... according to operator+(half,half)
+		half& operator++() { return *this = *this + half(detail::binary, 0x3C00); }	// 0x3C00 is the bit pattern of 1.0
+
+		/// Prefix decrement.
+		/// \return decremented half value
+		/// \exception FE_... according to operator+(half,half), since this adds -1
+		half& operator--() { return *this = *this + half(detail::binary, 0xBC00); }	// 0xBC00 is the bit pattern of -1.0
+
+		/// Postfix increment.
+		/// \return non-incremented half value
+		/// \exception FE_... according to operator+(half,half)
+		half operator++(int) { half out(*this); ++*this; return out; }
+
+		/// Postfix decrement.
+		/// \return non-decremented half value
+		/// \exception FE_... according to operator+(half,half), since the prefix form adds -1
+		half operator--(int) { half out(*this); --*this; return out; }
+		/// \}
+	
+	private:
+		/// Rounding mode to use, taken from the HALF_ROUND_STYLE macro
+		static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE);
+
+		/// Constructor from raw bits (the tag argument selects this overload).
+		/// \param bits binary representation to set half to
+		HALF_CONSTEXPR half(detail::binary_t, unsigned int bits) HALF_NOEXCEPT : data_(static_cast<detail::uint16>(bits)) {}
+
+		/// Internal binary representation
+		detail::uint16 data_;
+
+	#ifndef HALF_DOXYGEN_ONLY
+		friend HALF_CONSTEXPR_NOERR bool operator==(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator!=(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator<(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator>(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator<=(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator>=(half, half);
+		friend HALF_CONSTEXPR half operator-(half);
+		friend half operator+(half, half);
+		friend half operator-(half, half);
+		friend half operator*(half, half);
+		friend half operator/(half, half);
+		template<typename charT,typename traits> friend std::basic_ostream<charT,traits>& operator<<(std::basic_ostream<charT,traits>&, half);
+		template<typename charT,typename traits> friend std::basic_istream<charT,traits>& operator>>(std::basic_istream<charT,traits>&, half&);
+		friend HALF_CONSTEXPR half fabs(half);
+		friend half fmod(half, half);
+		friend half remainder(half, half);
+		friend half remquo(half, half, int*);
+		friend half fma(half, half, half);
+		friend HALF_CONSTEXPR_NOERR half fmax(half, half);
+		friend HALF_CONSTEXPR_NOERR half fmin(half, half);
+		friend half fdim(half, half);
+		friend half nanh(const char*);
+		friend half exp(half);
+		friend half exp2(half);
+		friend half expm1(half);
+		friend half log(half);
+		friend half log10(half);
+		friend half log2(half);
+		friend half log1p(half);
+		friend half sqrt(half);
+		friend half rsqrt(half);
+		friend half cbrt(half);
+		friend half hypot(half, half);
+		friend half hypot(half, half, half);
+		friend half pow(half, half);
+		friend void sincos(half, half*, half*);
+		friend half sin(half);
+		friend half cos(half);
+		friend half tan(half);
+		friend half asin(half);
+		friend half acos(half);
+		friend half atan(half);
+		friend half atan2(half, half);
+		friend half sinh(half);
+		friend half cosh(half);
+		friend half tanh(half);
+		friend half asinh(half);
+		friend half acosh(half);
+		friend half atanh(half);
+		friend half erf(half);
+		friend half erfc(half);
+		friend half lgamma(half);
+		friend half tgamma(half);
+		friend half ceil(half);
+		friend half floor(half);
+		friend half trunc(half);
+		friend half round(half);
+		friend long lround(half);
+		friend half rint(half);
+		friend long lrint(half);
+		friend half nearbyint(half);
+	#ifdef HALF_ENABLE_CPP11_LONG_LONG
+		friend long long llround(half);
+		friend long long llrint(half);
+	#endif
+		friend half frexp(half, int*);
+		friend half scalbln(half, long);
+		friend half modf(half, half*);
+		friend int ilogb(half);
+		friend half logb(half);
+		friend half nextafter(half, half);
+		friend half nexttoward(half, long double);
+		friend HALF_CONSTEXPR half copysign(half, half);
+		friend HALF_CONSTEXPR int fpclassify(half);
+		friend HALF_CONSTEXPR bool isfinite(half);
+		friend HALF_CONSTEXPR bool isinf(half);
+		friend HALF_CONSTEXPR bool isnan(half);
+		friend HALF_CONSTEXPR bool isnormal(half);
+		friend HALF_CONSTEXPR bool signbit(half);
+		friend HALF_CONSTEXPR bool isgreater(half, half);
+		friend HALF_CONSTEXPR bool isgreaterequal(half, half);
+		friend HALF_CONSTEXPR bool isless(half, half);
+		friend HALF_CONSTEXPR bool islessequal(half, half);
+		friend HALF_CONSTEXPR bool islessgreater(half, half);
+		template<typename,typename,std::float_round_style> friend struct detail::half_caster;
+		friend class std::numeric_limits<half>;
+	#if HALF_ENABLE_CPP11_HASH
+		friend struct std::hash<half>;
+	#endif
+	#if HALF_ENABLE_CPP11_USER_LITERALS
+		friend half literal::operator "" _h(long double);
+	#endif
+	#endif
+	};
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+	namespace literal
+	{
+		/// Half literal.
+		/// While this returns a properly rounded half-precision value, half literals can unfortunately not be constant
+		/// expressions due to rather involved conversions. So don't expect this to be a literal literal without involving
+		/// conversion operations at runtime. It is a convenience feature, not a performance optimization.
+		///
+		/// The operator is spelled `operator""_h` without whitespace in front of the suffix, since the spelling with
+		/// whitespace is deprecated as of C++23 and diagnosed by recent compilers. Both spellings name the same function.
+		/// \param value literal value
+		/// \return half with the given value (possibly rounded)
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		inline half operator""_h(long double value) { return half(detail::binary, detail::float2half<half::round_style>(value)); }
+	}
+#endif
+
+	namespace detail
+	{
+		/// Helper class for half casts.
+		/// This class template has to be specialized for all valid cast arguments to define an appropriate static
+		/// `cast` member function. The primary template is intentionally empty, so an unsupported combination of
+		/// types has no `cast` member to call.
+		/// \tparam T destination type
+		/// \tparam U source type
+		/// \tparam R rounding mode to use
+		template<typename T,typename U,std::float_round_style R=(std::float_round_style)(HALF_ROUND_STYLE)> struct half_caster {};
+
+		/// Cast from an arithmetic type to half.
+		/// Dispatches on `is_float<U>` to `float2half` (floating-point sources) or `int2half` (all other sources).
+		/// \tparam U source type
+		/// \tparam R rounding mode to use
+		template<typename U,std::float_round_style R> struct half_caster<half,U,R>
+		{
+		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+			static_assert(std::is_arithmetic<U>::value, "half_cast from non-arithmetic type unsupported");
+		#endif
+
+			/// Convert \a arg to half using rounding mode \a R.
+			/// \param arg value to convert
+			/// \return half built from the converted bit pattern
+			static half cast(U arg) { return cast_impl(arg, is_float<U>()); }
+
+		private:
+			/// Floating-point source.
+			static half cast_impl(U arg, true_type) { return half(binary, float2half<R>(arg)); }
+			/// Non-floating-point source.
+			static half cast_impl(U arg, false_type) { return half(binary, int2half<R>(arg)); }
+		};
+
+		/// Cast from half to an arithmetic type.
+		/// Dispatches on `is_float<T>` to `half2float` (floating-point destinations) or `half2int` (all other destinations).
+		/// \tparam T destination type
+		/// \tparam R rounding mode to use (only forwarded to `half2int`)
+		template<typename T,std::float_round_style R> struct half_caster<T,half,R>
+		{
+		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+			static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type unsupported");
+		#endif
+
+			/// Convert \a arg to the destination type.
+			/// \param arg half to convert
+			/// \return converted value
+			static T cast(half arg) { return cast_impl(arg, is_float<T>()); }
+
+		private:
+			/// Floating-point destination.
+			static T cast_impl(half arg, true_type) { return half2float<T>(arg.data_); }
+			/// Non-floating-point destination.
+			static T cast_impl(half arg, false_type) { return half2int<R,true,true,T>(arg.data_); }
+		};
+
+		/// Cast from half to half, which returns the argument unchanged.
+		/// \tparam R rounding mode (unused)
+		template<std::float_round_style R> struct half_caster<half,half,R>
+		{
+			/// \param arg half to pass through
+			/// \return \a arg
+			static half cast(half arg) { return arg; }
+		};
+	}
+}
+
+/// Extensions to the C++ standard library.
+namespace std
+{
+	/// Numeric limits for half-precision floats.
+	/// All values returned by the member functions are built directly from their 16-bit patterns.
+	///
+	/// **See also:** Documentation for [std::numeric_limits](https://en.cppreference.com/w/cpp/types/numeric_limits)
+	template<> class numeric_limits<half_float::half>
+	{
+	public:
+		/// Is template specialization.
+		static HALF_CONSTEXPR_CONST bool is_specialized = true;
+
+		/// Supports signed values.
+		static HALF_CONSTEXPR_CONST bool is_signed = true;
+
+		/// Is not an integer type.
+		static HALF_CONSTEXPR_CONST bool is_integer = false;
+
+		/// Is not exact.
+		static HALF_CONSTEXPR_CONST bool is_exact = false;
+
+		/// Doesn't provide modulo arithmetic.
+		static HALF_CONSTEXPR_CONST bool is_modulo = false;
+
+		/// Has a finite set of values.
+		static HALF_CONSTEXPR_CONST bool is_bounded = true;
+
+		/// IEEE conformant.
+		static HALF_CONSTEXPR_CONST bool is_iec559 = true;
+
+		/// Supports infinity.
+		static HALF_CONSTEXPR_CONST bool has_infinity = true;
+
+		/// Supports quiet NaNs.
+		static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
+
+		/// Supports signaling NaNs.
+		static HALF_CONSTEXPR_CONST bool has_signaling_NaN = true;
+
+		/// Supports subnormal values.
+		static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
+
+		/// Supports no denormalization detection.
+		static HALF_CONSTEXPR_CONST bool has_denorm_loss = false;
+
+	#if HALF_ERRHANDLING_THROWS
+		/// Traps, since HALF_ERRHANDLING_THROWS is set.
+		static HALF_CONSTEXPR_CONST bool traps = true;
+	#else
+		/// Traps only if [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID) is activated.
+		static HALF_CONSTEXPR_CONST bool traps = false;
+	#endif
+
+		/// Does not support pre-rounding underflow detection.
+		static HALF_CONSTEXPR_CONST bool tinyness_before = false;
+
+		/// Rounding mode.
+		static HALF_CONSTEXPR_CONST float_round_style round_style = half_float::half::round_style;
+
+		/// Significant digits.
+		static HALF_CONSTEXPR_CONST int digits = 11;
+
+		/// Significant decimal digits.
+		static HALF_CONSTEXPR_CONST int digits10 = 3;
+
+		/// Required decimal digits to represent all possible values.
+		static HALF_CONSTEXPR_CONST int max_digits10 = 5;
+
+		/// Number base.
+		static HALF_CONSTEXPR_CONST int radix = 2;
+
+		/// One more than smallest exponent.
+		static HALF_CONSTEXPR_CONST int min_exponent = -13;
+
+		/// Smallest normalized representable power of 10.
+		static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
+
+		/// One more than largest exponent.
+		static HALF_CONSTEXPR_CONST int max_exponent = 16;
+
+		/// Largest finitely representable power of 10.
+		static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
+
+		/// Smallest positive normal value (exponent field 1, zero mantissa).
+		static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); }
+
+		/// Smallest finite value, i.e. the negation of max().
+		static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); }
+
+		/// Largest finite value.
+		static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); }
+
+		/// Difference between 1 and next representable value.
+		static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); }
+
+		/// Maximum rounding error in ULP (units in the last place).
+		/// This is 0x3800 (0.5) for round-to-nearest and 0x3C00 (1.0) for every other rounding mode.
+		static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
+			{ return half_float::half(half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00); }
+
+		/// Positive infinity.
+		static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); }
+
+		/// Quiet NaN.
+		static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); }
+
+		/// Signaling NaN.
+		static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); }
+
+		/// Smallest positive subnormal value.
+		static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); }
+	};
+
+#if HALF_ENABLE_CPP11_HASH
+	/// Hash support for half-precision floats.
+	/// Only available if C++11 `std::hash` is supported and enabled.
+	///
+	/// **See also:** Documentation for [std::hash](https://en.cppreference.com/w/cpp/utility/hash)
+	template<> struct hash<half_float::half>
+	{
+		/// Argument type of the call operator.
+		typedef half_float::half argument_type;
+
+		/// Result type of the call operator.
+		typedef size_t result_type;
+
+		/// Hash a half-precision value.
+		/// Negative zero (bit pattern 0x8000) is hashed like positive zero; every other value is hashed by its
+		/// 16-bit pattern.
+		/// \param arg half to hash
+		/// \return hash value
+		result_type operator()(argument_type arg) const
+		{
+			return hash<half_float::detail::uint16>()((arg.data_==0x8000) ? 0 : arg.data_);
+		}
+	};
+#endif
+}
+
+namespace half_float
+{
+	/// \anchor compop
+	/// \name Comparison operators
+	/// \{
+
+	/// Equality comparison.
+	/// `detail::compsignal()` is evaluated first; if it yields true the comparison fails. Otherwise the operands are
+	/// equal when their bit patterns match or when both magnitudes are zero (so +0 equals -0).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if operands equal
+	/// \retval false else
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator==(half x, half y)
+	{
+		return !(detail::compsignal(x.data_, y.data_) || (x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF)!=0));
+	}
+
+	/// Inequality comparison.
+	/// `detail::compsignal()` is evaluated first; if it yields true the operands count as unequal. Otherwise they are
+	/// unequal unless their bit patterns match or both magnitudes are zero (so +0 is not unequal to -0).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if operands not equal
+	/// \retval false else
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator!=(half x, half y)
+	{
+		return !(!detail::compsignal(x.data_, y.data_) && (x.data_==y.data_ || ((x.data_|y.data_)&0x7FFF)==0));
+	}
+
+	/// Less-than comparison.
+	/// Each bit pattern is mapped to an integer key that grows with the represented value: patterns with the sign bit
+	/// set are complemented and incremented, all others are offset by 0x8000, so +0 and -0 share one key. The keys
+	/// are only compared if `detail::compsignal()` yields false.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less than \a y
+	/// \retval false else
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator<(half x, half y)
+	{
+		return !(detail::compsignal(x.data_, y.data_) ||
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) >= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)));
+	}
+
+	/// Greater-than comparison.
+	/// Each bit pattern is mapped to an integer key that grows with the represented value: patterns with the sign bit
+	/// set are complemented and incremented, all others are offset by 0x8000, so +0 and -0 share one key. The keys
+	/// are only compared if `detail::compsignal()` yields false.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater than \a y
+	/// \retval false else
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator>(half x, half y)
+	{
+		return !(detail::compsignal(x.data_, y.data_) ||
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) <= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)));
+	}
+
+	/// Less-or-equal comparison.
+	/// Each bit pattern is mapped to an integer key that grows with the represented value: patterns with the sign bit
+	/// set are complemented and incremented, all others are offset by 0x8000, so +0 and -0 share one key. The keys
+	/// are only compared if `detail::compsignal()` yields false.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less equal \a y
+	/// \retval false else
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator<=(half x, half y)
+	{
+		return !(detail::compsignal(x.data_, y.data_) ||
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) > ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)));
+	}
+
+	/// Greater-or-equal comparison.
+	/// Each bit pattern is mapped to an integer key that grows with the represented value: patterns with the sign bit
+	/// set are complemented and incremented, all others are offset by 0x8000, so +0 and -0 share one key. The keys
+	/// are only compared if `detail::compsignal()` yields false.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater equal \a y
+	/// \retval false else
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator>=(half x, half y)
+	{
+		return !(detail::compsignal(x.data_, y.data_) ||
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) < ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)));
+	}
+
+	/// \}
+	/// \anchor arithmetics
+	/// \name Arithmetic operators
+	/// \{
+
+	/// Unary plus.
+	/// \param arg operand
+	/// \return \a arg, unmodified
+	inline HALF_CONSTEXPR half operator+(half arg)
+	{
+		return arg;
+	}
+
+	/// Unary minus.
+	/// Only the sign bit (0x8000) of the bit pattern is flipped; no other bit is inspected or changed.
+	/// \param arg operand
+	/// \return operand with inverted sign bit
+	inline HALF_CONSTEXPR half operator-(half arg)
+	{
+		return half(detail::binary, 0x8000^arg.data_);
+	}
+
+	/// Addition.
+	/// This operation is exact to rounding for all rounding modes.
+	///
+	/// Unless HALF_ARITHMETIC_TYPE is defined, the sum is computed with integer arithmetic on the operands' bit patterns.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return sum of half expressions
+	/// \exception FE_INVALID if \a x and \a y are infinities with different signs or signaling NaNs
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator+(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		// Convert both operands to the configured arithmetic type, add there and convert the sum back to half.
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)+detail::half2float<detail::internal_t>(y.data_)));
+	#else
+		// Magnitudes of both operands (sign bit masked off).
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF;
+		// Differing sign bits turn the addition into a subtraction of magnitudes.
+		bool sub = ((x.data_^y.data_)&0x8000) != 0;
+		// 0x7C00 is infinity, anything above is a NaN. NaNs go through detail::signal(), infinities of opposite sign
+		// are invalid, otherwise the infinite operand is returned.
+		if(absx >= 0x7C00 || absy >= 0x7C00)
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) : (absy!=0x7C00) ? x.data_ :
+										(sub && absx==0x7C00) ? detail::invalid() : y.data_);
+		// x is zero: the result is y, unless y is zero as well. In that case the two sign bits are OR-ed when rounding
+		// toward negative infinity and AND-ed for every other rounding mode.
+		if(!absx)
+			return absy ? y : half(detail::binary, (half::round_style==std::round_toward_neg_infinity) ? (x.data_|y.data_) : (x.data_&y.data_));
+		if(!absy)
+			return x;
+		// The result carries the sign of the operand with the larger magnitude (x if the magnitudes tie).
+		unsigned int sign = ((sub && absy>absx) ? y.data_ : x.data_) & 0x8000;
+		// From here on absx holds the larger magnitude.
+		if(absy > absx)
+			std::swap(absx, absy);
+		// exp: exponent field of the larger operand (subnormals count as 1), d: exponent distance to the smaller operand,
+		// mx: significand of the larger operand (bit 10 set for normal values) with 3 additional low-order bits.
+		int exp = (absx>>10) + (absx<=0x3FF), d = exp - (absy>>10) - (absy<=0x3FF), mx = ((absx&0x3FF)|((absx>0x3FF)<<10)) << 3, my;
+		if(d < 13)
+		{
+			// Align the smaller significand; any nonzero bits shifted out are folded into the lowest bit.
+			my = ((absy&0x3FF)|((absy>0x3FF)<<10)) << 3;
+			my = (my>>d) | ((my&((1<<d)-1))!=0);
+		}
+		else
+			// Exponents too far apart: only a nonzero marker in the lowest bit remains of the smaller operand.
+			my = 1;
+		if(sub)
+		{
+			// Exact cancellation yields +0, or -0 when rounding toward negative infinity.
+			if(!(mx-=my))
+				return half(detail::binary, static_cast<unsigned>(half::round_style==std::round_toward_neg_infinity)<<15);
+			// Renormalize: shift left until bit 13 is set or the exponent has reached 1.
+			for(; mx<0x2000 && exp>1; mx<<=1,--exp) ;
+		}
+		else
+		{
+			mx += my;
+			// i is 1 if the sum carried into bit 14.
+			int i = mx >> 14;
+			if((exp+=i) > 30)
+				return half(detail::binary, detail::overflow<half::round_style>(sign));
+			// Shift the carry back in, keeping the dropped bit in the lowest position.
+			mx = (mx>>i) | (mx&i);
+		}
+		// Assemble sign, exponent and the upper significand bits. The two trailing arguments are derived from the 3 extra
+		// bits - presumably guard and sticky bit for detail::rounded(); confirm against its definition.
+		return half(detail::binary, detail::rounded<half::round_style,false>(sign+((exp-1)<<10)+(mx>>3), (mx>>2)&1, (mx&0x3)!=0));
+	#endif
+	}
+
+	/// Subtraction.
+	/// This operation is exact to rounding for all rounding modes.
+	///
+	/// Without HALF_ARITHMETIC_TYPE this is the sum of \a x and the negated \a y; otherwise the difference is computed
+	/// in the configured arithmetic type and converted back.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return difference of half expressions
+	/// \exception FE_INVALID if \a x and \a y are infinities with equal signs or signaling NaNs
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator-(half x, half y)
+	{
+	#ifndef HALF_ARITHMETIC_TYPE
+		return x + (-y);
+	#else
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)-detail::half2float<detail::internal_t>(y.data_)));
+	#endif
+	}
+
+	/// Multiplication.
+	/// This operation is exact to rounding for all rounding modes.
+	///
+	/// Unless HALF_ARITHMETIC_TYPE is defined, the product is computed with integer arithmetic on the operands' bit patterns.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return product of half expressions
+	/// \exception FE_INVALID if multiplying 0 with infinity or if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator*(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		// Convert both operands to the configured arithmetic type, multiply there and convert the product back to half.
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)*detail::half2float<detail::internal_t>(y.data_)));
+	#else
+		// Magnitudes of both operands and the running exponent of the result.
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -16;
+		// Sign of the product: XOR of the operand signs.
+		unsigned int sign = (x.data_^y.data_) & 0x8000;
+		// NaNs go through detail::signal(), infinity times zero is invalid, any other product involving an
+		// infinity is a signed infinity.
+		if(absx >= 0x7C00 || absy >= 0x7C00)
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										((absx==0x7C00 && !absy)||(absy==0x7C00 && !absx)) ? detail::invalid() : (sign|0x7C00));
+		// A zero factor yields a signed zero.
+		if(!absx || !absy)
+			return half(detail::binary, sign);
+		// Normalize subnormal operands, compensating in the exponent.
+		for(; absx<0x400; absx<<=1,--exp) ;
+		for(; absy<0x400; absy<<=1,--exp) ;
+		// Product of the two 11-bit significands (implicit leading bit set).
+		detail::uint32 m = static_cast<detail::uint32>((absx&0x3FF)|0x400) * static_cast<detail::uint32>((absy&0x3FF)|0x400);
+		// i is 1 if the product reaches bit 21; s is the bit lost by the shift m>>i below.
+		int i = m >> 21, s = m & i;
+		exp += (absx>>10) + (absy>>10) + i;
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -11)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		// Rounding and packing are delegated to detail::fixed2half(), which also receives the lost bit s.
+		return half(detail::binary, detail::fixed2half<half::round_style,20,false,false,false>(m>>i, exp, sign, s));
+	#endif
+	}
+
+	/// Division.
+	/// This operation is exact to rounding for all rounding modes.
+	///
+	/// Unless HALF_ARITHMETIC_TYPE is defined, the quotient is computed with integer arithmetic on the operands' bit patterns.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return quotient of half expressions
+	/// \exception FE_INVALID if dividing 0s or infinities with each other or if \a x or \a y is signaling NaN
+	/// \exception FE_DIVBYZERO if dividing finite value by 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator/(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		// Convert both operands to the configured arithmetic type, divide there and convert the quotient back to half.
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)/detail::half2float<detail::internal_t>(y.data_)));
+	#else
+		// Magnitudes of both operands and the running exponent of the result.
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = 14;
+		// Sign of the quotient: XOR of the operand signs.
+		unsigned int sign = (x.data_^y.data_) & 0x8000;
+		// NaNs go through detail::signal(), infinity by infinity is invalid, an infinite dividend gives a signed
+		// infinity and an infinite divisor a signed zero.
+		if(absx >= 0x7C00 || absy >= 0x7C00)
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										(absx==absy) ? detail::invalid() : (sign|((absx==0x7C00) ? 0x7C00 : 0)));
+		// Zero dividend: signed zero, or invalid if the divisor is zero too.
+		if(!absx)
+			return half(detail::binary, absy ? sign : detail::invalid());
+		// Nonzero finite dividend and zero divisor: handled by detail::pole().
+		if(!absy)
+			return half(detail::binary, detail::pole(sign));
+		// Normalize subnormal operands; a subnormal divisor raises the result exponent.
+		for(; absx<0x400; absx<<=1,--exp) ;
+		for(; absy<0x400; absy<<=1,++exp) ;
+		// 11-bit significands with the implicit leading bit set.
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400;
+		// i is 1 if the significand quotient is below 1, which costs one exponent step and needs one more bit of shift.
+		int i = mx < my;
+		exp += (absx>>10) - (absy>>10) - i;
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -11)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		// Scale the dividend so that the integer quotient keeps enough fractional bits.
+		mx <<= 12 + i;
+		my <<= 1;
+		// detail::fixed2half() gets the integer quotient plus a flag telling whether the division left a remainder.
+		return half(detail::binary, detail::fixed2half<half::round_style,11,false,false,false>(mx/my, exp, sign, mx%my!=0));
+	#endif
+	}
+
+	/// \}
+	/// \anchor streaming
+	/// \name Input and output
+	/// \{
+
+	/// Stream output.
+	///	The half is converted to a built-in floating-point type (`float`, or the type configured through
+	/// HALF_ARITHMETIC_TYPE) and written with the stream's own floating-point output.
+	/// \param out output stream to write into
+	/// \param arg half expression to write
+	/// \return reference to output stream
+	template<typename charT,typename traits> std::basic_ostream<charT,traits>& operator<<(std::basic_ostream<charT,traits> &out, half arg)
+	{
+	#ifndef HALF_ARITHMETIC_TYPE
+		typedef float stream_type;
+	#else
+		typedef detail::internal_t stream_type;
+	#endif
+		out << detail::half2float<stream_type>(arg.data_);
+		return out;
+	}
+
+	/// Stream input.
+	///	The value is read with the stream's own floating-point input into a `double` (or into the type configured
+	/// through [HALF_ARITHMETIC_TYPE](\ref HALF_ARITHMETIC_TYPE)) and then rounded to half-precision using the
+	/// library's half-precision rounding mode. The input is therefore rounded twice: once by the platform when
+	/// parsing and once when converting to half. If extraction fails, \a arg is left untouched.
+	/// \param in input stream to read from
+	/// \param arg half to read into
+	/// \return reference to input stream
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename charT,typename traits> std::basic_istream<charT,traits>& operator>>(std::basic_istream<charT,traits> &in, half &arg)
+	{
+	#ifndef HALF_ARITHMETIC_TYPE
+		double f;
+	#else
+		detail::internal_t f;
+	#endif
+		if(!(in >> f))
+			return in;
+		arg.data_ = detail::float2half<half::round_style>(f);
+		return in;
+	}
+
+	/// \}
+	/// \anchor basic
+	/// \name Basic mathematical operations
+	/// \{
+
+	/// Absolute value.
+	/// Clears the sign bit of the bit pattern and leaves all other bits as they are.
+	///
+	/// **See also:** Documentation for [std::fabs](https://en.cppreference.com/w/cpp/numeric/math/fabs).
+	/// \param arg operand
+	/// \return absolute value of \a arg
+	inline HALF_CONSTEXPR half fabs(half arg)
+	{
+		return half(detail::binary, 0x7FFF&arg.data_);
+	}
+
+	/// Absolute value.
+	/// Same result as fabs(), to which this function forwards.
+	///
+	/// **See also:** Documentation for [std::abs](https://en.cppreference.com/w/cpp/numeric/math/fabs).
+	/// \param arg operand
+	/// \return absolute value of \a arg
+	inline HALF_CONSTEXPR half abs(half arg)
+	{
+		return fabs(arg);
+	}
+
+	/// Remainder of division (result carries the sign of \a x).
+	/// **See also:** Documentation for [std::fmod](https://en.cppreference.com/w/cpp/numeric/math/fmod).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return remainder of floating-point division.
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half fmod(half x, half y)
+	{
+		unsigned int absx = x.data_ & 0x7FFF;
+		unsigned int absy = y.data_ & 0x7FFF;
+		unsigned int sign = x.data_ & 0x8000;
+		// NaN operand
+		if(absx > 0x7C00 || absy > 0x7C00)
+			return half(detail::binary, detail::signal(x.data_, y.data_));
+		// infinite dividend
+		if(absx == 0x7C00)
+			return half(detail::binary, detail::invalid());
+		// finite dividend, infinite divisor: x is the remainder
+		if(absy == 0x7C00)
+			return half(detail::binary, x.data_);
+		// zero divisor
+		if(absy == 0)
+			return half(detail::binary, detail::invalid());
+		// zero dividend
+		if(absx == 0)
+			return x;
+		// equal magnitudes divide evenly, otherwise detail::mod() supplies the remainder's magnitude
+		return half(detail::binary, (absx == absy) ? sign : (sign|detail::mod<false,false>(absx, absy)));
+	}
+
+	/// Remainder of division.
+	/// In contrast to fmod() the value returned by `detail::mod()` is XOR-ed with the sign of \a x, so it may flip
+	/// the sign of the result.
+	///
+	/// **See also:** Documentation for [std::remainder](https://en.cppreference.com/w/cpp/numeric/math/remainder).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return remainder of floating-point division.
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half remainder(half x, half y)
+	{
+		unsigned int absx = x.data_ & 0x7FFF;
+		unsigned int absy = y.data_ & 0x7FFF;
+		unsigned int sign = x.data_ & 0x8000;
+		// NaN operand
+		if(absx > 0x7C00 || absy > 0x7C00)
+			return half(detail::binary, detail::signal(x.data_, y.data_));
+		// infinite dividend
+		if(absx == 0x7C00)
+			return half(detail::binary, detail::invalid());
+		// finite dividend, infinite divisor: x is the remainder
+		if(absy == 0x7C00)
+			return half(detail::binary, x.data_);
+		// zero divisor
+		if(absy == 0)
+			return half(detail::binary, detail::invalid());
+		// equal magnitudes divide evenly, otherwise detail::mod() supplies the remainder
+		return half(detail::binary, (absx == absy) ? sign : (sign^detail::mod<false,true>(absx, absy)));
+	}
+
+	/// Remainder of division, additionally reporting part of the quotient.
+	/// \a quo is written on every path except when an operand is NaN, \a x is infinite or \a y is zero.
+	///
+	/// **See also:** Documentation for [std::remquo](https://en.cppreference.com/w/cpp/numeric/math/remquo).
+	/// \param x first operand
+	/// \param y second operand
+	/// \param quo address to store some bits of quotient at
+	/// \return remainder of floating-point division.
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half remquo(half x, half y, int *quo)
+	{
+		unsigned int absx = x.data_ & 0x7FFF;
+		unsigned int absy = y.data_ & 0x7FFF;
+		unsigned int value = x.data_ & 0x8000;
+		// NaN operand
+		if(absx > 0x7C00 || absy > 0x7C00)
+			return half(detail::binary, detail::signal(x.data_, y.data_));
+		// infinite dividend
+		if(absx == 0x7C00)
+			return half(detail::binary, detail::invalid());
+		// finite dividend, infinite divisor: quotient 0, x is the remainder
+		if(absy == 0x7C00)
+		{
+			*quo = 0;
+			return half(detail::binary, x.data_);
+		}
+		// zero divisor
+		if(absy == 0)
+			return half(detail::binary, detail::invalid());
+		// the quotient is negative if the operand signs differ
+		bool qsign = ((x.data_^y.data_)&0x8000) != 0;
+		// equal magnitudes: quotient magnitude 1 and a zero remainder with the sign of x
+		int q = 1;
+		if(absx != absy)
+			value ^= detail::mod<true, true>(absx, absy, &q);
+		*quo = qsign ? -q : q;
+		return half(detail::binary, value);
+	}
+
+	/// Fused multiply add.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// Unless HALF_ARITHMETIC_TYPE is defined, the full-width integer product of the significands is combined with
+	/// \a z before a single final rounding step.
+	///
+	/// **See also:** Documentation for [std::fma](https://en.cppreference.com/w/cpp/numeric/math/fma).
+	/// \param x first operand
+	/// \param y second operand
+	/// \param z third operand
+	/// \return ( \a x * \a y ) + \a z rounded as one operation.
+	/// \exception FE_INVALID according to operator*() and operator+() unless any argument is a quiet NaN and no argument is a signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding the final addition
+	inline half fma(half x, half y, half z)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_), fz = detail::half2float<detail::internal_t>(z.data_);
+		// Use std::fma only if C++11 <cmath> is enabled and FP_FAST_FMA is set; otherwise multiply and add separately.
+		#if HALF_ENABLE_CPP11_CMATH && FP_FAST_FMA
+			return half(detail::binary, detail::float2half<half::round_style>(std::fma(fx, fy, fz)));
+		#else
+			return half(detail::binary, detail::float2half<half::round_style>(fx*fy+fz));
+		#endif
+	#else
+		// Magnitudes of all operands and the running exponent of the result.
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, exp = -15;
+		// Sign of the product x*y.
+		unsigned int sign = (x.data_^y.data_) & 0x8000;
+		// The addition is effectively a subtraction if the product and z differ in sign.
+		bool sub = ((sign^z.data_)&0x8000) != 0;
+		// NaNs go through detail::signal(). An infinite factor is invalid if the other factor is zero or if z is an
+		// infinity of opposite sign, else it gives a signed infinity. If only z is infinite, z is returned.
+		if(absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00)
+			return	(absx>0x7C00 || absy>0x7C00 || absz>0x7C00) ? half(detail::binary, detail::signal(x.data_, y.data_, z.data_)) :
+					(absx==0x7C00) ? half(detail::binary, (!absy || (sub && absz==0x7C00)) ? detail::invalid() : (sign|0x7C00)) :
+					(absy==0x7C00) ? half(detail::binary, (!absx || (sub && absz==0x7C00)) ? detail::invalid() : (sign|0x7C00)) : z;
+		// Zero product: the result is z, unless z is zero too. Then z's bits are OR-ed with the product sign when
+		// rounding toward negative infinity and AND-ed with it otherwise.
+		if(!absx || !absy)
+			return absz ? z : half(detail::binary, (half::round_style==std::round_toward_neg_infinity) ? (z.data_|sign) : (z.data_&sign));
+		// Normalize subnormal factors, compensating in the exponent.
+		for(; absx<0x400; absx<<=1,--exp) ;
+		for(; absy<0x400; absy<<=1,--exp) ;
+		// Product of the two 11-bit significands (implicit leading bit set).
+		detail::uint32 m = static_cast<detail::uint32>((absx&0x3FF)|0x400) * static_cast<detail::uint32>((absy&0x3FF)|0x400);
+		// i is 1 if the product reaches bit 21.
+		int i = m >> 21;
+		exp += (absx>>10) + (absy>>10) + i;
+		// Move the leading bit of the product to bit 23.
+		m <<= 3 - i;
+		if(absz)
+		{
+			// Normalize z the same way and place its leading significand bit at bit 23 as well.
+			int expz = 0;
+			for(; absz<0x400; absz<<=1,--expz) ;
+			expz += absz >> 10;
+			detail::uint32 mz = static_cast<detail::uint32>((absz&0x3FF)|0x400) << 13;
+			// Keep the larger magnitude in (m, exp); if z dominates an effective subtraction, the result takes z's sign.
+			if(expz > exp || (expz == exp && mz > m))
+			{
+				std::swap(m, mz);
+				std::swap(exp, expz);
+				if(sub)
+					sign = z.data_ & 0x8000;
+			}
+			// Align the smaller addend; nonzero bits shifted out are folded into the lowest bit, and for distances
+			// of 23 or more only that marker bit remains.
+			int d = exp - expz;
+			mz = (d<23) ? ((mz>>d)|((mz&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;
+			if(sub)
+			{
+				m = m - mz;
+				// Exact cancellation yields +0, or -0 when rounding toward negative infinity.
+				if(!m)
+					return half(detail::binary, static_cast<unsigned>(half::round_style==std::round_toward_neg_infinity)<<15);
+				// Renormalize until bit 23 is set again.
+				for(; m<0x800000; m<<=1,--exp) ;
+			}
+			else
+			{
+				m += mz;
+				// i is 1 if the sum carried into bit 24; shift it back, keeping the dropped bit in the lowest position.
+				i = m >> 24;
+				m = (m>>i) | (m&i);
+				exp += i;
+			}
+		}
+		if(exp > 30)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -10)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		// Rounding and packing are delegated to detail::fixed2half().
+		return half(detail::binary, detail::fixed2half<half::round_style,23,false,false,false>(m, exp-1, sign));
+	#endif
+	}
+
+	/// Maximum of half expressions.
+	/// The operands are ordered through integer keys derived from their bit patterns (patterns with the sign bit set
+	/// are complemented, all others offset by 0x8000). The winner is always passed to `detail::select()` as first
+	/// argument, the other operand as second.
+	///
+	/// **See also:** Documentation for [std::fmax](https://en.cppreference.com/w/cpp/numeric/math/fmax).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return maximum of operands, ignoring quiet NaNs
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	inline HALF_CONSTEXPR_NOERR half fmax(half x, half y)
+	{
+		return half(detail::binary, (isnan(y) || (!isnan(x) && (x.data_^(0x8000|(0x8000-(x.data_>>15)))) >=
+			(y.data_^(0x8000|(0x8000-(y.data_>>15)))))) ? detail::select(x.data_, y.data_) : detail::select(y.data_, x.data_));
+	}
+
+	/// Minimum of half expressions.
+	/// The operands are ordered through integer keys derived from their bit patterns (patterns with the sign bit set
+	/// are complemented, all others offset by 0x8000). The winner is always passed to `detail::select()` as first
+	/// argument, the other operand as second.
+	///
+	/// **See also:** Documentation for [std::fmin](https://en.cppreference.com/w/cpp/numeric/math/fmin).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return minimum of operands, ignoring quiet NaNs
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	inline HALF_CONSTEXPR_NOERR half fmin(half x, half y)
+	{
+		return half(detail::binary, (isnan(y) || (!isnan(x) && (x.data_^(0x8000|(0x8000-(x.data_>>15)))) <=
+			(y.data_^(0x8000|(0x8000-(y.data_>>15)))))) ? detail::select(x.data_, y.data_) : detail::select(y.data_, x.data_));
+	}
+
+	/// Positive difference.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// NaN operands are passed to `detail::signal()`. Otherwise the operands are ordered through integer keys derived
+	/// from their bit patterns, and only if \a x orders strictly above \a y is the subtraction carried out.
+	///
+	/// **See also:** Documentation for [std::fdim](https://en.cppreference.com/w/cpp/numeric/math/fdim).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return \a x - \a y or 0 if difference negative
+	/// \exception FE_... according to operator-(half,half)
+	inline half fdim(half x, half y)
+	{
+		if(isnan(x) || isnan(y))
+			return half(detail::binary, detail::signal(x.data_, y.data_));
+		if((x.data_^(0x8000|(0x8000-(x.data_>>15)))) > (y.data_^(0x8000|(0x8000-(y.data_>>15)))))
+			return x - y;
+		return half(detail::binary, 0);
+	}
+
+	/// Get NaN value.
+	/// Starting from the bit pattern 0x7FFF, the low byte is XOR-ed with every character of \a arg; the upper byte
+	/// of the pattern is never modified. \a arg must point to a null-terminated string.
+	///
+	/// **See also:** Documentation for [std::nan](https://en.cppreference.com/w/cpp/numeric/math/nan).
+	/// \param arg string code
+	/// \return quiet NaN
+	inline half nanh(const char *arg)
+	{
+		unsigned int value = 0x7FFF;
+		for(const char *p=arg; *p; ++p)
+			value ^= static_cast<unsigned>(*p) & 0xFF;
+		return half(detail::binary, value);
+	}
+
+	/// \}
+	/// \anchor exponential
+	/// \name Exponential functions
+	/// \{
+
+	/// Exponential function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// Unless HALF_ARITHMETIC_TYPE is defined, the argument is scaled by a fixed-point constant and the result is
+	/// produced by `detail::exp2_post()`.
+	///
+	/// **See also:** Documentation for [std::exp](https://en.cppreference.com/w/cpp/numeric/math/exp).
+	/// \param arg function argument
+	/// \return e raised to \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half exp(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::exp(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		// abs: magnitude bits, e: exponent field (subnormals count as 1), exp: integer part of the scaled argument (set below).
+		int abs = arg.data_ & 0x7FFF, e = (abs>>10) + (abs<=0x3FF), exp;
+		// exp(+-0) is 1 (0x3C00).
+		if(!abs)
+			return half(detail::binary, 0x3C00);
+		// Infinity: the mask (sign-1U) keeps 0x7C00 for +inf and clears it to +0 for -inf. NaNs go through detail::signal().
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00&((arg.data_>>15)-1U)) : detail::signal(arg.data_));
+		// Magnitudes of 18 (0x4C80) and above are handed straight to underflow (negative) or overflow (positive).
+		if(abs >= 0x4C80)
+			return half(detail::binary, (arg.data_&0x8000) ? detail::underflow<half::round_style>() : detail::overflow<half::round_style>());
+		// Multiply the significand by 0xB8AA3B29, which is log2(e) scaled by 2^31.
+		// NOTE(review): exact fixed-point format of detail::multiply64()'s result is not visible here - confirm.
+		detail::uint32 m = detail::multiply64(static_cast<detail::uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29);
+		if(e < 14)
+		{
+			// Small exponent: no integer part, just shift the product down.
+			exp = 0;
+			m >>= 14 - e;
+		}
+		else
+		{
+			// Split the product: upper bits become exp, the remaining lower 31 bits stay in m.
+			exp = m >> (45-e);
+			m = (m<<(e-14)) & 0x7FFFFFFF;
+		}
+		// The sign of the argument is passed as a flag; the final result is built by detail::exp2_post().
+		return half(detail::binary, detail::exp2_post<half::round_style>(m, exp, (arg.data_&0x8000)!=0, 0, 26));
+	#endif
+	}
+
+	/// Binary exponential.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// The `std::exp2` path is only taken if both HALF_ARITHMETIC_TYPE is defined and C++11 `<cmath>` is enabled.
+	///
+	/// **See also:** Documentation for [std::exp2](https://en.cppreference.com/w/cpp/numeric/math/exp2).
+	/// \param arg function argument
+	/// \return 2 raised to \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half exp2(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::exp2(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		// abs: magnitude bits, e: exponent field (subnormals count as 1), exp: significand (bit 10 set for normal values).
+		int abs = arg.data_ & 0x7FFF, e = (abs>>10) + (abs<=0x3FF), exp = (abs&0x3FF) + ((abs>0x3FF)<<10);
+		// exp2(+-0) is 1 (0x3C00).
+		if(!abs)
+			return half(detail::binary, 0x3C00);
+		// Infinity: the mask (sign-1U) keeps 0x7C00 for +inf and clears it to +0 for -inf. NaNs go through detail::signal().
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00&((arg.data_>>15)-1U)) : detail::signal(arg.data_));
+		// Magnitudes of 25 (0x4E40) and above are handed straight to underflow (negative) or overflow (positive).
+		if(abs >= 0x4E40)
+			return half(detail::binary, (arg.data_&0x8000) ? detail::underflow<half::round_style>() : detail::overflow<half::round_style>());
+		// The significand is split by shifting: the lower 31 bits of (exp<<(6+e)) and exp>>(25-e) are passed
+		// separately, together with the sign flag, to detail::exp2_post().
+		return half(detail::binary, detail::exp2_post<half::round_style>(
+			(static_cast<detail::uint32>(exp)<<(6+e))&0x7FFFFFFF, exp>>(25-e), (arg.data_&0x8000)!=0, 0, 28));
+	#endif
+	}
+
+	/// Exponential minus one.
+	/// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for `std::round_to_nearest` 
+	/// and in <1% of inputs for any other rounding mode.
+	///
+	/// The `std::expm1` path is only taken if both HALF_ARITHMETIC_TYPE is defined and C++11 `<cmath>` is enabled.
+	///
+	/// **See also:** Documentation for [std::expm1](https://en.cppreference.com/w/cpp/numeric/math/expm1).
+	/// \param arg function argument
+	/// \return e raised to \a arg and subtracted by 1
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half expm1(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::expm1(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		// abs: magnitude bits, sign: sign bit, e: exponent field (subnormals count as 1), exp: set below.
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000, e = (abs>>10) + (abs<=0x3FF), exp;
+		// expm1(+-0) returns the argument itself, sign included.
+		if(!abs)
+			return arg;
+		// +inf stays +inf (0x7C00); for -inf the added 0x4000 gives 0xBC00, i.e. -1. NaNs go through detail::signal().
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00+(sign>>1)) : detail::signal(arg.data_));
+		// Magnitudes of 12 (0x4A00) and above: negative arguments round from 0xBBFF (the value just above -1) with
+		// both trailing flags set, positive arguments overflow.
+		if(abs >= 0x4A00)
+			return half(detail::binary, (arg.data_&0x8000) ? detail::rounded<half::round_style,true>(0xBBFF, 1, 1) : detail::overflow<half::round_style>());
+		// Multiply the significand by 0xB8AA3B29, which is log2(e) scaled by 2^31.
+		// NOTE(review): exact fixed-point format of detail::multiply64()'s result is not visible here - confirm.
+		detail::uint32 m = detail::multiply64(static_cast<detail::uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29);
+		if(e < 14)
+		{
+			// Small exponent: no integer part, just shift the product down.
+			exp = 0;
+			m >>= 14 - e;
+		}
+		else
+		{
+			// Split the product: upper bits become exp, the remaining lower 31 bits stay in m.
+			exp = m >> (45-e);
+			m = (m<<(e-14)) & 0x7FFFFFFF;
+		}
+		// NOTE(review): detail::exp2() presumably evaluates 2^fraction in fixed point - confirm against its definition.
+		m = detail::exp2(m);
+		if(sign)
+		{
+			// Negative argument: if m exceeds 0x80000000 it is replaced by detail::divide64(0x80000000, m, s) and exp
+			// is bumped. The value is then shifted right by exp (lost bits and s folded into the lowest bit) and
+			// subtracted from 0x80000000.
+			int s = 0;
+			if(m > 0x80000000)
+			{
+				++exp;
+				m = detail::divide64(0x80000000, m, s);
+			}
+			m = 0x80000000 - ((m>>exp)|((m&((static_cast<detail::uint32>(1)<<exp)-1))!=0)|s);
+			exp = 0;
+		}
+		else
+			// Positive argument: subtract 0x80000000>>exp (or just 1 once exp reaches 31).
+			m -= (exp<31) ? (0x80000000>>exp) : 1;
+		// Add 14 to exp, then renormalize: shift left until the top bit is set or exp has dropped to 0.
+		for(exp+=14; m<0x80000000 && exp; m<<=1,--exp) ;
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>());
+		// Pack sign, exponent and the top bits of m; bit 20 and the remaining low bits of m form the two trailing
+		// arguments of detail::rounded().
+		return half(detail::binary, detail::rounded<half::round_style,true>(sign+(exp<<10)+(m>>21), (m>>20)&1, (m&0xFFFFF)!=0));
+	#endif
+	}
+
+	/// Natural logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log](https://en.cppreference.com/w/cpp/numeric/math/log).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base e
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::log(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int magnitude = arg.data_ & 0x7FFF, exponent = -15;	// magnitude: bit pattern without the sign bit
+		if(magnitude == 0)	// +-0 is a pole
+			return half(detail::binary, detail::pole(0x8000));
+		if((arg.data_ & 0x8000) != 0)	// sign bit set: NaN is signalled, everything else is invalid
+			return half(detail::binary, (arg.data_>0xFC00) ? detail::signal(arg.data_) : detail::invalid());
+		if(magnitude >= 0x7C00)	// positive infinity is returned as is, NaN is signalled
+			return (magnitude!=0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;
+		while(magnitude < 0x400) { magnitude <<= 1; --exponent; }	// normalise subnormal inputs
+		exponent += magnitude >> 10;
+		const detail::uint32 mantissa = static_cast<detail::uint32>((magnitude&0x3FF)|0x400) << 20;	// significand with the implicit bit set
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(detail::log2(mantissa, 27)+8, exponent, 17));
+	#endif
+	}
+
+	/// Common logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log10](https://en.cppreference.com/w/cpp/numeric/math/log10).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base 10
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log10(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::log10(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int magnitude = arg.data_ & 0x7FFF, exponent = -15;	// magnitude: bit pattern without the sign bit
+		if(magnitude == 0)	// +-0 is a pole
+			return half(detail::binary, detail::pole(0x8000));
+		if((arg.data_ & 0x8000) != 0)	// sign bit set: NaN is signalled, everything else is invalid
+			return half(detail::binary, (arg.data_>0xFC00) ? detail::signal(arg.data_) : detail::invalid());
+		if(magnitude >= 0x7C00)	// positive infinity is returned as is, NaN is signalled
+			return (magnitude!=0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;
+		switch(magnitude)	// fixed input patterns that map directly to fixed result patterns
+		{
+			case 0x70E2: return half(detail::binary, 0x4400);
+			case 0x63D0: return half(detail::binary, 0x4200);
+			case 0x5640: return half(detail::binary, 0x4000);
+			case 0x4900: return half(detail::binary, 0x3C00);
+		}
+		while(magnitude < 0x400) { magnitude <<= 1; --exponent; }	// normalise subnormal inputs
+		exponent += magnitude >> 10;
+		const detail::uint32 mantissa = static_cast<detail::uint32>((magnitude&0x3FF)|0x400) << 20;	// significand with the implicit bit set
+		return half(detail::binary, detail::log2_post<half::round_style,0xD49A784C>(detail::log2(mantissa, 27)+8, exponent, 16));
+	#endif
+	}
+
+	/// Binary logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log2](https://en.cppreference.com/w/cpp/numeric/math/log2).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base 2
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log2(half arg)	// computes the base-2 logarithm of a half-precision argument
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::log2(detail::half2float<detail::internal_t>(arg.data_))));	// widen to the internal type, call std::log2, round back
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15, s = 0;	// abs: bits below the sign bit; s collects bits shifted out further down
+		if(!abs)	// +-0 is a pole
+			return half(detail::binary, detail::pole(0x8000));
+		if(arg.data_ & 0x8000)	// sign bit set: invalid up to 0xFC00, larger patterns (NaN) are signalled
+			return half(detail::binary, (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));
+		if(abs >= 0x7C00)	// positive infinity is returned as is, NaN is signalled
+			return (abs==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));
+		if(abs == 0x3C00)	// input pattern 0x3C00 maps to an all-zero result
+			return half(detail::binary, 0);
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalise subnormal inputs, lowering exp per shift
+		exp += (abs>>10);	// add the exponent field to the -15 start value
+		if(!(abs&0x3FF))	// fraction bits all zero: the result is the integer exp and is encoded directly
+		{
+			unsigned int value = static_cast<unsigned>(exp<0) << 15, m = std::abs(exp) << 6;	// value: sign bit of the result; m: |exp| pre-shifted by 6
+			for(exp=18; m<0x400; m<<=1,--exp) ;	// shift m left until bit 10 is reached, counting exp down from 18
+			return half(detail::binary, value+(exp<<10)+m);
+		}
+		detail::uint32 ilog = exp, sign = detail::sign_mask(ilog), m = 	// sign is used below as an XOR/subtract mask, which negates the sum if the mask is all ones
+			(((ilog<<27)+(detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 28)>>4))^sign) - sign;	// integral part shifted to bit 27 plus the detail::log2 value of the significand
+		if(!m)
+			return half(detail::binary, 0);
+		for(exp=14; m<0x8000000 && exp; m<<=1,--exp) ;	// shift left until bit 27 is set or exp reaches 0
+		for(; m>0xFFFFFFF; m>>=1,++exp)	// shift right while m exceeds 28 bits
+			s |= m & 1;	// remember any bit shifted out
+		return half(detail::binary, detail::fixed2half<half::round_style,27,false,false,true>(m, exp, sign&0x8000, s));
+	#endif
+	}
+
+	/// Natural logarithm plus one.
+	/// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for `std::round_to_nearest` 
+	/// and in ~1% of inputs for any other rounding mode.
+	///
+	/// **See also:** Documentation for [std::log1p](https://en.cppreference.com/w/cpp/numeric/math/log1p).
+	/// \param arg function argument
+	/// \return logarithm of \a arg plus 1 to base e
+	/// \exception FE_INVALID for signaling NaN or argument <-1
+	/// \exception FE_DIVBYZERO for -1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log1p(half arg)	// computes the natural logarithm of 1 + arg
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::log1p(detail::half2float<detail::internal_t>(arg.data_))));	// widen to the internal type, call std::log1p, round back
+	#else
+		if(arg.data_ >= 0xBC00)	// sign bit set and magnitude bits >= 0x3C00
+			return half(detail::binary, (arg.data_==0xBC00) ? detail::pole(0x8000) : (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));	// exactly 0xBC00: pole; up to 0xFC00: invalid; above (NaN): signalled
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: bits below the sign bit
+		if(!abs || abs >= 0x7C00)	// zero and infinity are returned unchanged, NaN is signalled
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalise subnormal inputs, lowering exp per shift
+		exp += abs >> 10;	// add the exponent field to the -15 start value
+		detail::uint32 m = static_cast<detail::uint32>((abs&0x3FF)|0x400) << 20;	// significand with the implicit bit placed at bit 30
+		if(arg.data_ & 0x8000)
+		{
+			m = 0x40000000 - (m>>-exp);	// negative argument: subtract the aligned significand from 0x40000000 (exp is negative here because magnitudes >= 0x3C00 returned above)
+			for(exp=0; m<0x40000000; m<<=1,--exp) ;	// shift left until bit 30 is set again, counting exp down from 0
+		}
+		else
+		{
+			if(exp < 0)
+			{
+				m = 0x40000000 + (m>>-exp);	// add the aligned significand to 0x40000000
+				exp = 0;
+			}
+			else
+			{
+				m += 0x40000000 >> exp;	// add 0x40000000 aligned down by exp to the significand
+				int i = m >> 31;	// 1 if the addition carried into bit 31
+				m >>= i;
+				exp += i;
+			}
+		}
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(detail::log2(m), exp, 17));	// same log2_post constant as used by log() in this file
+	#endif
+	}
+
+	/// \}
+	/// \anchor power
+	/// \name Power functions
+	/// \{
+
+	/// Square root.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sqrt](https://en.cppreference.com/w/cpp/numeric/math/sqrt).
+	/// \param arg function argument
+	/// \return square root of \a arg
+	/// \exception FE_INVALID for signaling NaN and negative arguments
+	/// \exception FE_INEXACT according to rounding
+	inline half sqrt(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int magnitude = arg.data_ & 0x7FFF, exponent = 15;	// magnitude: bit pattern without the sign bit
+		if(magnitude == 0 || arg.data_ >= 0x7C00)	// zero, infinity, NaN or any input with the sign bit set
+			return half(detail::binary, (magnitude>0x7C00) ? detail::signal(arg.data_) : (arg.data_>0x8000) ? detail::invalid() : arg.data_);
+		while(magnitude < 0x400) { magnitude <<= 1; --exponent; }	// normalise subnormal inputs
+		detail::uint32 work = static_cast<detail::uint32>((magnitude&0x3FF)|0x400) << 10, root = detail::sqrt<20>(work, exponent+=magnitude>>10);
+		return half(detail::binary, detail::rounded<half::round_style,false>((exponent<<10)+(root&0x3FF), work>root, work!=0));
+	#endif
+	}
+
+	/// Inverse square root.
+	/// This function is exact to rounding for all rounding modes and thus generally more accurate than directly computing 
+	/// 1 / sqrt(\a arg) in half-precision, in addition to also being faster.
+	/// \param arg function argument
+	/// \return reciprocal of square root of \a arg
+	/// \exception FE_INVALID for signaling NaN and negative arguments
+	/// \exception FE_INEXACT according to rounding
+	inline half rsqrt(half arg)	// computes 1 / sqrt(arg) directly in half precision
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::internal_t(1)/std::sqrt(detail::half2float<detail::internal_t>(arg.data_))));	// widen, divide 1 by std::sqrt, round back
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, bias = 0x4000;	// abs: bits below the sign bit; bias is added to abs after normalisation
+		if(!abs || arg.data_ >= 0x7C00)	// zero, infinity, NaN or any input with the sign bit set
+			return half(detail::binary,	(abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_>0x8000) ?
+										detail::invalid() : !abs ? detail::pole(arg.data_&0x8000) : 0);	// NaN: signalled; negative non-zero: invalid; zero: pole carrying the input sign; remaining case (+infinity): all-zero result
+		for(; abs<0x400; abs<<=1,bias-=0x400) ;	// normalise subnormal inputs, lowering the bias by one exponent step per shift
+		unsigned int frac = (abs+=bias) & 0x7FF;	// low 11 bits of the biased pattern: lowest exponent bit plus fraction
+		if(frac == 0x400)	// fraction zero with the lowest exponent bit set: the result is formed directly from the pattern
+			return half(detail::binary, 0x7A00-(abs>>1));
+		if((half::round_style == std::round_to_nearest && (frac == 0x3FE || frac == 0x76C)) ||
+		   (half::round_style != std::round_to_nearest && (frac == 0x15A || frac == 0x3FC || frac == 0x401 || frac == 0x402 || frac == 0x67B)))
+			return pow(arg, half(detail::binary, 0xB800));	// listed patterns are delegated to pow() with exponent pattern 0xB800
+		detail::uint32 f = 0x17376 - abs, mx = (abs&0x3FF) | 0x400, my = ((f>>1)&0x3FF) | 0x400, mz = my * my;	// NOTE(review): f looks like an initial estimate derived from the bit pattern, refined twice below - confirm
+		int expy = (f>>11) - 31, expx = 32 - (abs>>10), i = mz >> 21;
+		for(mz=0x60000000-(((mz>>i)*mx)>>(expx-2*expy-i)); mz<0x40000000; mz<<=1,--expy) ;	// first refinement term, shifted left until bit 30 is set
+		i = (my*=mz>>10) >> 31;	// multiply the estimate by the term; i is 1 if bit 31 of the product is set
+		expy += i;
+		my = (my>>(20+i)) + 1;
+		i = (mz=my*my) >> 21;
+		for(mz=0x60000000-(((mz>>i)*mx)>>(expx-2*expy-i)); mz<0x40000000; mz<<=1,--expy) ;	// second refinement term, same form as the first
+		i = (my*=(mz>>10)+1) >> 31;
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,false,true>(my>>i, expy+i+14));
+	#endif
+	}
+
+	/// Cubic root.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cbrt](https://en.cppreference.com/w/cpp/numeric/math/cbrt).
+	/// \param arg function argument
+	/// \return cubic root of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT according to rounding
+	inline half cbrt(half arg)	// computes the cubic root of a half-precision argument
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::cbrt(detail::half2float<detail::internal_t>(arg.data_))));	// widen to the internal type, call std::cbrt, round back
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: bits below the sign bit
+		if(!abs || abs == 0x3C00 || abs >= 0x7C00)	// zero, magnitude pattern 0x3C00 and infinity are returned unchanged; NaN is signalled
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;
+		for(; abs<0x400; abs<<=1, --exp);	// normalise subnormal inputs, lowering exp per shift
+		detail::uint32 ilog = exp + (abs>>10), sign = detail::sign_mask(ilog), f, m = 	// sign is used as an XOR/subtract mask, which negates the sum if the mask is all ones
+			(((ilog<<27)+(detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 24)>>4))^sign) - sign;	// integral part shifted to bit 27 plus the detail::log2 value of the significand
+		for(exp=2; m<0x80000000; m<<=1,--exp) ;	// shift left until bit 31 is set, counting exp down from 2
+		m = detail::multiply64(m, 0xAAAAAAAB);	// presumably scales by one third in fixed point - TODO confirm against detail::multiply64
+		int i = m >> 31, s;	// i: 1 if bit 31 is set after the multiplication; s is written by detail::divide64 below
+		exp += i;
+		m <<= 1 - i;
+		if(exp < 0)
+		{
+			f = m >> -exp;	// everything is fraction
+			exp = 0;
+		}
+		else
+		{
+			f = (m<<exp) & 0x7FFFFFFF;	// lower 31 bits form the fraction
+			exp = m >> (31-exp);	// upper bits form the integral part
+		}
+		m = detail::exp2(f, (half::round_style==std::round_to_nearest) ? 29 : 26);	// second argument depends on the rounding mode
+		if(sign)
+		{
+			if(m > 0x80000000)
+			{
+				m = detail::divide64(0x80000000, m, s);	// divide 0x80000000 by m
+				++exp;
+			}
+			exp = -exp;
+		}
+		return half(detail::binary, (half::round_style==std::round_to_nearest) ?
+			detail::fixed2half<half::round_style,31,false,false,false>(m, exp+14, arg.data_&0x8000) :	// the input's sign bit is passed through to the result
+			detail::fixed2half<half::round_style,23,false,false,false>((m+0x80)>>8, exp+14, arg.data_&0x8000));	// other rounding modes: m is reduced by 8 bits after adding 0x80
+	#endif
+	}
+
+	/// Hypotenuse function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot).
+	/// \param x first argument
+	/// \param y second argument
+	/// \return square root of sum of squares without internal over- or underflows
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root
+	inline half hypot(half x, half y)	// computes sqrt(x*x + y*y) for two half-precision arguments
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_);	// widen both operands to the internal type
+		#if HALF_ENABLE_CPP11_CMATH
+			return half(detail::binary, detail::float2half<half::round_style>(std::hypot(fx, fy)));
+		#else
+			return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(fx*fx+fy*fy)));	// without C++11 <cmath>: plain sum of squares in the internal type
+		#endif
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, expx = 0, expy = 0;	// sign bits are dropped
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinity or NaN
+			return half(detail::binary,	(absx==0x7C00) ? detail::select(0x7C00, y.data_) :
+				(absy==0x7C00) ? detail::select(0x7C00, x.data_) : detail::signal(x.data_, y.data_));	// an infinite operand selects 0x7C00 together with the other operand; otherwise both go through detail::signal
+		if(!absx)	// x is zero: the result is |y| (or zero if y is zero as well)
+			return half(detail::binary, absy ? detail::check_underflow(absy) : 0);
+		if(!absy)	// y is zero: the result is |x|
+			return half(detail::binary, detail::check_underflow(absx));
+		if(absy > absx)	// keep the larger magnitude in absx
+			std::swap(absx, absy);
+		for(; absx<0x400; absx<<=1,--expx) ;	// normalise subnormal operands
+		for(; absy<0x400; absy<<=1,--expy) ;
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400;	// significands with the implicit bit set
+		mx *= mx;	// square both significands
+		my *= my;
+		int ix = mx >> 21, iy = my >> 21;	// 1 if the square reached bit 21
+		expx = 2*(expx+(absx>>10)) - 15 + ix;	// exponent of the square
+		expy = 2*(expy+(absy>>10)) - 15 + iy;
+		mx <<= 10 - ix;	// shift left by 10, or by 9 when the square reached bit 21
+		my <<= 10 - iy;
+		int d = expx - expy;	// exponent distance between the two squares
+		my = (d<30) ? ((my>>d)|((my&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;	// align the smaller square, OR-ing shifted-out bits into the lowest bit; for d >= 30 only that bit remains
+		return half(detail::binary, detail::hypot_post<half::round_style>(mx+my, expx));
+	#endif
+	}
+
+	/// Hypotenuse function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot).
+	/// \param x first argument
+	/// \param y second argument
+	/// \param z third argument
+	/// \return square root of sum of squares without internal over- or underflows
+	/// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root
+	inline half hypot(half x, half y, half z)	// computes sqrt(x*x + y*y + z*z) for three half-precision arguments
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_), fz = detail::half2float<detail::internal_t>(z.data_);	// widen all operands to the internal type
+		return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(fx*fx+fy*fy+fz*fz)));
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, expx = 0, expy = 0, expz = 0;	// sign bits are dropped
+		if(!absx)	// a zero operand reduces the problem to the two-argument overload
+			return hypot(y, z);
+		if(!absy)
+			return hypot(x, z);
+		if(!absz)
+			return hypot(x, y);
+		if(absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00)	// at least one operand is infinity or NaN
+			return half(detail::binary,	(absx==0x7C00) ? detail::select(0x7C00, detail::select(y.data_, z.data_)) :
+										(absy==0x7C00) ? detail::select(0x7C00, detail::select(x.data_, z.data_)) :
+										(absz==0x7C00) ? detail::select(0x7C00, detail::select(x.data_, y.data_)) :
+										detail::signal(x.data_, y.data_, z.data_));	// no infinite operand: all three go through detail::signal
+		if(absz > absy)	// three conditional swaps leave absx >= absy >= absz
+			std::swap(absy, absz);
+		if(absy > absx)
+			std::swap(absx, absy);
+		if(absz > absy)
+			std::swap(absy, absz);
+		for(; absx<0x400; absx<<=1,--expx) ;	// normalise subnormal operands
+		for(; absy<0x400; absy<<=1,--expy) ;
+		for(; absz<0x400; absz<<=1,--expz) ;
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400, mz = (absz&0x3FF) | 0x400;	// significands with the implicit bit set
+		mx *= mx;	// square all significands
+		my *= my;
+		mz *= mz;
+		int ix = mx >> 21, iy = my >> 21, iz = mz >> 21;	// 1 if the square reached bit 21
+		expx = 2*(expx+(absx>>10)) - 15 + ix;	// exponents of the squares
+		expy = 2*(expy+(absy>>10)) - 15 + iy;
+		expz = 2*(expz+(absz>>10)) - 15 + iz;
+		mx <<= 10 - ix;	// shift left by 10, or by 9 when the square reached bit 21
+		my <<= 10 - iy;
+		mz <<= 10 - iz;
+		int d = expy - expz;	// exponent distance between the two smaller squares
+		mz = (d<30) ? ((mz>>d)|((mz&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;	// align z's square to y's, OR-ing shifted-out bits into the lowest bit
+		my += mz;	// sum of the two smaller squares
+		if(my & 0x80000000)	// the sum reached bit 31
+		{
+			my = (my>>1) | (my&1);	// shift back by one, keeping the dropped bit in the lowest position
+			if(++expy > expx)	// the partial sum now has the larger exponent: make it the x operand
+			{
+				std::swap(mx, my);
+				std::swap(expx, expy);
+			}
+		}
+		d = expx - expy;	// exponent distance between x's square and the partial sum
+		my = (d<30) ? ((my>>d)|((my&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;	// align the partial sum the same way
+		return half(detail::binary, detail::hypot_post<half::round_style>(mx+my, expx));
+	#endif
+	}
+
+	/// Power function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in ~0.00025% of inputs.
+	///
+	/// **See also:** Documentation for [std::pow](https://en.cppreference.com/w/cpp/numeric/math/pow).
+	/// \param x base
+	/// \param y exponent
+	/// \return \a x raised to \a y
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN or if \a x is finite and negative and \a y is finite and not integral
+	/// \exception FE_DIVBYZERO if \a x is 0 and \a y is negative
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half pow(half x, half y)	// computes x raised to the power y
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::pow(detail::half2float<detail::internal_t>(x.data_), detail::half2float<detail::internal_t>(y.data_))));	// widen both operands, call std::pow, round back
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -15;	// magnitudes without the sign bits
+		if(!absy || x.data_ == 0x3C00)	// y is zero or x has pattern 0x3C00: result pattern 0x3C00, combined with the other operand through detail::select
+			return half(detail::binary, detail::select(0x3C00, (x.data_==0x3C00) ? y.data_ : x.data_));
+		bool is_int = absy >= 0x6400 || (absy>=0x3C00 && !(absy&((1<<(25-(absy>>10)))-1)));	// y is integral: magnitude >= 0x6400, or >= 0x3C00 with no bits set below the position 25-(absy>>10)
+		unsigned int sign = x.data_ & (static_cast<unsigned>((absy<0x6800)&&is_int&&((absy>>(25-(absy>>10)))&1))<<15);	// result sign: sign of x, kept only if y is an integer whose bit at position 25-(absy>>10) is set (presumably: y is odd)
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinity or NaN
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										(absy==0x7C00) ? ((absx==0x3C00) ? 0x3C00 : (!absx && y.data_==0xFC00) ? detail::pole() :
+										(0x7C00&-((y.data_>>15)^(absx>0x3C00)))) : (sign|(0x7C00&((y.data_>>15)-1U))));	// infinite y: 0x7C00 if exactly one of "y negative" and "absx > 0x3C00" holds, else 0; infinite x: sign plus 0x7C00 for positive y, sign alone for negative y
+		if(!absx)	// zero base: pole for negative y, otherwise zero with the computed sign
+			return half(detail::binary, (y.data_&0x8000) ? detail::pole(sign) : sign);
+		if((x.data_&0x8000) && !is_int)	// negative base with non-integral exponent
+			return half(detail::binary, detail::invalid());
+		if(x.data_ == 0xBC00)	// base pattern 0xBC00: result pattern 0x3C00 with the computed sign
+			return half(detail::binary, sign|0x3C00);
+		switch(y.data_)	// exponent patterns handled by cheaper dedicated operations
+		{
+			case 0x3800: return sqrt(x);
+			case 0x3C00: return half(detail::binary, detail::check_underflow(x.data_));
+			case 0x4000: return x * x;
+			case 0xBC00: return half(detail::binary, 0x3C00) / x;
+		}
+		for(; absx<0x400; absx<<=1,--exp) ;	// normalise a subnormal base
+		detail::uint32 ilog = exp + (absx>>10), msign = detail::sign_mask(ilog), f, m = 	// msign is used as an XOR/subtract mask, which negates the sum if the mask is all ones
+			(((ilog<<27)+((detail::log2(static_cast<detail::uint32>((absx&0x3FF)|0x400)<<20)+8)>>4))^msign) - msign;	// integral part shifted to bit 27 plus the detail::log2 value of the base's significand
+		for(exp=-11; m<0x80000000; m<<=1,--exp) ;	// shift left until bit 31 is set, counting exp down from -11
+		for(; absy<0x400; absy<<=1,--exp) ;	// normalise a subnormal exponent operand
+		m = detail::multiply64(m, static_cast<detail::uint32>((absy&0x3FF)|0x400)<<21);	// multiply by the significand of y
+		int i = m >> 31;	// 1 if bit 31 of the product is set
+		exp += (absy>>10) + i;
+		m <<= 1 - i;
+		if(exp < 0)
+		{
+			f = m >> -exp;	// everything is fraction
+			exp = 0;
+		}
+		else
+		{
+			f = (m<<exp) & 0x7FFFFFFF;	// lower 31 bits form the fraction
+			exp = m >> (31-exp);	// upper bits form the integral part
+		}
+		return half(detail::binary, detail::exp2_post<half::round_style>(f, exp, ((msign&1)^(y.data_>>15))!=0, sign));	// third argument is true if exactly one of msign and the sign bit of y is set
+	#endif
+	}
+
+	/// \}
+	/// \anchor trigonometric
+	/// \name Trigonometric functions
+	/// \{
+
+	/// Compute sine and cosine simultaneously.
+	///	This returns the same results as sin() and cos() but is faster than calling each function individually.
+	///
+	/// This function is exact to rounding for all rounding modes.
+	/// \param arg function argument
+	/// \param sin variable to take sine of \a arg
+	/// \param cos variable to take cosine of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline void sincos(half arg, half *sin, half *cos)	// writes sine and cosine of arg through the two pointers, which are dereferenced without a null check
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t f = detail::half2float<detail::internal_t>(arg.data_);	// widen once, then call std::sin and std::cos
+		*sin = half(detail::binary, detail::float2half<half::round_style>(std::sin(f)));
+		*cos = half(detail::binary, detail::float2half<half::round_style>(std::cos(f)));
+	#else
+		int abs = arg.data_ & 0x7FFF, sign = arg.data_ >> 15, k;	// abs: bits below the sign bit; sign: 0 or 1; k is written by detail::angle_arg
+		if(abs >= 0x7C00)	// infinity is invalid, NaN is signalled; both outputs get the same value
+			*sin = *cos = half(detail::binary, (abs==0x7C00) ? detail::invalid() : detail::signal(arg.data_));
+		else if(!abs)	// zero: sine is the argument itself, cosine has pattern 0x3C00
+		{
+			*sin = arg;
+			*cos = half(detail::binary, 0x3C00);
+		}
+		else if(abs < 0x2500)	// small magnitudes are answered without evaluating the series
+		{
+			*sin = half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));	// pattern one below arg, with both rounding flags set
+			*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1));	// pattern one below 0x3C00, with both rounding flags set
+		}
+		else
+		{
+			if(half::round_style != std::round_to_nearest)	// hard-coded results for specific inputs, used only outside round-to-nearest
+			{
+				switch(abs)
+				{
+				case 0x48B7:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x1D07, 1, 1));	// sign opposite to the argument
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0xBBFF, 1, 1));
+					return;
+				case 0x598C:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1));	// sign of the argument
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x80FC, 1, 1));
+					return;
+				case 0x6A64:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x3BFE, 1, 1));	// sign opposite to the argument
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x27FF, 1, 1));
+					return;
+				case 0x6D8C:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x0FE6, 1, 1));	// sign of the argument
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1));
+					return;
+				}
+			}
+			std::pair<detail::uint32,detail::uint32> sc = detail::sincos(detail::angle_arg(abs, k), 28);	// first/second are used as sine/cosine below; k presumably is the quadrant index from argument reduction - TODO confirm
+			switch(k & 3)	// swap and negate the pair according to the low two bits of k
+			{
+				case 1: sc = std::make_pair(sc.second, -sc.first); break;
+				case 2: sc = std::make_pair(-sc.first, -sc.second); break;
+				case 3: sc = std::make_pair(-sc.second, sc.first); break;
+			}
+			*sin = half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((sc.first^-static_cast<detail::uint32>(sign))+sign));	// two's-complement negation of the sine when the argument's sign bit is set
+			*cos = half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>(sc.second));	// cosine does not depend on the argument's sign
+		}
+	#endif
+	}
+
+	/// Sine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sin](https://en.cppreference.com/w/cpp/numeric/math/sin).
+	/// \param arg function argument
+	/// \return sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half sin(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::sin(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int magnitude = arg.data_ & 0x7FFF, quadrant;	// quadrant is written by detail::angle_arg
+		if(magnitude == 0)	// zero is returned unchanged
+			return arg;
+		if(magnitude >= 0x7C00)	// infinity is invalid, NaN is signalled
+			return half(detail::binary, (magnitude!=0x7C00) ? detail::signal(arg.data_) : detail::invalid());
+		if(magnitude < 0x2900)	// small magnitudes: pattern one below arg, with both rounding flags set
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));
+		if(half::round_style != std::round_to_nearest)	// hard-coded results, used only outside round-to-nearest
+			switch(magnitude)
+			{
+				case 0x6D8C: return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x0FE6, 1, 1));
+				case 0x6A64: return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x3BFE, 1, 1));
+				case 0x48B7: return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x1D07, 1, 1));
+			}
+		const std::pair<detail::uint32,detail::uint32> values = detail::sincos(detail::angle_arg(magnitude, quadrant), 28);
+		const detail::uint32 negate = -static_cast<detail::uint32>(((quadrant>>1)&1)^(arg.data_>>15));	// all ones when the result has to be negated
+		return half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((((quadrant&1) ? values.second : values.first)^negate) - negate));
+	#endif
+	}
+
+	/// Cosine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cos](https://en.cppreference.com/w/cpp/numeric/math/cos).
+	/// \param arg function argument
+	/// \return cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half cos(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::cos(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int magnitude = arg.data_ & 0x7FFF, quadrant;	// quadrant is written by detail::angle_arg
+		if(magnitude == 0)	// zero maps to pattern 0x3C00
+			return half(detail::binary, 0x3C00);
+		if(magnitude >= 0x7C00)	// infinity is invalid, NaN is signalled
+			return half(detail::binary, (magnitude!=0x7C00) ? detail::signal(arg.data_) : detail::invalid());
+		if(magnitude < 0x2500)	// small magnitudes: pattern one below 0x3C00, with both rounding flags set
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1));
+		if(magnitude == 0x598C && half::round_style != std::round_to_nearest)	// hard-coded result, used only outside round-to-nearest
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x80FC, 1, 1));
+		const std::pair<detail::uint32,detail::uint32> values = detail::sincos(detail::angle_arg(magnitude, quadrant), 28);
+		const detail::uint32 negate = -static_cast<detail::uint32>(((quadrant>>1)^quadrant)&1);	// all ones when the result has to be negated
+		return half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((((quadrant&1) ? values.first : values.second)^negate) - negate));
+	#endif
+	}
+
+	/// Tangent function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::tan](https://en.cppreference.com/w/cpp/numeric/math/tan).
+	/// \param arg function argument
+	/// \return tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tan(half arg)	// computes the tangent of a half-precision argument
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::tan(detail::half2float<detail::internal_t>(arg.data_))));	// widen to the internal type, call std::tan, round back
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = 13, k;	// abs: bits below the sign bit; k is written by detail::angle_arg
+		if(!abs)	// zero is returned unchanged
+			return arg;
+		if(abs >= 0x7C00)	// infinity is invalid, NaN is signalled
+			return half(detail::binary, (abs==0x7C00) ? detail::invalid() : detail::signal(arg.data_));
+		if(abs < 0x2700)	// small magnitudes: the argument's own pattern, with only the last rounding flag set
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1));
+		if(half::round_style != std::round_to_nearest)	// hard-coded results, used only outside round-to-nearest
+			switch(abs)
+			{
+				case 0x658C: return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x07E6, 1, 1));	// sign of the argument
+				case 0x7330: return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x4B62, 1, 1));	// sign opposite to the argument
+			}
+		std::pair<detail::uint32,detail::uint32> sc = detail::sincos(detail::angle_arg(abs, k), 30);	// first is used as numerator, second as denominator below
+		if(k & 1)	// odd k: swap the pair and negate the new first element
+			sc = std::make_pair(-sc.second, sc.first);
+		detail::uint32 signy = detail::sign_mask(sc.first), signx = detail::sign_mask(sc.second);	// masks used to take absolute values and to derive the result sign
+		detail::uint32 my = (sc.first^signy) - signy, mx = (sc.second^signx) - signx;	// absolute values via XOR/subtract with the masks
+		for(; my<0x80000000; my<<=1,--exp) ;	// shift the numerator left until bit 31 is set, lowering exp
+		for(; mx<0x80000000; mx<<=1,++exp) ;	// shift the denominator left until bit 31 is set, raising exp
+		return half(detail::binary, detail::tangent_post<half::round_style>(my, mx, exp, (signy^signx^arg.data_)&0x8000));	// result sign bit: XOR of both masks and the argument's sign bit
+	#endif
+	}
+
+	/// Arc sine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::asin](https://en.cppreference.com/w/cpp/numeric/math/asin).
+	/// \param arg function argument
+	/// \return arc sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half asin(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::asin(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int magnitude = arg.data_ & 0x7FFF, signbit = arg.data_ & 0x8000;	// bit pattern split into magnitude and sign bit
+		if(magnitude == 0)	// zero is returned unchanged
+			return arg;
+		if(magnitude >= 0x3C00)	// exactly 0x3C00: signed 0x3E48 with one rounding flag; up to infinity: invalid; NaN: signalled
+			return half(detail::binary, (magnitude==0x3C00) ? detail::rounded<half::round_style,true>(signbit|0x3E48, 0, 1) :
+										(magnitude<=0x7C00) ? detail::invalid() : detail::signal(arg.data_));
+		if(magnitude < 0x2900)	// small magnitudes: the argument's own pattern, with only the last rounding flag set
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1));
+		if((magnitude == 0x2B44 || magnitude == 0x2DC3) && half::round_style != std::round_to_nearest)	// hard-coded inputs, used only outside round-to-nearest
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_+1, 1, 1));
+		const std::pair<detail::uint32,detail::uint32> operands = detail::atan2_args(magnitude);
+		const detail::uint32 angle = detail::atan2(operands.first, operands.second, (half::round_style!=std::round_to_nearest) ? 26 : 27);
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,true,true>(angle, 14, signbit));
+	#endif
+	}
+
+	/// Arc cosine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::acos](https://en.cppreference.com/w/cpp/numeric/math/acos).
+	/// \param arg function argument
+	/// \return arc cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half acos(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::acos(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int magnitude = arg.data_ & 0x7FFF, negative = arg.data_ >> 15;	// negative is 1 when the sign bit is set
+		if(magnitude == 0)	// zero maps to pattern 0x3E48 with one rounding flag
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x3E48, 0, 1));
+		if(magnitude >= 0x3C00)	// exactly 0x3C00: 0x4248 for negative input, else 0; up to infinity: invalid; NaN: signalled
+			return half(detail::binary,	(magnitude==0x3C00) ? (negative ? detail::rounded<half::round_style,true>(0x4248, 0, 1) : 0) :
+										(magnitude<=0x7C00) ? detail::invalid() : detail::signal(arg.data_));
+		const std::pair<detail::uint32,detail::uint32> operands = detail::atan2_args(magnitude);
+		const detail::uint32 angle = detail::atan2(operands.second, operands.first, 28);	// operands are passed in swapped order
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,true,true>(negative ? (0xC90FDAA2-angle) : angle, 15, 0, negative));
+	#endif
+	}
+
+	/// Arc tangent function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::atan](https://en.cppreference.com/w/cpp/numeric/math/atan).
+	/// \param arg function argument
+	/// \return arc tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atan(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::atan(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int magnitude = arg.data_ & 0x7FFF, signbit = arg.data_ & 0x8000;	// bit pattern split into magnitude and sign bit
+		if(magnitude == 0)	// zero is returned unchanged
+			return arg;
+		if(magnitude >= 0x7C00)	// infinity: signed 0x3E48 with one rounding flag; NaN: signalled
+			return half(detail::binary, (magnitude!=0x7C00) ? detail::signal(arg.data_) : detail::rounded<half::round_style,true>(signbit|0x3E48, 0, 1));
+		if(magnitude <= 0x2700)	// small magnitudes: pattern one below arg, with both rounding flags set
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));
+		int exponent = (magnitude>>10) + (magnitude<=0x3FF);	// exponent field, with subnormals counted as 1
+		detail::uint32 numer = (magnitude&0x3FF) | ((magnitude>0x3FF)<<10);	// significand, implicit bit added for normal numbers
+		detail::uint32 angle = (exponent<=15) ?	detail::atan2(numer<<(exponent+4), 0x20000000, (half::round_style!=std::round_to_nearest) ? 28 : 30) :
+										detail::atan2(numer<<19, 0x20000000>>(exponent-15), (half::round_style!=std::round_to_nearest) ? 24 : 26);
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,true,true>(angle, 14, signbit));
+	#endif
+	}
+
+	/// Arc tangent function.
+	/// This function may be 1 ULP off the correctly rounded exact result in ~0.005% of inputs for `std::round_to_nearest`, 
+	/// in ~0.1% of inputs for `std::round_toward_zero` and in ~0.02% of inputs for any other rounding mode.
+	///
+	/// **See also:** Documentation for [std::atan2](https://en.cppreference.com/w/cpp/numeric/math/atan2).
+	/// \param y numerator
+	/// \param x denominator
+	/// \return arc tangent value
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atan2(half y, half x)	// computes the arc tangent of y/x, using both signs to pick the result
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::atan2(detail::half2float<detail::internal_t>(y.data_), detail::half2float<detail::internal_t>(x.data_))));	// widen both operands, call std::atan2, round back
+	#else
+		unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, signx = x.data_ >> 15, signy = y.data_ & 0x8000;	// signx is 0 or 1; signy is kept as the 0x8000 mask so it can be OR-ed into results
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinity or NaN
+		{
+			if(absx > 0x7C00 || absy > 0x7C00)	// any NaN goes through detail::signal
+				return half(detail::binary, detail::signal(x.data_, y.data_));
+			if(absy == 0x7C00)	// infinite y: the pattern depends on whether x is finite, negative infinity or positive infinity
+				return half(detail::binary, (absx<0x7C00) ?	detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1) :
+													signx ?	detail::rounded<half::round_style,true>(signy|0x40B6, 0, 1) :
+															detail::rounded<half::round_style,true>(signy|0x3A48, 0, 1));
+			return (x.data_==0x7C00) ? half(detail::binary, signy) : half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1));	// infinite x, finite y: zero with y's sign for +infinity, else signed 0x4248
+		}
+		if(!absy)	// zero y: signed 0x4248 for negative x, otherwise y itself
+			return signx ? half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1)) : y;
+		if(!absx)	// zero x: signed 0x3E48
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1));
+		int d = (absy>>10) + (absy<=0x3FF) - (absx>>10) - (absx<=0x3FF);	// difference of the exponent fields, with subnormals counted as 1
+		if(d > (signx ? 18 : 12))	// y far larger than x: signed 0x3E48
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1));
+		if(signx && d < -11)	// negative x far larger than y: signed 0x4248
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1));
+		if(!signx && d < ((half::round_style==std::round_toward_zero) ? -15 : -9))	// positive x far larger than y: the result is formed from the plain quotient y/x
+		{
+			for(; absy<0x400; absy<<=1,--d) ;	// normalise a subnormal y, lowering d per shift
+			detail::uint32 mx = ((absx<<1)&0x7FF) | 0x800, my = ((absy<<1)&0x7FF) | 0x800;	// low bits of each operand shifted up by one, with bit 11 set
+			int i = my < mx;	// 1 if the significand quotient is below one
+			d -= i;
+			if(d < -25)
+				return half(detail::binary, detail::underflow<half::round_style>(signy));
+			my <<= 11 + i;
+			return half(detail::binary, detail::fixed2half<half::round_style,11,false,false,true>(my/mx, d+14, signy, my%mx!=0));	// integer quotient, with a non-zero remainder passed as the last flag
+		}
+		detail::uint32 m = detail::atan2(	((absy&0x3FF)|((absy>0x3FF)<<10))<<(19+((d<0) ? d : (d>0) ? 0 : -1)),	// y's significand, shifted less when d is negative or zero
+											((absx&0x3FF)|((absx>0x3FF)<<10))<<(19-((d>0) ? d : (d<0) ? 0 : 1)));	// x's significand, shifted less when d is positive or zero
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,true,true>(signx ? (0xC90FDAA2-m) : m, 15, signy, signx));	// negative x: the value is subtracted from the constant 0xC90FDAA2
+	#endif
+	}
+
+	/// \}
+	/// \anchor hyperbolic
+	/// \name Hyperbolic functions
+	/// \{
+
+	/// Hyperbolic sine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sinh](https://en.cppreference.com/w/cpp/numeric/math/sinh).
+	/// \param arg function argument
+	/// \return hyperbolic sine value of \a arg (zeros and infinities are returned unchanged)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half sinh(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::sinh(detail::half2float<detail::internal_t>(arg.data_)))); // built-in arithmetic type available: evaluate std::sinh there and convert back
+	#else
+		int abs = arg.data_ & 0x7FFF, exp; // magnitude bits; exp is written by hyperbolic_args() below
+		if(!abs || abs >= 0x7C00) // zero, infinity or NaN
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; // NaN goes through detail::signal(), +/-0 and +/-inf pass through
+		if(abs <= 0x2900) // small magnitudes: the argument bits are re-rounded with the last flag set (presumably a sticky/inexact marker -- helper not visible here)
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1));
+		std::pair<detail::uint32,detail::uint32> mm = detail::hyperbolic_args(abs, exp, (half::round_style==std::round_to_nearest) ? 29 : 27); // presumably the e^|arg| and e^-|arg| significands -- confirm against the helper
+		detail::uint32 m = mm.first - mm.second; // difference of the two helper values
+		for(exp+=13; m<0x80000000 && exp; m<<=1,--exp) ; // renormalise: shift left until bit 31 is set or exp reaches 0
+		unsigned int sign = arg.data_ & 0x8000; // the result carries the argument's sign
+		if(exp > 29) // exponents above 29 are reported through detail::overflow()
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,false,true>(m, exp, sign));
+	#endif
+	}
+
+	/// Hyperbolic cosine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cosh](https://en.cppreference.com/w/cpp/numeric/math/cosh).
+	/// \param arg function argument
+	/// \return hyperbolic cosine value of \a arg (1 for zero, positive infinity for either infinity)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half cosh(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::cosh(detail::half2float<detail::internal_t>(arg.data_)))); // built-in arithmetic type available: evaluate std::cosh there and convert back
+	#else
+		int abs = arg.data_ & 0x7FFF, exp; // magnitude bits; exp is written by hyperbolic_args() below
+		if(!abs) // cosh(+/-0) = 1 (0x3C00)
+			return half(detail::binary, 0x3C00);
+		if(abs >= 0x7C00) // NaN goes through detail::signal(); both infinities give +inf (0x7C00)
+			return half(detail::binary, (abs>0x7C00) ? detail::signal(arg.data_) : 0x7C00);
+		std::pair<detail::uint32,detail::uint32> mm = detail::hyperbolic_args(abs, exp, (half::round_style==std::round_to_nearest) ? 23 : 26); // presumably the e^|arg| and e^-|arg| significands -- confirm against the helper
+		detail::uint32 m = mm.first + mm.second, i = (~m&0xFFFFFFFF) >> 31; // sum of the two helper values; i is 1 when bit 31 of the sum is clear
+		m = (m>>i) | (m&i) | 0x80000000; // when i is 1: shift right once, fold the dropped bit into bit 0, and force bit 31
+		if((exp+=13+i) > 29) // exponents above 29 are reported through detail::overflow()
+			return half(detail::binary, detail::overflow<half::round_style>());
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,false,true>(m, exp));
+	#endif
+	}
+
+	/// Hyperbolic tangent.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::tanh](https://en.cppreference.com/w/cpp/numeric/math/tanh).
+	/// \param arg function argument
+	/// \return hyperbolic tangent value of \a arg (zero is returned unchanged, infinities map to +/-1)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tanh(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::tanh(detail::half2float<detail::internal_t>(arg.data_)))); // built-in arithmetic type available: evaluate std::tanh there and convert back
+	#else
+		int abs = arg.data_ & 0x7FFF, exp; // magnitude bits; exp is written by hyperbolic_args() and then reused as the result exponent
+		if(!abs) // +/-0 is returned as is
+			return arg;
+		if(abs >= 0x7C00) // NaN goes through detail::signal(); +/-inf minus 0x4000 yields 0x3C00/0xBC00, i.e. +/-1
+			return half(detail::binary, (abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_-0x4000));
+		if(abs >= 0x4500) // |arg| >= 5: result is built from 0x3BFF (largest half below 1) with the argument's sign
+			return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1));
+		if(abs < 0x2700) // small magnitudes: result is built from the argument's bit pattern minus one
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));
+		if(half::round_style != std::round_to_nearest && abs == 0x2D3F) // single hard-coded input for the non-nearest rounding modes
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-3, 0, 1));
+		std::pair<detail::uint32,detail::uint32> mm = detail::hyperbolic_args(abs, exp, 27); // presumably the e^|arg| and e^-|arg| significands -- confirm against the helper
+		detail::uint32 my = mm.first - mm.second - (half::round_style!=std::round_to_nearest), mx = mm.first + mm.second, i = (~mx&0xFFFFFFFF) >> 31; // numerator (one less for non-nearest rounding), denominator, and i = 1 when bit 31 of the denominator is clear
+		for(exp=13; my<0x80000000; my<<=1,--exp) ; // normalise the numerator so that bit 31 is set
+		mx = (mx>>i) | 0x80000000; // shift the denominator right by i and force bit 31
+		return half(detail::binary, detail::tangent_post<half::round_style>(my, mx, exp-i, arg.data_&0x8000)); // final step is done by detail::tangent_post(), which also receives the sign
+	#endif
+	}
+
+	/// Hyperbolic area sine (inverse hyperbolic sine).
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::asinh](https://en.cppreference.com/w/cpp/numeric/math/asinh).
+	/// \param arg function argument
+	/// \return area sine value of \a arg (zeros and infinities are returned unchanged)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half asinh(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::asinh(detail::half2float<detail::internal_t>(arg.data_)))); // needs both a built-in arithmetic type and C++11 <cmath>
+	#else
+		int abs = arg.data_ & 0x7FFF; // magnitude bits
+		if(!abs || abs >= 0x7C00) // zero, infinity or NaN
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; // NaN goes through detail::signal(), +/-0 and +/-inf pass through
+		if(abs <= 0x2900) // small magnitudes: result is built from the argument's bit pattern minus one
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));
+		if(half::round_style != std::round_to_nearest) // two hard-coded inputs for the non-nearest rounding modes
+			switch(abs)
+			{
+				case 0x32D4: return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-13, 1, 1));
+				case 0x3B5B: return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-197, 1, 1));
+			}
+		return half(detail::binary, detail::area<half::round_style,true>(arg.data_)); // general case is delegated to detail::area<..., true>()
+	#endif
+	}
+
+	/// Inverse hyperbolic cosine (hyperbolic area cosine).
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::acosh](https://en.cppreference.com/w/cpp/numeric/math/acosh).
+	/// \param arg function argument
+	/// \return area cosine value of \a arg (+0 for an argument of 1, +inf for +inf)
+	/// \exception FE_INVALID for signaling NaN or arguments <1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half acosh(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::acosh(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		const int magnitude = arg.data_ & 0x7FFF;
+		const bool negative = (arg.data_ & 0x8000) != 0;
+
+		// Everything with the sign bit set or a magnitude below one is outside the domain;
+		// NaNs caught here are signalled instead.
+		if(negative || magnitude < 0x3C00)
+			return half(detail::binary, (magnitude>0x7C00) ? detail::signal(arg.data_) : detail::invalid());
+
+		// acosh(1) is exactly +0.
+		if(magnitude == 0x3C00)
+			return half(detail::binary, 0);
+
+		// Only positive infinity and positive NaNs remain at or above 0x7C00.
+		if(arg.data_ >= 0x7C00)
+		{
+			if(magnitude > 0x7C00)
+				return half(detail::binary, detail::signal(arg.data_));
+			return arg;
+		}
+
+		return half(detail::binary, detail::area<half::round_style,false>(arg.data_));
+	#endif
+	}
+
+	/// Hyperbolic area tangent (inverse hyperbolic tangent).
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::atanh](https://en.cppreference.com/w/cpp/numeric/math/atanh).
+	/// \param arg function argument
+	/// \return area tangent value of \a arg (zero is returned unchanged)
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_DIVBYZERO for +/-1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atanh(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::atanh(detail::half2float<detail::internal_t>(arg.data_)))); // needs both a built-in arithmetic type and C++11 <cmath>
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = 0; // magnitude bits; exp counts the normalisation shifts of mx below
+		if(!abs) // +/-0 is returned as is
+			return arg;
+		if(abs >= 0x3C00) // |arg| >= 1: exactly 1 goes to detail::pole() with the sign, other non-NaN values to detail::invalid(), NaN to detail::signal()
+			return half(detail::binary, (abs==0x3C00) ? detail::pole(arg.data_&0x8000) : (abs<=0x7C00) ? detail::invalid() : detail::signal(arg.data_));
+		if(abs < 0x2700) // small magnitudes: the argument bits are re-rounded with only the last flag set
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1));
+		detail::uint32 m = static_cast<detail::uint32>((abs&0x3FF)|((abs>0x3FF)<<10)) << ((abs>>10)+(abs<=0x3FF)+6), my = 0x80000000 + m, mx = 0x80000000 - m; // m: |arg| as fixed point where 0x80000000 stands for 1; my = 1+|arg|, mx = 1-|arg|
+		for(; mx<0x80000000; mx<<=1,++exp) ; // normalise mx so that bit 31 is set, counting the shifts
+		int i = my >= mx, s; // i: halve my before dividing when it is not smaller than mx; s is an output of divide64() that is not used afterwards
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(detail::log2( // logarithm of the quotient my/mx; presumably implements atanh(x) = ln((1+x)/(1-x))/2 -- helpers not visible here
+			(detail::divide64(my>>i, mx, s)+1)>>1, 27)+0x10, exp+i-1, 16, arg.data_&0x8000));
+	#endif
+	}
+
+	/// \}
+	/// \anchor special
+	/// \name Error and gamma functions
+	/// \{
+
+	/// Error function.
+	/// For any rounding mode the result may differ from the correctly rounded exact value by 1 ULP in <0.5% of inputs.
+	///
+	/// **See also:** Documentation for [std::erf](https://en.cppreference.com/w/cpp/numeric/math/erf).
+	/// \param arg function argument
+	/// \return error function value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half erf(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::erf(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		const unsigned int magnitude = arg.data_ & 0x7FFF;
+
+		// Signed zero maps onto itself.
+		if(!magnitude)
+			return arg;
+
+		// NaN is signalled; subtracting 0x4000 turns +/-inf (0x7C00/0xFC00) into +/-1 (0x3C00/0xBC00).
+		if(magnitude >= 0x7C00)
+			return half(detail::binary, (magnitude>0x7C00) ? detail::signal(arg.data_) : (arg.data_-0x4000));
+
+		// From |arg| >= 3 (0x4200) on, the result is built from 0x3BFF with the argument's sign.
+		if(magnitude >= 0x4200)
+			return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1));
+
+		return half(detail::binary, detail::erf<half::round_style,false>(arg.data_));
+	#endif
+	}
+
+	/// Complementary error function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% of inputs.
+	///
+	/// **See also:** Documentation for [std::erfc](https://en.cppreference.com/w/cpp/numeric/math/erfc).
+	/// \param arg function argument
+	/// \return 1 minus error function value of \a arg (1 for zero, 0 for +inf, 2 for -inf)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half erfc(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::erfc(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000;
+		// Infinity or NaN. sign>>1 is 0 for +inf and 0x4000 (2.0) for -inf; NaN goes through detail::signal().
+		// (The former outer `(abs>=0x7C00) ? ... : arg` repeated the enclosing condition, so its else arm could never run.)
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (sign>>1) : detail::signal(arg.data_));
+		// erfc(+/-0) is exactly 1.
+		if(!abs)
+			return half(detail::binary, 0x3C00);
+		// |arg| >= 4 (0x4400): positive arguments start from 0, negative ones from 0x3FFF (the half just below 2),
+		// each re-rounded by detail::rounded().
+		if(abs >= 0x4400)
+			return half(detail::binary, detail::rounded<half::round_style,true>((sign>>1)-(sign>>15), sign>>15, 1));
+		return half(detail::binary, detail::erf<half::round_style,true>(arg.data_));
+	#endif
+	}
+
+	/// Natural logarithm of gamma function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in ~0.025% of inputs.
+	///
+	/// **See also:** Documentation for [std::lgamma](https://en.cppreference.com/w/cpp/numeric/math/lgamma).
+	/// \param arg function argument
+	/// \return natural logarithm of gamma function for \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_DIVBYZERO for 0 or negative integer arguments
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half lgamma(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::lgamma(detail::half2float<detail::internal_t>(arg.data_)))); // needs both a built-in arithmetic type and C++11 <cmath>
+	#else
+		int abs = arg.data_ & 0x7FFF; // magnitude bits
+		if(abs >= 0x7C00) // either infinity gives +inf (0x7C00); NaN goes through detail::signal()
+			return half(detail::binary, (abs==0x7C00) ? 0x7C00 : detail::signal(arg.data_));
+		if(!abs || arg.data_ >= 0xE400 || (arg.data_ >= 0xBC00 && !(abs&((1<<(25-(abs>>10)))-1)))) // zero, any value <= -1024, or a value <= -1 whose fractional bits are all clear (a negative integer)
+			return half(detail::binary, detail::pole());
+		if(arg.data_ == 0x3C00 || arg.data_ == 0x4000) // arguments 1 and 2 give exactly +0
+			return half(detail::binary, 0);
+		return half(detail::binary, detail::gamma<half::round_style,true>(arg.data_)); // general case is delegated to detail::gamma<..., true>()
+	#endif
+	}
+
+	/// Gamma function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.25% of inputs.
+	///
+	/// **See also:** Documentation for [std::tgamma](https://en.cppreference.com/w/cpp/numeric/math/tgamma).
+	/// \param arg function argument
+	/// \return gamma function value of \a arg
+	/// \exception FE_INVALID for signaling NaN, negative infinity or negative integer arguments
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tgamma(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::tgamma(detail::half2float<detail::internal_t>(arg.data_)))); // needs both a built-in arithmetic type and C++11 <cmath>
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF; // magnitude bits
+		if(!abs) // +/-0: detail::pole() receives the argument bits, i.e. just its sign
+			return half(detail::binary, detail::pole(arg.data_));
+		if(abs >= 0x7C00) // +inf is returned as is; -inf and NaN go through detail::signal()
+			return (arg.data_==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));
+		if(arg.data_ >= 0xE400 || (arg.data_ >= 0xBC00 && !(abs&((1<<(25-(abs>>10)))-1)))) // any value <= -1024, or a value <= -1 whose fractional bits are all clear (a negative integer)
+			return half(detail::binary, detail::invalid());
+		if(arg.data_ >= 0xCA80) // remaining values <= -13: underflow; the sign bit is set when the lowest integer bit of |arg| is 0
+			return half(detail::binary, detail::underflow<half::round_style>((1-((abs>>(25-(abs>>10)))&1))<<15));
+		if(arg.data_ <= 0x100 || (arg.data_ >= 0x4900 && arg.data_ < 0x8000)) // positive values with bits <= 0x100, or positive values >= 10 (0x4900): overflow
+			return half(detail::binary, detail::overflow<half::round_style>());
+		if(arg.data_ == 0x3C00) // argument 1 is returned unchanged
+			return arg;
+		return half(detail::binary, detail::gamma<half::round_style,false>(arg.data_)); // general case is delegated to detail::gamma<..., false>()
+	#endif
+	}
+
+	/// \}
+	/// \anchor rounding
+	/// \name Rounding
+	/// \{
+
+	/// Round upwards to an integral value.
+	/// **See also:** Documentation for [std::ceil](https://en.cppreference.com/w/cpp/numeric/math/ceil).
+	/// \param arg half to round
+	/// \return nearest integer not less than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half ceil(half arg)
+	{
+		return half(detail::binary, detail::integral<std::round_toward_infinity,true,true>(arg.data_));
+	}
+
+	/// Round downwards to an integral value.
+	/// **See also:** Documentation for [std::floor](https://en.cppreference.com/w/cpp/numeric/math/floor).
+	/// \param arg half to round
+	/// \return nearest integer not greater than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half floor(half arg)
+	{
+		return half(detail::binary, detail::integral<std::round_toward_neg_infinity,true,true>(arg.data_));
+	}
+
+	/// Round towards zero to an integral value.
+	/// **See also:** Documentation for [std::trunc](https://en.cppreference.com/w/cpp/numeric/math/trunc).
+	/// \param arg half to round
+	/// \return nearest integer not greater in magnitude than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half trunc(half arg)
+	{
+		return half(detail::binary, detail::integral<std::round_toward_zero,true,true>(arg.data_));
+	}
+
+	/// Round to the nearest integral value, half-way cases away from zero.
+	/// **See also:** Documentation for [std::round](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half round(half arg)
+	{
+		return half(detail::binary, detail::integral<std::round_to_nearest,false,true>(arg.data_));
+	}
+
+	/// Round to the nearest integer and return it as `long`, half-way cases away from zero.
+	/// **See also:** Documentation for [std::lround](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID if value is not representable as `long`
+	inline long lround(half arg)
+	{
+		return detail::half2int<std::round_to_nearest,false,false,long>(arg.data_);
+	}
+
+	/// Round to an integral value according to half's internal rounding mode.
+	/// **See also:** Documentation for [std::rint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half rint(half arg)
+	{
+		return half(detail::binary, detail::integral<half::round_style,true,true>(arg.data_));
+	}
+
+	/// Round to an integer according to half's internal rounding mode and return it as `long`.
+	/// **See also:** Documentation for [std::lrint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID if value is not representable as `long`
+	/// \exception FE_INEXACT if value had to be rounded
+	inline long lrint(half arg)
+	{
+		return detail::half2int<half::round_style,true,true,long>(arg.data_);
+	}
+
+	/// Round to an integral value according to half's internal rounding mode.
+	/// Unlike rint(), no FE_INEXACT is documented for this function.
+	/// **See also:** Documentation for [std::nearbyint](https://en.cppreference.com/w/cpp/numeric/math/nearbyint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID for signaling NaN
+	inline half nearbyint(half arg)
+	{
+		return half(detail::binary, detail::integral<half::round_style,true,false>(arg.data_));
+	}
+#if HALF_ENABLE_CPP11_LONG_LONG
+	/// Round to the nearest integer and return it as `long long`, half-way cases away from zero.
+	/// **See also:** Documentation for [std::llround](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID if value is not representable as `long long`
+	inline long long llround(half arg)
+	{
+		return detail::half2int<std::round_to_nearest,false,false,long long>(arg.data_);
+	}
+
+	/// Round to an integer according to half's internal rounding mode and return it as `long long`.
+	/// **See also:** Documentation for [std::llrint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID if value is not representable as `long long`
+	/// \exception FE_INEXACT if value had to be rounded
+	inline long long llrint(half arg)
+	{
+		return detail::half2int<half::round_style,true,true,long long>(arg.data_);
+	}
+#endif
+
+	/// \}
+	/// \anchor float
+	/// \name Floating point manipulation
+	/// \{
+
+	/// Split a number into a significand and a power-of-two exponent.
+	/// **See also:** Documentation for [std::frexp](https://en.cppreference.com/w/cpp/numeric/math/frexp).
+	/// \param arg number to decompress
+	/// \param exp address to store exponent at (0 is stored for zero, infinity and NaN)
+	/// \return significand in range [0.5, 1)
+	/// \exception FE_INVALID for signaling NaN
+	inline half frexp(half arg, int *exp)
+	{
+		unsigned int magnitude = arg.data_ & 0x7FFF;
+		*exp = 0;
+
+		// Zero and infinity are handed back untouched, NaN is signalled.
+		if(!magnitude || magnitude >= 0x7C00)
+		{
+			if(magnitude > 0x7C00)
+				return half(detail::binary, detail::signal(arg.data_));
+			return arg;
+		}
+
+		// Bring a subnormal up into the normal range, remembering how far it had to move.
+		int shifts = 0;
+		while(magnitude < 0x400)
+		{
+			magnitude <<= 1;
+			++shifts;
+		}
+		*exp = static_cast<int>(magnitude>>10) - 14 - shifts;
+
+		// Keep sign and mantissa, replace the exponent field by 0x3800 (values in [0.5, 1)).
+		return half(detail::binary, (arg.data_&0x8000)|0x3800|(magnitude&0x3FF));
+	}
+
+	/// Multiply by power of two.
+	/// This function is exact to rounding for all rounding modes.
+	/// Any \a exp is accepted: exponents far outside the representable range yield the regular
+	/// overflow/underflow result instead of overflowing the internal `long` arithmetic.
+	///
+	/// **See also:** Documentation for [std::scalbln](https://en.cppreference.com/w/cpp/numeric/math/scalbn).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp (zeros and infinities are returned unchanged)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half scalbln(half arg, long exp)
+	{
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000;
+		if(abs >= 0x7C00 || !abs)
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;
+		// The normalisation below moves exp by at most -10 and the exponent field adds at most 30, so every
+		// finite non-zero input overflows for exp >= 40 and underflows for exp <= -41. Clamping to +/-64
+		// therefore never changes the result; it only keeps `--exp` and `exp += ...` from overflowing `long`
+		// (undefined behaviour) for arguments near LONG_MAX or LONG_MIN.
+		if(exp > 64)
+			exp = 64;
+		else if(exp < -64)
+			exp = -64;
+		// Normalise subnormals so that the implicit bit sits at 0x400.
+		for(; abs<0x400; abs<<=1,--exp) ;
+		// exp now becomes the biased exponent of the scaled result.
+		exp += static_cast<long>(abs >> 10);
+		if(exp > 30)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -10)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		else if(exp > 0)
+			return half(detail::binary, sign|(exp<<10)|(abs&0x3FF));
+		// Subnormal result: shift the significand (implicit bit restored) right by 1-exp; the bit just below
+		// the result and the OR of all lower bits are passed on to detail::rounded().
+		unsigned int m = (abs&0x3FF) | 0x400;
+		return half(detail::binary, detail::rounded<half::round_style,false>(sign|(m>>(1-exp)), (m>>-exp)&1, (m&((1<<-exp)-1))!=0));
+	}
+
+	/// Multiply by power of two (`int` exponent).
+	/// This function is exact to rounding for all rounding modes. It forwards to scalbln().
+	///
+	/// **See also:** Documentation for [std::scalbn](https://en.cppreference.com/w/cpp/numeric/math/scalbn).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half scalbn(half arg, int exp)
+	{
+		return scalbln(arg, static_cast<long>(exp));
+	}
+
+	/// Multiply by power of two (`int` exponent).
+	/// This function is exact to rounding for all rounding modes. It forwards to scalbln().
+	///
+	/// **See also:** Documentation for [std::ldexp](https://en.cppreference.com/w/cpp/numeric/math/ldexp).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half ldexp(half arg, int exp)
+	{
+		return scalbln(arg, static_cast<long>(exp));
+	}
+
+	/// Split a number into its integer and fractional parts.
+	/// **See also:** Documentation for [std::modf](https://en.cppreference.com/w/cpp/numeric/math/modf).
+	/// \param arg number to decompress
+	/// \param iptr address to store integer part at
+	/// \return fractional part (carries the sign of \a arg)
+	/// \exception FE_INVALID for signaling NaN
+	inline half modf(half arg, half *iptr)
+	{
+		const unsigned int signBit = arg.data_ & 0x8000;
+		const unsigned int magnitude = arg.data_ & 0x7FFF;
+
+		// NaN: both outputs receive the signalled NaN.
+		if(magnitude > 0x7C00)
+		{
+			const half quiet(detail::binary, detail::signal(arg.data_));
+			*iptr = quiet;
+			return quiet;
+		}
+
+		// From 1024 (0x6400) upwards, infinity included, no fractional bits exist.
+		if(magnitude >= 0x6400)
+		{
+			*iptr = arg;
+			return half(detail::binary, signBit);
+		}
+
+		// Below one the integer part is a signed zero and the value itself is the fraction.
+		if(magnitude < 0x3C00)
+		{
+			iptr->data_ = signBit;
+			return arg;
+		}
+
+		// Separate the mantissa bits lying below the binary point.
+		unsigned int exponent = magnitude >> 10;
+		const unsigned int fractionMask = (1<<(25-exponent)) - 1;
+		unsigned int fraction = arg.data_ & fractionMask;
+		iptr->data_ = arg.data_ & ~fractionMask;
+		if(!fraction)
+			return half(detail::binary, signBit);
+
+		// Renormalise the fractional bits into a half of their own.
+		while(fraction < 0x400)
+		{
+			fraction <<= 1;
+			--exponent;
+		}
+		return half(detail::binary, signBit|(exponent<<10)|(fraction&0x3FF));
+	}
+
+	/// Extract the unbiased exponent as an integer.
+	/// **See also:** Documentation for [std::ilogb](https://en.cppreference.com/w/cpp/numeric/math/ilogb).
+	/// \param arg number to query
+	/// \return floating-point exponent
+	/// \retval FP_ILOGB0 for zero
+	/// \retval FP_ILOGBNAN for NaN
+	/// \retval INT_MAX for infinity
+	/// \exception FE_INVALID for zero, infinite or NaN values
+	inline int ilogb(half arg)
+	{
+		int magnitude = arg.data_ & 0x7FFF;
+
+		// Zero, infinity and NaN all raise FE_INVALID and map to the dedicated return codes.
+		if(!magnitude || magnitude >= 0x7C00)
+		{
+			detail::raise(FE_INVALID);
+			if(!magnitude)
+				return FP_ILOGB0;
+			return (magnitude==0x7C00) ? INT_MAX : FP_ILOGBNAN;
+		}
+
+		// Start from the exponent field; for subnormals lower it once per leading zero below bit 9.
+		int exponent = (magnitude>>10) - 15;
+		while(magnitude < 0x200)
+		{
+			magnitude <<= 1;
+			--exponent;
+		}
+		return exponent;
+	}
+
+	/// Extract the unbiased exponent as a half-precision value.
+	/// **See also:** Documentation for [std::logb](https://en.cppreference.com/w/cpp/numeric/math/logb).
+	/// \param arg number to query
+	/// \return floating-point exponent (positive infinity for infinite arguments)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_DIVBYZERO for 0
+	inline half logb(half arg)
+	{
+		int magnitude = arg.data_ & 0x7FFF;
+		if(!magnitude)
+			return half(detail::binary, detail::pole(0x8000));
+		if(magnitude >= 0x7C00)
+			return half(detail::binary, (magnitude>0x7C00) ? detail::signal(arg.data_) : 0x7C00);
+
+		// Unbiased exponent; for subnormals lower it once per leading zero below bit 9.
+		int exponent = (magnitude>>10) - 15;
+		while(magnitude < 0x200)
+		{
+			magnitude <<= 1;
+			--exponent;
+		}
+
+		// Encode the small integer as a half: sign first, then a normalised |exponent|.
+		unsigned int bits = static_cast<unsigned>(exponent<0) << 15;
+		if(exponent != 0)
+		{
+			unsigned int mantissa = std::abs(exponent) << 6;
+			int biased = 18;
+			while(mantissa < 0x400)
+			{
+				mantissa <<= 1;
+				--biased;
+			}
+			// mantissa still contains its leading bit at 0x400, which bumps the exponent field by one.
+			bits |= (biased<<10) + mantissa;
+		}
+		return half(detail::binary, bits);
+	}
+
+	/// Next representable value.
+	/// **See also:** Documentation for [std::nextafter](https://en.cppreference.com/w/cpp/numeric/math/nextafter).
+	/// \param from value to compute next representable value for
+	/// \param to direction towards which to compute next value
+	/// \return next representable value after \a from in direction towards \a to (\a to itself if both are equal or both are zero)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW for infinite result from finite argument
+	/// \exception FE_UNDERFLOW for subnormal result
+	inline half nextafter(half from, half to)
+	{
+		int fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; // magnitudes of both operands
+		if(fabs > 0x7C00 || tabs > 0x7C00) // NaN operand: result comes from detail::signal()
+			return half(detail::binary, detail::signal(from.data_, to.data_));
+		if(from.data_ == to.data_ || !(fabs|tabs)) // identical bit patterns, or both zero regardless of sign
+			return to;
+		if(!fabs) // stepping off zero: smallest subnormal (bits 1) with the sign of `to`
+		{
+			detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT);
+			return half(detail::binary, (to.data_&0x8000)+1);
+		}
+		unsigned int out = from.data_ + (((from.data_>>15)^static_cast<unsigned>( // bit pattern +1 or -1: the added term is 2*(sign(from) XOR [from < to]) - 1
+			(from.data_^(0x8000|(0x8000-(from.data_>>15))))<(to.data_^(0x8000|(0x8000-(to.data_>>15))))))<<1) - 1; // comparison keys: positives get bit 15 set, negatives get all 16 bits inverted
+		detail::raise(FE_OVERFLOW, fabs<0x7C00 && (out&0x7C00)==0x7C00); // finite input stepped into an all-ones exponent field
+		detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && (out&0x7C00)<0x400); // result has a zero exponent field
+		return half(detail::binary, out);
+	}
+
+	/// Next representable value, with the direction given as `long double`.
+	/// **See also:** Documentation for [std::nexttoward](https://en.cppreference.com/w/cpp/numeric/math/nexttoward).
+	/// \param from value to compute next representable value for
+	/// \param to direction towards which to compute next value
+	/// \return next representable value after \a from in direction towards \a to (\a to converted to half if it is NaN or equal to \a from)
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW for infinite result from finite argument
+	/// \exception FE_UNDERFLOW for subnormal result
+	inline half nexttoward(half from, long double to)
+	{
+		int fabs = from.data_ & 0x7FFF; // magnitude of `from`
+		if(fabs > 0x7C00) // `from` is NaN: result comes from detail::signal()
+			return half(detail::binary, detail::signal(from.data_));
+		long double lfrom = static_cast<long double>(from); // widen once so the comparisons with `to` happen in long double
+		if(detail::builtin_isnan(to) || lfrom == to) // NaN direction or nothing to step: convert `to` (via float) to half
+			return half(static_cast<float>(to));
+		if(!fabs) // stepping off zero: smallest subnormal (bits 1) with the sign of `to`
+		{
+			detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT);
+			return half(detail::binary, (static_cast<unsigned>(detail::builtin_signbit(to))<<15)+1);
+		}
+		unsigned int out = from.data_ + (((from.data_>>15)^static_cast<unsigned>(lfrom<to))<<1) - 1; // bit pattern +1 or -1: the added term is 2*(sign(from) XOR [from < to]) - 1
+		detail::raise(FE_OVERFLOW, (out&0x7FFF)==0x7C00); // result is an infinity
+		detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && (out&0x7FFF)<0x400); // result magnitude is below the smallest normal number
+		return half(detail::binary, out);
+	}
+
+	/// Combine the magnitude of one value with the sign of another.
+	/// **See also:** Documentation for [std::copysign](https://en.cppreference.com/w/cpp/numeric/math/copysign).
+	/// \param x value to change sign for
+	/// \param y value to take sign from
+	/// \return value equal to \a x in magnitude and to \a y in sign
+	inline HALF_CONSTEXPR half copysign(half x, half y)
+	{
+		return half(detail::binary, (x.data_&0x7FFF)|(y.data_&0x8000));
+	}
+
+	/// \}
+	/// \anchor classification
+	/// \name Floating point classification
+	/// \{
+
+	/// Classify floating-point value.
+	/// **See also:** Documentation for [std::fpclassify](https://en.cppreference.com/w/cpp/numeric/math/fpclassify).
+	/// \param arg number to classify
+	/// \retval FP_ZERO for positive and negative zero
+	/// \retval FP_SUBNORMAL for subnormal numbers
+	/// \retval FP_INFINITE for positive and negative infinity
+	/// \retval FP_NAN for NaNs
+	/// \retval FP_NORMAL for all other (normal) values
+	inline HALF_CONSTEXPR int fpclassify(half arg)
+	{
+		// Single return expression (works as a C++11 constexpr body); tested from the largest magnitudes down.
+		return	((arg.data_&0x7FFF)>0x7C00) ? FP_NAN :
+				((arg.data_&0x7FFF)==0x7C00) ? FP_INFINITE :
+				((arg.data_&0x7FFF)>=0x400) ? FP_NORMAL :
+				(arg.data_&0x7FFF) ? FP_SUBNORMAL :
+				FP_ZERO;
+	}
+
+	/// Check if finite number.
+	/// A value is finite when its exponent field is not all ones.
+	/// **See also:** Documentation for [std::isfinite](https://en.cppreference.com/w/cpp/numeric/math/isfinite).
+	/// \param arg number to check
+	/// \retval true if neither infinity nor NaN
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isfinite(half arg)
+	{
+		return (arg.data_&0x7C00) < 0x7C00;
+	}
+
+	/// Check for infinity.
+	/// Infinity has an all-ones exponent field and an all-zero mantissa; the sign is ignored.
+	/// **See also:** Documentation for [std::isinf](https://en.cppreference.com/w/cpp/numeric/math/isinf).
+	/// \param arg number to check
+	/// \retval true for positive or negative infinity
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isinf(half arg)
+	{
+		return !((arg.data_&0x7FFF) ^ 0x7C00);
+	}
+
+	/// Check for NaN.
+	/// Every magnitude above the infinity pattern 0x7C00 is a NaN.
+	/// **See also:** Documentation for [std::isnan](https://en.cppreference.com/w/cpp/numeric/math/isnan).
+	/// \param arg number to check
+	/// \retval true for NaNs
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isnan(half arg)
+	{
+		return (arg.data_&0x7FFF) >= 0x7C01;
+	}
+
+	/// Check if normal number.
+	/// A normal number has an exponent field that is neither all zeros nor all ones.
+	/// **See also:** Documentation for [std::isnormal](https://en.cppreference.com/w/cpp/numeric/math/isnormal).
+	/// \param arg number to check
+	/// \retval true if normal number
+	/// \retval false if either subnormal, zero, infinity or NaN
+	inline HALF_CONSTEXPR bool isnormal(half arg)
+	{
+		return (arg.data_&0x7C00) != 0 && (arg.data_&0x7C00) != 0x7C00;
+	}
+
+	/// Check sign.
+	/// Only the sign bit (0x8000) is inspected.
+	/// **See also:** Documentation for [std::signbit](https://en.cppreference.com/w/cpp/numeric/math/signbit).
+	/// \param arg number to check
+	/// \retval true for negative number
+	/// \retval false for positive number
+	inline HALF_CONSTEXPR bool signbit(half arg)
+	{
+		return static_cast<bool>(arg.data_ & 0x8000);
+	}
+
+	/// \}
+	/// \anchor compfunc
+	/// \name Comparison
+	/// \{
+
+	/// Quiet comparison for greater than.
+	/// Both operands are mapped to order-preserving integer keys (positives get bit 15 set, negatives are
+	/// bit-inverted; adding the sign bit makes -0 and +0 share one key) and the keys are compared.
+	/// **See also:** Documentation for [std::isgreater](https://en.cppreference.com/w/cpp/numeric/math/isgreater).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater than \a y
+	/// \retval false else (including any NaN operand)
+	inline HALF_CONSTEXPR bool isgreater(half x, half y)
+	{
+		return !isnan(x) && !isnan(y) &&
+			((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) < ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15));
+	}
+
+	/// Quiet comparison for greater equal.
+	/// Both operands are mapped to order-preserving integer keys (positives get bit 15 set, negatives are
+	/// bit-inverted; adding the sign bit makes -0 and +0 share one key) and the keys are compared.
+	/// **See also:** Documentation for [std::isgreaterequal](https://en.cppreference.com/w/cpp/numeric/math/isgreaterequal).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater equal \a y
+	/// \retval false else (including any NaN operand)
+	inline HALF_CONSTEXPR bool isgreaterequal(half x, half y)
+	{
+		return !isnan(x) && !isnan(y) &&
+			((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) <= ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15));
+	}
+
+	/// Quiet comparison for less than.
+	/// **See also:** Documentation for [std::isless](https://en.cppreference.com/w/cpp/numeric/math/isless).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less than \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isless(half x, half y)
+	{
+		// NaN operands never compare; otherwise each 16-bit pattern is turned into an ordering key
+		// (0x8000+magnitude for positive, 0x8000-magnitude for negative, so both zeros share one key).
+		return !isnan(x) && !isnan(y) &&
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) < ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15));
+	}
+
+	/// Quiet comparison for less equal.
+	/// **See also:** Documentation for [std::islessequal](https://en.cppreference.com/w/cpp/numeric/math/islessequal).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less equal \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool islessequal(half x, half y)
+	{
+		// NaN operands never compare; otherwise each 16-bit pattern is turned into an ordering key
+		// (0x8000+magnitude for positive, 0x8000-magnitude for negative, so both zeros share one key).
+		return !isnan(x) && !isnan(y) &&
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) <= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15));
+	}
+
+	/// Quiet comparison for less or greater.
+	/// **See also:** Documentation for [std::islessgreater](https://en.cppreference.com/w/cpp/numeric/math/islessgreater).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if either less or greater
+	/// \retval false else
+	inline HALF_CONSTEXPR bool islessgreater(half x, half y)
+	{
+		// ordered operands differ unless the patterns are identical or both are zeros (+0 / -0)
+		return !isnan(x) && !isnan(y) && x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF)!=0;
+	}
+
+	/// Quiet check if unordered.
+	/// **See also:** Documentation for [std::isunordered](https://en.cppreference.com/w/cpp/numeric/math/isunordered).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if unordered (one or two NaN operands)
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isunordered(half x, half y)
+	{
+		// unordered is the negation of "both operands are proper (non-NaN) numbers"
+		return !(!isnan(x) && !isnan(y));
+	}
+
+	/// \}
+	/// \anchor casting
+	/// \name Casting
+	/// \{
+
+	/// Cast to or from half-precision floating-point number.
+	/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted 
+	/// directly using the default rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+	///
+	/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types 
+	/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler 
+	/// error and casting between [half](\ref half_float::half)s returns the argument unmodified.
+	/// \tparam T destination type (half or built-in arithmetic type)
+	/// \tparam U source type (half or built-in arithmetic type)
+	/// \param arg value to cast
+	/// \return \a arg converted to destination type
+	/// \exception FE_INVALID if \a T is integer type and result is not representable as \a T
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename T,typename U> T half_cast(U arg)
+	{
+		// all conversion work is delegated to detail::half_caster for this type pair
+		typedef detail::half_caster<T,U> caster;
+		return caster::cast(arg);
+	}
+
+	/// Cast to or from half-precision floating-point number.
+	/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted 
+	/// directly using the specified rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+	///
+	/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types 
+	/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler 
+	/// error and casting between [half](\ref half_float::half)s returns the argument unmodified.
+	/// \tparam T destination type (half or built-in arithmetic type)
+	/// \tparam R rounding mode to use.
+	/// \tparam U source type (half or built-in arithmetic type)
+	/// \param arg value to cast
+	/// \return \a arg converted to destination type
+	/// \exception FE_INVALID if \a T is integer type and result is not representable as \a T
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename T,std::float_round_style R,typename U> T half_cast(U arg)
+	{
+		// same delegation as the two-parameter overload, with the rounding mode R passed through
+		typedef detail::half_caster<T,U,R> caster;
+		return caster::cast(arg);
+	}
+	/// \}
+
+	/// \}
+	/// \anchor errors
+	/// \name Error handling
+	/// \{
+
+	/// Clear exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::feclearexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feclearexcept).
+	/// \param excepts OR of exceptions to clear
+	/// \retval 0 all selected flags cleared successfully
+	inline int feclearexcept(int excepts)
+	{
+		// drop exactly the selected bits from the stored flag word
+		detail::errflags() &= ~excepts;
+		return 0;
+	}
+
+	/// Test exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fetestexcept](https://en.cppreference.com/w/cpp/numeric/fenv/fetestexcept).
+	/// \param excepts OR of exceptions to test
+	/// \return OR of selected exceptions if raised
+	inline int fetestexcept(int excepts)
+	{
+		// keep only the requested bits of the stored flag word
+		return excepts & detail::errflags();
+	}
+
+	/// Raise exception flags.
+	/// This raises the specified floating point exceptions and also invokes any additional automatic exception handling as 
+	/// configured with the [HALF_ERRHANDLING_...](\ref HALF_ERRHANDLING_ERRNO) preprocessor symbols.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::feraiseexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feraiseexcept).
+	/// \param excepts OR of exceptions to raise
+	/// \retval 0 all selected exceptions raised successfully
+	inline int feraiseexcept(int excepts)
+	{
+		// record the flags first, then hand them to detail::raise
+		detail::errflags() |= excepts;
+		detail::raise(excepts);
+		return 0;
+	}
+
+	/// Save exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fegetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag).
+	/// \param flagp address to store flag state at
+	/// \param excepts OR of flags to save
+	/// \retval 0 for success
+	inline int fegetexceptflag(int *flagp, int excepts)
+	{
+		// snapshot only the selected flags into the caller's storage
+		const int selected = detail::errflags() & excepts;
+		*flagp = selected;
+		return 0;
+	}
+
+	/// Restore exception flags.
+	/// This only copies the specified exception state (including unset flags) without incurring any additional exception handling.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fesetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag).
+	/// \param flagp address to take flag state from
+	/// \param excepts OR of flags to restore
+	/// \retval 0 for success
+	inline int fesetexceptflag(const int *flagp, int excepts)
+	{
+		// bits selected by excepts are copied from the saved state (set or cleared);
+		// every other bit keeps its current value
+		const int saved = *flagp;
+		detail::errflags() = (detail::errflags()|(saved&excepts)) & (saved|~excepts);
+		return 0;
+	}
+
+	/// Throw C++ exceptions based on set exception flags.
+	/// This function manually throws a corresponding C++ exception if one of the specified flags is set, 
+	/// no matter if automatic throwing (via [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID)) is enabled or not.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	/// \param excepts OR of exceptions to test
+	/// \param msg error message to use for exception description
+	/// \throw std::domain_error if `FE_INVALID` or `FE_DIVBYZERO` is selected and set
+	/// \throw std::overflow_error if `FE_OVERFLOW` is selected and set
+	/// \throw std::underflow_error if `FE_UNDERFLOW` is selected and set
+	/// \throw std::range_error if `FE_INEXACT` is selected and set
+	inline void fethrowexcept(int excepts, const char *msg = "")
+	{
+		// only flags that are both requested and currently set can trigger a throw;
+		// the checks run in a fixed priority order, so at most one exception is thrown
+		const int raised = excepts & detail::errflags();
+		if(raised & (FE_INVALID|FE_DIVBYZERO))
+		{
+			throw std::domain_error(msg);
+		}
+		if(raised & FE_OVERFLOW)
+		{
+			throw std::overflow_error(msg);
+		}
+		if(raised & FE_UNDERFLOW)
+		{
+			throw std::underflow_error(msg);
+		}
+		if(raised & FE_INEXACT)
+		{
+			throw std::range_error(msg);
+		}
+	}
+	/// \}
+}
+
+
+#undef HALF_UNUSED_NOERR
+#undef HALF_CONSTEXPR
+#undef HALF_CONSTEXPR_CONST
+#undef HALF_CONSTEXPR_NOERR
+#undef HALF_NOEXCEPT
+#undef HALF_NOTHROW
+#undef HALF_THREAD_LOCAL
+#undef HALF_TWOS_COMPLEMENT_INT
+#ifdef HALF_POP_WARNINGS
+	#pragma warning(pop)
+	#undef HALF_POP_WARNINGS
+#endif
+
+#endif
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/image1.png b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/image1.png
new file mode 100644
index 000000000..22e3d775c
Binary files /dev/null and b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/image1.png differ
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/image2.png b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/image2.png
new file mode 100644
index 000000000..fcfe5136d
Binary files /dev/null and b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/image2.png differ
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/lodepng/lodepng.cpp b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/lodepng/lodepng.cpp
new file mode 100644
index 000000000..43af8e73c
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/lodepng/lodepng.cpp
@@ -0,0 +1,6234 @@
+/*
+LodePNG version 20170917
+
+Copyright (c) 2005-2017 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+
+/*
+The manual and changelog are in the header file "lodepng.h"
+Rename this file to lodepng.cpp to use it for C++, or to lodepng.c to use it for C.
+*/
+
+#include "lodepng.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1310) /*Visual Studio: A few warning types are not desired here.*/
+#pragma warning( disable : 4244 ) /*implicit conversions: not warned by gcc -Wall -Wextra and requires too much casts*/
+#pragma warning( disable : 4996 ) /*VS does not like fopen, but fopen_s is not standard C so unusable here*/
+#endif /*_MSC_VER */
+
+/*Version identifier of this LodePNG source; it matches the "LodePNG version"
+stated in the license header at the top of this file.*/
+const char* LODEPNG_VERSION_STRING = "20170917";
+
+/*
+This source file is built up in the following large parts. The code sections
+with the "LODEPNG_COMPILE_" #defines divide this up further in an intermixed way.
+-Tools for C and common code for PNG and Zlib
+-C Code for Zlib (huffman, deflate, ...)
+-C Code for PNG (file format chunks, adam7, PNG filters, color conversions, ...)
+-The C++ wrapper around all of the above
+*/
+
+/*The malloc, realloc and free functions defined here with "lodepng_" in front
+of the name, so that you can easily change them to others related to your
+platform if needed. Everything else in the code calls these. Pass
+-DLODEPNG_NO_COMPILE_ALLOCATORS to the compiler, or comment out
+#define LODEPNG_COMPILE_ALLOCATORS in the header, to disable the ones here and
+define them in your own project's source files without needing to change
+lodepng source code. Don't forget to remove "static" if you copypaste them
+from here.*/
+
+#ifdef LODEPNG_COMPILE_ALLOCATORS
+/*Default allocation hook: forwards directly to the C library malloc.
+Returns whatever malloc returns for the requested number of bytes.*/
+static void* lodepng_malloc(size_t size)
+{
+    return malloc(size);
+}
+
+/*Default reallocation hook: forwards directly to the C library realloc.
+ptr may be NULL (the vector helpers in this file call it that way for the first allocation).*/
+static void* lodepng_realloc(void* ptr, size_t new_size)
+{
+    return realloc(ptr, new_size);
+}
+
+/*Default release hook: forwards directly to the C library free.*/
+static void lodepng_free(void* ptr)
+{
+    free(ptr);
+}
+#else /*LODEPNG_COMPILE_ALLOCATORS*/
+void* lodepng_malloc(size_t size);
+void* lodepng_realloc(void* ptr, size_t new_size);
+void lodepng_free(void* ptr);
+#endif /*LODEPNG_COMPILE_ALLOCATORS*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // Tools for C, and common code for PNG and Zlib.                       // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Often in case of an error a value is assigned to a variable and then it breaks
+out of a loop (to go to the cleanup phase of a function). This macro does that.
+It makes the error handling code shorter and more readable.
+
+Example (using the ERROR_BREAK shorthand defined below):
+if(!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83);
+
+Note: every macro in this group expands to a brace-enclosed block, so its
+"break" or "return" acts on the loop or function the macro is used in.
+*/
+#define CERROR_BREAK(errorvar, code)\
+{\
+  errorvar = code;\
+  break;\
+}
+
+/*version of CERROR_BREAK that assumes the common case where the error variable is named "error"*/
+#define ERROR_BREAK(code) CERROR_BREAK(error, code)
+
+/*Set error var to the error code, and return it (for functions that return the error code).*/
+#define CERROR_RETURN_ERROR(errorvar, code)\
+{\
+  errorvar = code;\
+  return code;\
+}
+
+/*Try the code, if it returns error, also return the error.
+The result is held in a block-local variable named "error", and "call" is evaluated exactly once.*/
+#define CERROR_TRY_RETURN(call)\
+{\
+  unsigned error = call;\
+  if(error) return error;\
+}
+
+/*Set error var to the error code, and return from the void function.*/
+#define CERROR_RETURN(errorvar, code)\
+{\
+  errorvar = code;\
+  return;\
+}
+
+/*
+About uivector, ucvector and string:
+-All of them wrap dynamic arrays or text strings in a similar way.
+-LodePNG was originally written in C++. The vectors replace the std::vectors that were used in the C++ version.
+-The string tools are made to avoid problems with compilers that declare things like strncat as deprecated.
+-They're not used in the interface, only internally in this file as static functions.
+-As with many other structs in this file, the init and cleanup functions serve as ctor and dtor.
+*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*dynamic vector of unsigned ints; set up with uivector_init and released with uivector_cleanup*/
+typedef struct uivector
+{
+    unsigned* data; /*heap buffer holding the elements; NULL after init or cleanup*/
+    size_t size; /*number of elements in use (counted in unsigned ints, not bytes)*/
+    size_t allocsize; /*allocated size in bytes*/
+} uivector;
+
+/*Releases the storage of the uivector behind p and leaves it empty (data NULL, sizes 0).
+Takes a void* so the signature is type-agnostic; p must point to a uivector.*/
+static void uivector_cleanup(void* p)
+{
+    uivector* vec = (uivector*)p;
+    vec->allocsize = 0;
+    vec->size = 0;
+    lodepng_free(vec->data);
+    vec->data = NULL;
+}
+
+/*Makes sure at least allocsize BYTES are allocated. Returns 1 on success; returns 0 when the
+reallocation fails, in which case the vector is left untouched.*/
+static unsigned uivector_reserve(uivector* p, size_t allocsize)
+{
+    size_t grown;
+    void* moved;
+
+    if (allocsize <= p->allocsize) return 1; /*already large enough*/
+
+    /*a request beyond twice the current capacity is taken as-is, smaller growth gets 50% headroom*/
+    if (allocsize > p->allocsize * 2) grown = allocsize;
+    else grown = allocsize * 3 / 2;
+
+    moved = lodepng_realloc(p->data, grown);
+    if (!moved) return 0; /*error: not enough memory*/
+
+    p->data = (unsigned*)moved;
+    p->allocsize = grown;
+    return 1;
+}
+
+/*Sets the element count to size, growing the storage when needed. Returns 1 on success,
+0 when the allocation failed (the vector is then unchanged).*/
+static unsigned uivector_resize(uivector* p, size_t size)
+{
+    const size_t bytes = size * sizeof(unsigned);
+    if (uivector_reserve(p, bytes))
+    {
+        p->size = size;
+        return 1; /*success*/
+    }
+    return 0;
+}
+
+/*Resizes to size elements and stores value in every element added beyond the old size.
+Returns 1 on success, 0 if the resize failed.*/
+static unsigned uivector_resizev(uivector* p, size_t size, unsigned value)
+{
+    size_t pos = p->size; /*first index that did not exist before the resize*/
+    if (!uivector_resize(p, size)) return 0;
+    while (pos < size)
+    {
+        p->data[pos] = value;
+        ++pos;
+    }
+    return 1;
+}
+
+/*Puts p into the empty state: no buffer, zero elements, zero allocated bytes.*/
+static void uivector_init(uivector* p)
+{
+    p->allocsize = 0;
+    p->size = 0;
+    p->data = NULL;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends c as the new last element. Returns 1 on success, 0 if growing failed (nothing is changed).*/
+static unsigned uivector_push_back(uivector* p, unsigned c)
+{
+    const size_t slot = p->size; /*index the new element will occupy*/
+    if (!uivector_resize(p, slot + 1)) return 0;
+    p->data[slot] = c;
+    return 1;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*dynamic vector of unsigned chars (a growable byte buffer)*/
+typedef struct ucvector
+{
+    unsigned char* data; /*the bytes; heap memory, or a caller buffer adopted via ucvector_init_buffer*/
+    size_t size; /*used size, in bytes*/
+    size_t allocsize; /*allocated size, in bytes*/
+} ucvector;
+
+/*Makes sure at least allocsize bytes are allocated. Returns 1 on success; returns 0 when the
+reallocation fails, in which case the vector is left untouched.*/
+static unsigned ucvector_reserve(ucvector* p, size_t allocsize)
+{
+    size_t grown;
+    void* moved;
+
+    if (allocsize <= p->allocsize) return 1; /*already large enough*/
+
+    /*a request beyond twice the current capacity is taken as-is, smaller growth gets 50% headroom*/
+    if (allocsize > p->allocsize * 2) grown = allocsize;
+    else grown = allocsize * 3 / 2;
+
+    moved = lodepng_realloc(p->data, grown);
+    if (!moved) return 0; /*error: not enough memory*/
+
+    p->data = (unsigned char*)moved;
+    p->allocsize = grown;
+    return 1;
+}
+
+/*Sets the used size to size bytes, growing the storage when needed. Returns 1 on success,
+0 when the allocation failed (the vector is then unchanged).*/
+static unsigned ucvector_resize(ucvector* p, size_t size)
+{
+    const size_t bytes = size * sizeof(unsigned char);
+    if (ucvector_reserve(p, bytes))
+    {
+        p->size = size;
+        return 1; /*success*/
+    }
+    return 0;
+}
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/*Releases the storage of the ucvector behind p and leaves it empty (data NULL, sizes 0).
+Takes a void* so the signature is type-agnostic; p must point to a ucvector.*/
+static void ucvector_cleanup(void* p)
+{
+    ucvector* vec = (ucvector*)p;
+    vec->allocsize = 0;
+    vec->size = 0;
+    lodepng_free(vec->data);
+    vec->data = NULL;
+}
+
+/*Puts p into the empty state: no buffer, zero used bytes, zero allocated bytes.*/
+static void ucvector_init(ucvector* p)
+{
+    p->allocsize = 0;
+    p->size = 0;
+    p->data = NULL;
+}
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*you can convert both from vector to buffer&size and vice versa. If you use
+init_buffer to take over a buffer and size, it is not needed to use cleanup.
+The vector simply adopts the given pointer and treats all size bytes as both used and allocated.*/
+static void ucvector_init_buffer(ucvector* p, unsigned char* buffer, size_t size)
+{
+    p->size = size;
+    p->allocsize = size;
+    p->data = buffer;
+}
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#if (defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)
+/*Appends byte c at the end. Returns 1 on success, 0 if growing failed (nothing is changed).*/
+static unsigned ucvector_push_back(ucvector* p, unsigned char c)
+{
+    const size_t slot = p->size; /*index the new byte will occupy*/
+    if (!ucvector_resize(p, slot + 1)) return 0;
+    p->data[slot] = c;
+    return 1;
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*Reallocates *out to hold size characters plus a terminating 0, which is written at index size.
+Returns 1 on success; returns 0 when reallocation fails, leaving *out as it was.*/
+static unsigned string_resize(char** out, size_t size)
+{
+    char* grown = (char*)lodepng_realloc(*out, size + 1);
+    if (!grown) return 0;
+    grown[size] = 0; /*null termination char*/
+    *out = grown;
+    return 1;
+}
+
+/*Initializes *out as an empty, null-terminated heap string.
+The result of the allocation is not checked: if it fails, *out simply stays NULL.*/
+static void string_init(char** out)
+{
+    *out = NULL;
+    (void)string_resize(out, 0);
+}
+
+/*Frees the string created by string_init / string_set and resets *out to NULL.*/
+static void string_cleanup(char** out)
+{
+    char* owned = *out;
+    *out = NULL;
+    lodepng_free(owned);
+}
+
+/*Replaces the content of *out with a copy of the null-terminated string in.
+If the resize fails, *out is left unchanged and nothing is copied.*/
+static void string_set(char** out, const char* in)
+{
+    const size_t length = strlen(in);
+    size_t pos;
+
+    if (!string_resize(out, length)) return;
+
+    /*string_resize already wrote the terminator, only the characters remain to be copied*/
+    for (pos = 0; pos < length; ++pos) (*out)[pos] = in[pos];
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Reads a 32-bit big-endian (most significant byte first) unsigned value from the first
+4 bytes of buffer. buffer must have at least 4 readable bytes.
+*/
+unsigned lodepng_read32bitInt(const unsigned char* buffer)
+{
+    /*convert each byte to unsigned BEFORE shifting: an unsigned char is promoted to (signed) int,
+    and shifting a byte >= 0x80 left by 24 would move a 1 into the sign bit*/
+    return ((unsigned)buffer[0] << 24) | ((unsigned)buffer[1] << 16)
+         | ((unsigned)buffer[2] << 8) | (unsigned)buffer[3];
+}
+
+#if defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)
+/*Stores value into buffer[0..3] in big-endian order (most significant byte first).
+buffer must have at least 4 allocated bytes available.*/
+static void lodepng_set32bitInt(unsigned char* buffer, unsigned value)
+{
+    unsigned idx;
+    for (idx = 0; idx != 4; ++idx)
+    {
+        const unsigned shift = 24 - 8 * idx; /*24, 16, 8, 0*/
+        buffer[idx] = (unsigned char)((value >> shift) & 0xff);
+    }
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends value to buffer as 4 big-endian bytes.
+If the vector cannot grow, nothing is appended and the buffer is left untouched.
+todo: report the failure to the caller (this function returns void).*/
+static void lodepng_add32bitInt(ucvector* buffer, unsigned value)
+{
+    /*only write when the vector really grew: after a failed resize the size is unchanged, so
+    data[size - 4] would overwrite existing bytes, or index before the buffer when size < 4*/
+    if (!ucvector_resize(buffer, buffer->size + 4)) return;
+    lodepng_set32bitInt(&buffer->data[buffer->size - 4], value);
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / File IO                                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DISK
+
+/* returns negative value on error. This should be pure C compatible, so no fstat. */
+static long lodepng_filesize(const char* filename)
+{
+    FILE* file;
+    long size;
+    file = fopen(filename, "rb");
+    if (!file) return -1;
+
+    if (fseek(file, 0, SEEK_END) != 0)
+    {
+        fclose(file);
+        return -1;
+    }
+
+    size = ftell(file);
+    /* It may give LONG_MAX as directory size, this is invalid for us. */
+    if (size == LONG_MAX) size = -1;
+
+    fclose(file);
+    return size;
+}
+
+/* Reads exactly size bytes of the file into out, which must already have room for them.
+Returns 0 on success, 78 if the file cannot be opened or fewer than size bytes could be read. */
+static unsigned lodepng_buffer_file(unsigned char* out, size_t size, const char* filename)
+{
+    size_t got;
+    FILE* handle = fopen(filename, "rb");
+    if (!handle) return 78;
+
+    got = fread(out, 1, size, handle);
+    fclose(handle);
+
+    return (got == size) ? 0 : 78;
+}
+
+/*
+Loads the whole file into a newly allocated buffer.
+out: receives the buffer allocated with lodepng_malloc; it is assigned even when the read
+     fails afterwards, so the caller still has to release it.
+outsize: receives the file size in bytes.
+Returns 0 on success, 78 if the file cannot be sized or read, 83 if the allocation fails.
+*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename)
+{
+    const long measured = lodepng_filesize(filename);
+    size_t bytes;
+
+    if (measured < 0) return 78;
+    bytes = (size_t)measured;
+    *outsize = bytes;
+
+    *out = (unsigned char*)lodepng_malloc(bytes);
+    /*a NULL result only counts as a failure when bytes were actually requested*/
+    if (bytes > 0 && !(*out)) return 83;
+
+    return lodepng_buffer_file(*out, bytes, filename);
+}
+
+/*
+Writes buffersize bytes from buffer to the file, overwriting the file; it doesn't append to it.
+Returns 0 on success, or 79 when the file cannot be opened, the data cannot be written
+completely, or closing the file (which flushes buffered data) fails.
+*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename)
+{
+    FILE* file;
+    size_t written;
+    file = fopen(filename, "wb");
+    if (!file) return 79;
+    written = fwrite(buffer, 1, buffersize, file);
+    /*always close the handle, but report a short write or a failed flush instead of claiming success*/
+    if (fclose(file) != 0 || written != buffersize) return 79;
+    return 0;
+}
+
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of common code and tools. Begin of Zlib related code.            // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends a single bit to the bitstream (a ucvector*) and advances *bitpointer by one.
+A fresh zero byte is pushed whenever *bitpointer is at a byte boundary; the bit is then OR-ed
+into the last byte at position (*bitpointer & 7).
+TODO: this ignores potential out of memory errors: the result of ucvector_push_back is not
+checked, so after a failed push the bit is OR-ed into whatever byte is currently last.
+Note that bitpointer, bitstream and bit are macro arguments and are expanded more than once.*/
+#define addBitToStream(/*size_t**/ bitpointer, /*ucvector**/ bitstream, /*unsigned char*/ bit)\
+{\
+  /*add a new byte at the end*/\
+  if(((*bitpointer) & 7) == 0) ucvector_push_back(bitstream, (unsigned char)0);\
+  /*earlier bit of huffman code is in a lesser significant bit of an earlier byte*/\
+  (bitstream->data[bitstream->size - 1]) |= (bit << ((*bitpointer) & 0x7));\
+  ++(*bitpointer);\
+}
+
+/*Appends the nbits lowest bits of value to the stream, least significant bit first.*/
+static void addBitsToStream(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits)
+{
+    size_t shift = 0;
+    while (shift != nbits)
+    {
+        addBitToStream(bitpointer, bitstream, (unsigned char)((value >> shift) & 1));
+        ++shift;
+    }
+}
+
+/*Appends the nbits lowest bits of value to the stream, most significant of those bits first.*/
+static void addBitsToStreamReversed(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits)
+{
+    size_t remaining = nbits;
+    while (remaining != 0)
+    {
+        --remaining; /*counts nbits - 1 down to 0*/
+        addBitToStream(bitpointer, bitstream, (unsigned char)((value >> remaining) & 1));
+    }
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*Yields bit number bitpointer of bitstream as 0 or 1: byte index is bitpointer / 8 and bits are
+numbered from the least significant bit of each byte. No bounds checking is done, and the
+bitpointer argument is expanded twice.*/
+#define READBIT(bitpointer, bitstream) ((bitstream[bitpointer >> 3] >> (bitpointer & 0x7)) & (unsigned char)1)
+
+/*Returns the bit at position *bitpointer (0 or 1) and advances *bitpointer by one.*/
+static unsigned char readBitFromStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+    const size_t pos = (*bitpointer)++; /*read at the old position, leave the pointer one further*/
+    return (unsigned char)(READBIT(pos, bitstream));
+}
+
+/*Reads nbits bits starting at *bitpointer and assembles them into a value, the first bit read
+becoming the least significant one. *bitpointer is advanced by nbits.*/
+static unsigned readBitsFromStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+    unsigned value = 0;
+    unsigned shift;
+    for (shift = 0; shift != nbits; ++shift, ++(*bitpointer))
+    {
+        value += ((unsigned)READBIT(*bitpointer, bitstream)) << shift;
+    }
+    return value;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflate - Huffman                                                      / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*symbol range of the length codes within the literal/length alphabet (codes 257-285)*/
+#define FIRST_LENGTH_CODE_INDEX 257
+#define LAST_LENGTH_CODE_INDEX 285
+/*256 literals, the end code, some length codes, and 2 unused codes*/
+#define NUM_DEFLATE_CODE_SYMBOLS 288
+/*the distance codes have their own symbols, 30 used, 2 unused*/
+#define NUM_DISTANCE_SYMBOLS 32
+/*the code length codes. 0-15: code lengths, 16: copy previous 3-6 times, 17: 3-10 zeros, 18: 11-138 zeros*/
+#define NUM_CODE_LENGTH_CODES 19
+
+/*the base lengths represented by codes 257-285
+(29 entries; presumably entry k belongs to code FIRST_LENGTH_CODE_INDEX + k)*/
+static const unsigned LENGTHBASE[29]
+= { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59,
+67, 83, 99, 115, 131, 163, 195, 227, 258 };
+
+/*the extra bits used by codes 257-285 (added to base length); same indexing as LENGTHBASE*/
+static const unsigned LENGTHEXTRA[29]
+= { 0, 0, 0, 0, 0, 0, 0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+4,  4,  4,   4,   5,   5,   5,   5,   0 };
+
+/*the base backwards distances (the bits of distance codes appear after length codes and use their own huffman tree);
+one entry for each of the 30 used distance symbols*/
+static const unsigned DISTANCEBASE[30]
+= { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
+769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577 };
+
+/*the extra bits of backwards distances (added to base); same indexing as DISTANCEBASE*/
+static const unsigned DISTANCEEXTRA[30]
+= { 0, 0, 0, 0, 1, 1, 2,  2,  3,  3,  4,  4,  5,  5,   6,   6,   7,   7,   8,
+8,    9,    9,   10,   10,   11,   11,   12,    12,    13,    13 };
+
+/*the order in which "code length alphabet code lengths" are stored, out of this
+the huffman tree of the dynamic huffman tree lengths is generated*/
+static const unsigned CLCL_ORDER[NUM_CODE_LENGTH_CODES]
+= { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Huffman tree struct, containing multiple representations of the tree.
+The three pointers are owned by the tree: HuffmanTree_init zeroes them and
+HuffmanTree_cleanup frees them.
+*/
+typedef struct HuffmanTree
+{
+    unsigned* tree2d; /*decoder lookup table of numcodes * 2 entries, built by HuffmanTree_make2DTree*/
+    unsigned* tree1d; /*the code value assigned to each symbol (numcodes entries)*/
+    unsigned* lengths; /*the lengths of the codes of the 1d-tree*/
+    unsigned maxbitlen; /*maximum number of bits a single code can get*/
+    unsigned numcodes; /*number of symbols in the alphabet = number of codes*/
+} HuffmanTree;
+
+/*function used for debug purposes to draw the tree in ascii art with C++*/
+/*
+static void HuffmanTree_draw(HuffmanTree* tree)
+{
+std::cout << "tree. length: " << tree->numcodes << " maxbitlen: " << tree->maxbitlen << std::endl;
+for(size_t i = 0; i != tree->tree1d.size; ++i)
+{
+if(tree->lengths.data[i])
+std::cout << i << " " << tree->tree1d.data[i] << " " << tree->lengths.data[i] << std::endl;
+}
+std::cout << std::endl;
+}*/
+
+/*Clears the three owned arrays so HuffmanTree_cleanup is safe on a tree that was never built.
+maxbitlen and numcodes are not touched here.*/
+static void HuffmanTree_init(HuffmanTree* tree)
+{
+    tree->lengths = NULL;
+    tree->tree1d = NULL;
+    tree->tree2d = NULL;
+}
+
+/*Frees the arrays owned by the tree. The pointers themselves are not reset.*/
+static void HuffmanTree_cleanup(HuffmanTree* tree)
+{
+    lodepng_free(tree->lengths);
+    lodepng_free(tree->tree1d);
+    lodepng_free(tree->tree2d);
+}
+
+/*
+Builds tree->tree2d, the tree representation used by the decoder, from tree->tree1d (the code
+of each symbol) and tree->lengths (the bit length of each code); numcodes must be set.
+Return value is error: 0 on success, 83 if the allocation fails, 55 if the codes do not fit
+the table (see below).
+*/
+static unsigned HuffmanTree_make2DTree(HuffmanTree* tree)
+{
+    unsigned nodefilled = 0; /*up to which node it is filled*/
+    unsigned treepos = 0; /*position in the tree (1 of the numcodes columns)*/
+    unsigned n, i;
+
+    tree->tree2d = (unsigned*)lodepng_malloc(tree->numcodes * 2 * sizeof(unsigned));
+    if (!tree->tree2d) return 83; /*alloc fail*/
+
+    /*
+    convert tree1d[] to tree2d[][]. In the 2D array, a value of 32767 means
+    uninited, a value >= numcodes is an address to another bit, a value < numcodes
+    is a code. The 2 rows are the 2 possible bit values (0 or 1), there are as
+    many columns as codes - 1.
+    A good huffman tree has N * 2 - 1 nodes, of which N - 1 are internal nodes.
+    Here, the internal nodes are stored (what their 0 and 1 option point to).
+    There is only memory for such good tree currently, if there are more nodes
+    (due to too long length codes), error 55 will happen
+    */
+    for (n = 0; n < tree->numcodes * 2; ++n)
+    {
+        tree->tree2d[n] = 32767; /*32767 here means the tree2d isn't filled there yet*/
+    }
+
+    for (n = 0; n < tree->numcodes; ++n) /*the codes*/
+    {
+        /*walk the code from its most significant bit down; symbols with length 0 are skipped*/
+        for (i = 0; i != tree->lengths[n]; ++i) /*the bits for this code*/
+        {
+            unsigned char bit = (unsigned char)((tree->tree1d[n] >> (tree->lengths[n] - i - 1)) & 1);
+            /*oversubscribed, see comment in lodepng_error_text.
+            The first test catches a treepos that wrapped around in the unsigned subtraction at
+            the bottom of this loop (which happens when the entry followed there held a code
+            rather than an address); the second keeps 2 * treepos + bit inside the table*/
+            if (treepos > 2147483647 || treepos + 2 > tree->numcodes) return 55;
+            if (tree->tree2d[2 * treepos + bit] == 32767) /*not yet filled in*/
+            {
+                if (i + 1 == tree->lengths[n]) /*last bit*/
+                {
+                    tree->tree2d[2 * treepos + bit] = n; /*put the current code in it*/
+                    treepos = 0; /*the next code starts again at the root*/
+                }
+                else
+                {
+                    /*put address of the next step in here, first that address has to be found of course
+                    (it's just nodefilled + 1)...*/
+                    ++nodefilled;
+                    /*addresses encoded with numcodes added to it*/
+                    tree->tree2d[2 * treepos + bit] = nodefilled + tree->numcodes;
+                    treepos = nodefilled;
+                }
+            }
+            /*entry already filled: follow it, removing the numcodes offset of an address*/
+            else treepos = tree->tree2d[2 * treepos + bit] - tree->numcodes;
+        }
+    }
+
+    for (n = 0; n < tree->numcodes * 2; ++n)
+    {
+        if (tree->tree2d[n] == 32767) tree->tree2d[n] = 0; /*remove possible remaining 32767's*/
+    }
+
+    return 0;
+}
+
+/*
+Second step for the ...makeFromLengths and ...makeFromFrequencies functions:
+assigns a code to every symbol with a non-zero length (stored in tree1d) and then builds the
+2D decoder tree. numcodes, lengths and maxbitlen must already be filled in correctly.
+Return value is error (83 on allocation failure, otherwise the result of HuffmanTree_make2DTree).
+*/
+static unsigned HuffmanTree_makeFromLengths2(HuffmanTree* tree)
+{
+    uivector lengthcount; /*how many symbols use each code length*/
+    uivector codecursor; /*next code value to hand out for each code length*/
+    unsigned error = 0;
+    unsigned len, symbol;
+
+    uivector_init(&lengthcount);
+    uivector_init(&codecursor);
+
+    tree->tree1d = (unsigned*)lodepng_malloc(tree->numcodes * sizeof(unsigned));
+    if (!tree->tree1d) error = 83; /*alloc fail*/
+
+    if (!uivector_resizev(&lengthcount, tree->maxbitlen + 1, 0)
+        || !uivector_resizev(&codecursor, tree->maxbitlen + 1, 0))
+    {
+        error = 83; /*alloc fail*/
+    }
+
+    if (!error)
+    {
+        /*step 1: count number of instances of each code length*/
+        for (symbol = 0; symbol != tree->numcodes; ++symbol)
+        {
+            lengthcount.data[tree->lengths[symbol]] += 1;
+        }
+        /*step 2: derive the first code value of every length from the shorter lengths*/
+        for (len = 1; len <= tree->maxbitlen; ++len)
+        {
+            codecursor.data[len] = (codecursor.data[len - 1] + lengthcount.data[len - 1]) << 1;
+        }
+        /*step 3: hand out consecutive codes per length, in symbol order*/
+        for (symbol = 0; symbol != tree->numcodes; ++symbol)
+        {
+            const unsigned codelen = tree->lengths[symbol];
+            if (codelen != 0)
+            {
+                tree->tree1d[symbol] = codecursor.data[codelen];
+                codecursor.data[codelen] += 1;
+            }
+        }
+    }
+
+    uivector_cleanup(&lengthcount);
+    uivector_cleanup(&codecursor);
+
+    return error ? error : HuffmanTree_make2DTree(tree);
+}
+
+/*
+Builds the tree, as defined by Deflate, from the code lengths as stored in the
+PNG file. A private copy of bitlen[0..numcodes) is kept in tree->lengths.
+maxbitlen is the maximum number of bits a code in the tree can have.
+Returns 0 on success, else an error code (83 = alloc fail).
+*/
+static unsigned HuffmanTree_makeFromLengths(HuffmanTree* tree, const unsigned* bitlen,
+    size_t numcodes, unsigned maxbitlen)
+{
+    unsigned sym = 0;
+    tree->lengths = (unsigned*)lodepng_malloc(numcodes * sizeof(unsigned));
+    if (tree->lengths == 0) return 83; /*alloc fail*/
+    while (sym != numcodes)
+    {
+        tree->lengths[sym] = bitlen[sym];
+        ++sym;
+    }
+    tree->maxbitlen = maxbitlen;
+    tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+    return HuffmanTree_makeFromLengths2(tree);
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*BPM: Boundary Package Merge, see "A Fast and Space-Economical Algorithm for Length-Limited Coding",
+Jyrki Katajainen, Alistair Moffat, Andrew Turpin, 1995.*/
+
+/*chain node for boundary package merge; the same struct is used for the entries of the leaves array*/
+typedef struct BPMNode
+{
+    int weight; /*the sum of all weights in this chain*/
+    unsigned index; /*index of this leaf node (called "count" in the paper)*/
+    struct BPMNode* tail; /*the next nodes in this chain (null if last)*/
+    int in_use; /*mark flag of the garbage collection in bpmnode_create: 1 if reachable from a chain head*/
+} BPMNode;
+
+/*lists of chains, together with the node pool that bpmnode_create allocates from*/
+typedef struct BPMLists
+{
+    /*memory pool*/
+    unsigned memsize; /*number of nodes in memory, and capacity of freelist*/
+    BPMNode* memory; /*the nodes themselves*/
+    unsigned numfree; /*number of valid entries in freelist*/
+    unsigned nextfree; /*index in freelist of the next node to hand out*/
+    BPMNode** freelist; /*pointers to the nodes that are currently not in use*/
+    /*two heads of lookahead chains per list*/
+    unsigned listsize; /*number of lists (set to maxbitlen by lodepng_huffman_code_lengths)*/
+    BPMNode** chains0; /*per list: the older of the two chain heads*/
+    BPMNode** chains1; /*per list: the newest chain head*/
+} BPMLists;
+
+/*hands out an unused node from the pool in lists, filled in with the given weight, index and tail*/
+static BPMNode* bpmnode_create(BPMLists* lists, int weight, unsigned index, BPMNode* tail)
+{
+    unsigned k;
+    BPMNode* fresh;
+
+    /*free list used up: rebuild it with a mark and sweep pass over the pool*/
+    if (lists->numfree <= lists->nextfree)
+    {
+        /*mark: clear every flag, then flag whatever a chain head can still reach*/
+        for (k = 0; k < lists->memsize; ++k) lists->memory[k].in_use = 0;
+        for (k = 0; k < lists->listsize; ++k)
+        {
+            BPMNode* walk;
+            for (walk = lists->chains0[k]; walk; walk = walk->tail) walk->in_use = 1;
+            for (walk = lists->chains1[k]; walk; walk = walk->tail) walk->in_use = 1;
+        }
+        /*sweep: every unflagged node goes back on the free list*/
+        lists->numfree = lists->nextfree = 0;
+        for (k = 0; k < lists->memsize; ++k)
+        {
+            BPMNode* slot = lists->memory + k;
+            if (slot->in_use) continue;
+            lists->freelist[lists->numfree] = slot;
+            lists->numfree += 1;
+        }
+    }
+
+    fresh = lists->freelist[lists->nextfree];
+    lists->nextfree += 1;
+    fresh->tail = tail;
+    fresh->index = index;
+    fresh->weight = weight;
+    return fresh;
+}
+
+/*
+sort the leaves by ascending weight; the sort is stable.
+Normally a bottom-up mergesort with a scratch buffer of num nodes. The function
+has no way to report an error, so when that buffer cannot be allocated it falls
+back to an in-place (also stable) insertion sort instead of dereferencing null.
+*/
+static void bpmnode_sort(BPMNode* leaves, size_t num)
+{
+    BPMNode* mem = (BPMNode*)lodepng_malloc(sizeof(*leaves) * num);
+    size_t width, counter = 0;
+    size_t p, i, j, k;
+
+    if (!mem)
+    {
+        /*alloc fail: insertion sort, only strictly heavier nodes are shifted so ties keep their order*/
+        for (p = 1; p < num; ++p)
+        {
+            BPMNode key = leaves[p];
+            for (k = p; k > 0 && leaves[k - 1].weight > key.weight; --k) leaves[k] = leaves[k - 1];
+            leaves[k] = key;
+        }
+        return;
+    }
+
+    /*each pass merges neighbouring runs of length width; source and destination swap every pass*/
+    for (width = 1; width < num; width *= 2)
+    {
+        BPMNode* a = (counter & 1) ? mem : leaves; /*source of this pass*/
+        BPMNode* b = (counter & 1) ? leaves : mem; /*destination of this pass*/
+        for (p = 0; p < num; p += 2 * width)
+        {
+            size_t q = (p + width > num) ? num : (p + width); /*end of the left run*/
+            size_t r = (p + 2 * width > num) ? num : (p + 2 * width); /*end of the right run*/
+            i = p;
+            j = q;
+            for (k = p; k < r; k++)
+            {
+                /*"<=" takes from the left run on equal weights, which keeps the sort stable*/
+                if (i < q && (j >= r || a[i].weight <= a[j].weight)) b[k] = a[i++];
+                else b[k] = a[j++];
+            }
+        }
+        counter++;
+    }
+    /*after an odd number of passes the sorted data sits in the scratch buffer*/
+    if (counter & 1) memcpy(leaves, mem, sizeof(*leaves) * num);
+    lodepng_free(mem);
+}
+
+/*Boundary Package Merge step, numpresent is the amount of leaves, and c is the index of the list to extend.
+The current newest chain of list c (chains1[c]) becomes chains0[c] and a new chain head is created in
+chains1[c]: either the next leaf or a package of the two chain heads of list c - 1.
+num is passed through unchanged to the recursive calls and is only used to decide whether to recurse.*/
+static void boundaryPM(BPMLists* lists, BPMNode* leaves, size_t numpresent, int c, int num)
+{
+    unsigned lastindex = lists->chains1[c]->index; /*leaves[lastindex] is the next leaf list c could take*/
+
+    if (c == 0)
+    {
+        if (lastindex >= numpresent) return; /*no leaves left, list 0 cannot grow*/
+        lists->chains0[c] = lists->chains1[c];
+        lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, 0);
+    }
+    else
+    {
+        /*sum of the weights of the head nodes of the previous lookahead chains.*/
+        int sum = lists->chains0[c - 1]->weight + lists->chains1[c - 1]->weight;
+        lists->chains0[c] = lists->chains1[c];
+        if (lastindex < numpresent && sum > leaves[lastindex].weight) /*the next leaf is lighter than the package*/
+        {
+            lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, lists->chains1[c]->tail);
+            return;
+        }
+        lists->chains1[c] = bpmnode_create(lists, sum, lastindex, lists->chains1[c - 1]); /*take the package, its tail is the newest chain of list c - 1*/
+        /*in the end we are only interested in the chain of the last list, so no
+        need to recurse if we're at the last one (this gives measurable speedup).
+        Otherwise list c - 1 is advanced twice, because both of its heads were just packaged*/
+        if (num + 1 < (int)(2 * numpresent - 2))
+        {
+            boundaryPM(lists, leaves, numpresent, c - 1, num);
+            boundaryPM(lists, leaves, numpresent, c - 1, num);
+        }
+    }
+}
+
+/*
+Computes Huffman code lengths limited to maxbitlen bits with the boundary package-merge algorithm.
+lengths: output, numcodes entries; entry i receives the code length of symbol i (0 = symbol not used)
+frequencies: input, numcodes entries; how often each symbol occurs
+maxbitlen: the maximum number of bits a code may have
+return value is error: 0 = ok, 80 = numcodes is 0 or too large for maxbitlen bits, 83 = alloc fail
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+    size_t numcodes, unsigned maxbitlen)
+{
+    unsigned error = 0;
+    unsigned i;
+    size_t numpresent = 0; /*number of symbols with non-zero frequency*/
+    BPMNode* leaves; /*the symbols, only those with > 0 frequency*/
+
+    if (numcodes == 0) return 80; /*error: a tree of 0 symbols is not supposed to be made*/
+    /*error: represent all symbols. The shift is only defined (and the test only needed) when
+    maxbitlen is smaller than the number of bits of an unsigned*/
+    if (maxbitlen < sizeof(unsigned) * 8 && (1u << maxbitlen) < numcodes) return 80;
+
+    leaves = (BPMNode*)lodepng_malloc(numcodes * sizeof(*leaves));
+    if (!leaves) return 83; /*alloc fail*/
+
+    for (i = 0; i != numcodes; ++i)
+    {
+        if (frequencies[i] > 0)
+        {
+            leaves[numpresent].weight = (int)frequencies[i];
+            leaves[numpresent].index = i;
+            ++numpresent;
+        }
+    }
+
+    for (i = 0; i != numcodes; ++i) lengths[i] = 0;
+
+    /*ensure at least two present symbols. There should be at least one symbol
+    according to RFC 1951 section 3.2.7. Some decoders incorrectly require two. To
+    make these work as well ensure there are at least two symbols. The
+    Package-Merge code below also doesn't work correctly if there's only one
+    symbol, it'd give it the theoretical 0 bits but in practice zlib wants 1 bit.
+    The second, dummy symbol is only added when the lengths array has room for it:
+    with numcodes == 1 the array has a single entry and lengths[1] does not exist*/
+    if (numpresent == 0)
+    {
+        lengths[0] = 1; /*note that for RFC 1951 section 3.2.7, only lengths[0] = 1 is needed*/
+        if (numcodes > 1) lengths[1] = 1;
+    }
+    else if (numpresent == 1)
+    {
+        lengths[leaves[0].index] = 1;
+        if (numcodes > 1) lengths[leaves[0].index == 0 ? 1 : 0] = 1;
+    }
+    else
+    {
+        BPMLists lists;
+        BPMNode* node;
+
+        bpmnode_sort(leaves, numpresent);
+
+        lists.listsize = maxbitlen;
+        lists.memsize = 2 * maxbitlen * (maxbitlen + 1);
+        lists.nextfree = 0;
+        lists.numfree = lists.memsize;
+        lists.memory = (BPMNode*)lodepng_malloc(lists.memsize * sizeof(*lists.memory));
+        lists.freelist = (BPMNode**)lodepng_malloc(lists.memsize * sizeof(BPMNode*));
+        lists.chains0 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+        lists.chains1 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+        if (!lists.memory || !lists.freelist || !lists.chains0 || !lists.chains1) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            for (i = 0; i != lists.memsize; ++i) lists.freelist[i] = &lists.memory[i];
+
+            /*the first two pool nodes (memory[0] and memory[1]) hold the two lightest leaves*/
+            bpmnode_create(&lists, leaves[0].weight, 1, 0);
+            bpmnode_create(&lists, leaves[1].weight, 2, 0);
+
+            /*every list starts out with those two nodes as its chain heads*/
+            for (i = 0; i != lists.listsize; ++i)
+            {
+                lists.chains0[i] = &lists.memory[0];
+                lists.chains1[i] = &lists.memory[1];
+            }
+
+            /*each boundaryPM call adds one chain to the last list, and we need 2 * numpresent - 2 chains.*/
+            for (i = 2; i != 2 * numpresent - 2; ++i) boundaryPM(&lists, leaves, numpresent, (int)maxbitlen - 1, (int)i);
+
+            /*every node in the final chain adds one bit to the first node->index leaves*/
+            for (node = lists.chains1[maxbitlen - 1]; node; node = node->tail)
+            {
+                for (i = 0; i != node->index; ++i) ++lengths[leaves[i].index];
+            }
+        }
+
+        lodepng_free(lists.memory);
+        lodepng_free(lists.freelist);
+        lodepng_free(lists.chains0);
+        lodepng_free(lists.chains1);
+    }
+
+    lodepng_free(leaves);
+    return error;
+}
+
+/*
+Create the Huffman tree given the symbol frequencies.
+Trailing symbols with frequency 0 are trimmed off, but never below mincodes symbols.
+tree->lengths is (re)allocated to the remaining number of symbols.
+return value is error: 0 = ok, 83 = alloc fail, or whatever the two helpers below report.
+*/
+static unsigned HuffmanTree_makeFromFrequencies(HuffmanTree* tree, const unsigned* frequencies,
+    size_t mincodes, size_t numcodes, unsigned maxbitlen)
+{
+    unsigned error = 0;
+    /*trim zeroes. The bound is tested first, so that frequencies[numcodes - 1] is never
+    evaluated once numcodes has reached 0 (possible when mincodes is 0)*/
+    while (numcodes > mincodes && !frequencies[numcodes - 1]) --numcodes;
+    tree->maxbitlen = maxbitlen;
+    tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+    tree->lengths = (unsigned*)lodepng_realloc(tree->lengths, numcodes * sizeof(unsigned));
+    if (!tree->lengths) return 83; /*alloc fail*/
+    /*initialize all lengths to 0*/
+    memset(tree->lengths, 0, numcodes * sizeof(unsigned));
+
+    error = lodepng_huffman_code_lengths(tree->lengths, frequencies, numcodes, maxbitlen);
+    if (!error) error = HuffmanTree_makeFromLengths2(tree);
+    return error;
+}
+
+/*the code that the tree assigned to the symbol with the given index (no bounds check)*/
+static unsigned HuffmanTree_getCode(const HuffmanTree* tree, unsigned index)
+{
+    const unsigned* codes = tree->tree1d;
+    return codes[index];
+}
+
+/*the length in bits of the code of the symbol with the given index (no bounds check)*/
+static unsigned HuffmanTree_getLength(const HuffmanTree* tree, unsigned index)
+{
+    const unsigned* bit_lengths = tree->lengths;
+    return bit_lengths[index];
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*builds the literal and length code tree of a deflated block with fixed tree, as per the deflate
+specification. return value is error (83 = alloc fail)*/
+static unsigned generateFixedLitLenTree(HuffmanTree* tree)
+{
+    unsigned sym = 0, status;
+    unsigned* lens = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+    if (lens == 0) return 83; /*alloc fail*/
+
+    /*288 possible codes: 0-255=literals, 256=endcode, 257-285=lengthcodes, 286-287=unused*/
+    while (sym < 144) lens[sym++] = 8;
+    while (sym < 256) lens[sym++] = 9;
+    while (sym < 280) lens[sym++] = 7;
+    while (sym < 288) lens[sym++] = 8;
+
+    status = HuffmanTree_makeFromLengths(tree, lens, NUM_DEFLATE_CODE_SYMBOLS, 15);
+
+    lodepng_free(lens);
+    return status;
+}
+
+/*builds the distance code tree of a deflated block with fixed tree, as specified in the deflate
+specification. return value is error (83 = alloc fail)*/
+static unsigned generateFixedDistanceTree(HuffmanTree* tree)
+{
+    unsigned sym, status;
+    unsigned* lens = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+    if (lens == 0) return 83; /*alloc fail*/
+
+    /*there are 32 distance codes, but 30-31 are unused; all of them are 5 bits long*/
+    for (sym = 0; sym < NUM_DISTANCE_SYMBOLS; ++sym) lens[sym] = 5;
+
+    status = HuffmanTree_makeFromLengths(tree, lens, NUM_DISTANCE_SYMBOLS, 15);
+    lodepng_free(lens);
+    return status;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Reads bits from in, starting at bit position *bp, and walks codetree until a symbol is found.
+returns the symbol, or (unsigned)(-1) if an error happened (input exhausted or a jump outside the tree)
+inbitlength is the length of the complete buffer, in bits (so its byte length times 8)
+*bp is advanced by one for every bit consumed
+*/
+static unsigned huffmanDecodeSymbol(const unsigned char* in, size_t* bp,
+    const HuffmanTree* codetree, size_t inbitlength)
+{
+    const unsigned symbols = codetree->numcodes;
+    unsigned node = 0;
+    /*the bit read is kept inline in the table lookup, this loop is the biggest bottleneck while decoding*/
+    while (*bp < inbitlength)
+    {
+        unsigned entry = codetree->tree2d[(node << 1) + READBIT(*bp, in)];
+        ++(*bp);
+        if (entry < symbols) return entry; /*the symbol is decoded, return it*/
+        node = entry - symbols; /*symbol not yet decoded, instead move tree position*/
+        if (node >= symbols) return (unsigned)(-1); /*error: it appeared outside the codetree*/
+    }
+    return (unsigned)(-1); /*error: end of input memory reached without endcode*/
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Inflator (Decompressor)                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*get the tree of a deflated block with fixed tree, as specified in the deflate specification
+tree_ll receives the literal/length tree and tree_d the distance tree.
+return value is error: 0 = ok, otherwise the error of the tree that could not be generated
+(83 = alloc fail). The distance tree is not attempted when the literal/length tree failed.
+Callers that do not look at the return value keep compiling unchanged.*/
+static unsigned getTreeInflateFixed(HuffmanTree* tree_ll, HuffmanTree* tree_d)
+{
+    unsigned error = generateFixedLitLenTree(tree_ll);
+    if (!error) error = generateFixedDistanceTree(tree_d);
+    return error;
+}
+
+/*get the tree of a deflated block with dynamic tree, the tree itself is also Huffman compressed with a known tree
+tree_ll, tree_d: receive the literal/length tree and the distance tree
+in, inlength: the compressed data and its length in bytes; bp: bit position in it, advanced past everything read
+return value is error (0 = ok); everything allocated in here is freed again before returning*/
+static unsigned getTreeInflateDynamic(HuffmanTree* tree_ll, HuffmanTree* tree_d,
+    const unsigned char* in, size_t* bp, size_t inlength)
+{
+    /*make sure that length values that aren't filled in will be 0, or a wrong tree will be generated*/
+    unsigned error = 0;
+    unsigned n, HLIT, HDIST, HCLEN, i;
+    size_t inbitlength = inlength * 8;
+
+    /*see comments in deflateDynamic for explanation of the context and these variables, it is analogous*/
+    unsigned* bitlen_ll = 0; /*lit,len code lengths*/
+    unsigned* bitlen_d = 0; /*dist code lengths*/
+    /*code length code lengths ("clcl"), the bit lengths of the huffman tree used to compress bitlen_ll and bitlen_d*/
+    unsigned* bitlen_cl = 0;
+    HuffmanTree tree_cl; /*the code tree for code length codes (the huffman tree for compressed huffman trees)*/
+
+    if ((*bp) + 14 > (inlength << 3)) return 49; /*error: the bit pointer is or will go past the memory (5 + 5 + 4 bits are read next)*/
+
+    /*number of literal/length codes + 257. Unlike the spec, the value 257 is added to it here already*/
+    HLIT = readBitsFromStream(bp, in, 5) + 257;
+    /*number of distance codes. Unlike the spec, the value 1 is added to it here already*/
+    HDIST = readBitsFromStream(bp, in, 5) + 1;
+    /*number of code length codes. Unlike the spec, the value 4 is added to it here already*/
+    HCLEN = readBitsFromStream(bp, in, 4) + 4;
+
+    if ((*bp) + HCLEN * 3 > (inlength << 3)) return 50; /*error: the bit pointer is or will go past the memory*/
+
+    HuffmanTree_init(&tree_cl);
+
+    while (!error) /*runs at most once: the loop only exists so that "break" can jump to the cleanup below*/
+    {
+        /*read the code length codes out of 3 * (amount of code length codes) bits*/
+
+        bitlen_cl = (unsigned*)lodepng_malloc(NUM_CODE_LENGTH_CODES * sizeof(unsigned));
+        if (!bitlen_cl) ERROR_BREAK(83 /*alloc fail*/);
+
+        for (i = 0; i != NUM_CODE_LENGTH_CODES; ++i)
+        {
+            if (i < HCLEN) bitlen_cl[CLCL_ORDER[i]] = readBitsFromStream(bp, in, 3);
+            else bitlen_cl[CLCL_ORDER[i]] = 0; /*if not, it must stay 0*/
+        }
+
+        error = HuffmanTree_makeFromLengths(&tree_cl, bitlen_cl, NUM_CODE_LENGTH_CODES, 7);
+        if (error) break;
+
+        /*now we can use this tree to read the lengths for the tree that this function will return*/
+        bitlen_ll = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+        bitlen_d = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+        if (!bitlen_ll || !bitlen_d) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != NUM_DEFLATE_CODE_SYMBOLS; ++i) bitlen_ll[i] = 0;
+        for (i = 0; i != NUM_DISTANCE_SYMBOLS; ++i) bitlen_d[i] = 0;
+
+        /*i is the current symbol we're reading in the part that contains the code lengths of lit/len and dist codes;
+        indices below HLIT go to bitlen_ll, the remaining HDIST ones go to bitlen_d*/
+        i = 0;
+        while (i < HLIT + HDIST)
+        {
+            unsigned code = huffmanDecodeSymbol(in, bp, &tree_cl, inbitlength);
+            if (code <= 15) /*a length code*/
+            {
+                if (i < HLIT) bitlen_ll[i] = code;
+                else bitlen_d[i - HLIT] = code;
+                ++i;
+            }
+            else if (code == 16) /*repeat previous*/
+            {
+                unsigned replength = 3; /*read in the 2 bits that indicate repeat length (3-6)*/
+                unsigned value; /*set value to the previous code*/
+
+                if (i == 0) ERROR_BREAK(54); /*can't repeat previous if i is 0*/
+
+                if ((*bp + 2) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 2);
+
+                if (i < HLIT + 1) value = bitlen_ll[i - 1]; /*the previous symbol is still a lit/len one*/
+                else value = bitlen_d[i - HLIT - 1];
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(13); /*error: i is larger than the amount of codes*/
+                    if (i < HLIT) bitlen_ll[i] = value;
+                    else bitlen_d[i - HLIT] = value;
+                    ++i;
+                }
+            }
+            else if (code == 17) /*repeat "0" 3-10 times*/
+            {
+                unsigned replength = 3; /*read in the bits that indicate repeat length*/
+                if ((*bp + 3) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 3);
+
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(14); /*error: i is larger than the amount of codes*/
+
+                    if (i < HLIT) bitlen_ll[i] = 0;
+                    else bitlen_d[i - HLIT] = 0;
+                    ++i;
+                }
+            }
+            else if (code == 18) /*repeat "0" 11-138 times*/
+            {
+                unsigned replength = 11; /*read in the bits that indicate repeat length*/
+                if ((*bp + 7) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 7);
+
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(15); /*error: i is larger than the amount of codes*/
+
+                    if (i < HLIT) bitlen_ll[i] = 0;
+                    else bitlen_d[i - HLIT] = 0;
+                    ++i;
+                }
+            }
+            else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+            {
+                if (code == (unsigned)(-1))
+                {
+                    /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+                    (10=no endcode, 11=wrong jump outside of tree)
+                    NOTE(review): huffmanDecodeSymbol gives up at *bp >= inbitlength without advancing *bp, so
+                    input that ends exactly here has *bp == inbitlength and is reported as 11 - confirm intended*/
+                    error = (*bp) > inbitlength ? 10 : 11;
+                }
+                else error = 16; /*unexisting code, this can never happen*/
+                break;
+            }
+        }
+        if (error) break;
+
+        if (bitlen_ll[256] == 0) ERROR_BREAK(64); /*the length of the end code 256 must be larger than 0*/
+
+        /*now we've finally got HLIT and HDIST, so generate the code trees, and the function is done*/
+        error = HuffmanTree_makeFromLengths(tree_ll, bitlen_ll, NUM_DEFLATE_CODE_SYMBOLS, 15);
+        if (error) break;
+        error = HuffmanTree_makeFromLengths(tree_d, bitlen_d, NUM_DISTANCE_SYMBOLS, 15);
+
+        break; /*end of error-while*/
+    }
+
+    lodepng_free(bitlen_cl);
+    lodepng_free(bitlen_ll);
+    lodepng_free(bitlen_d);
+    HuffmanTree_cleanup(&tree_cl);
+
+    return error;
+}
+
+/*
+inflate a block with dynamic or fixed Huffman tree
+out, pos: the output buffer and the write position in it; decoded bytes are stored from *pos on
+in, inlength: the compressed data and its length in bytes; bp: bit position in it, advanced while decoding
+btype: 1 = fixed trees, 2 = trees stored in the stream
+return value is error: 0 when the end code (256) of the block was reached
+*/
+static unsigned inflateHuffmanBlock(ucvector* out, const unsigned char* in, size_t* bp,
+    size_t* pos, size_t inlength, unsigned btype)
+{
+    unsigned error = 0;
+    HuffmanTree tree_ll; /*the huffman tree for literal and length codes*/
+    HuffmanTree tree_d; /*the huffman tree for distance codes*/
+    size_t inbitlength = inlength * 8;
+
+    HuffmanTree_init(&tree_ll);
+    HuffmanTree_init(&tree_d);
+
+    if (btype == 1)
+    {
+        getTreeInflateFixed(&tree_ll, &tree_d);
+        /*a failed allocation while generating the fixed trees is not reported to this call;
+        decoding would then read through a tree that was never built, so check for it here.
+        NOTE(review): relies on HuffmanTree_init leaving tree2d null until a tree is built - confirm*/
+        if (!tree_ll.tree2d || !tree_d.tree2d) error = 83; /*alloc fail*/
+    }
+    else if (btype == 2) error = getTreeInflateDynamic(&tree_ll, &tree_d, in, bp, inlength);
+
+    while (!error) /*decode all symbols until end reached, breaks at end code*/
+    {
+        /*code_ll is literal, length or end code*/
+        unsigned code_ll = huffmanDecodeSymbol(in, bp, &tree_ll, inbitlength);
+        if (code_ll <= 255) /*literal symbol*/
+        {
+            /*ucvector_push_back would do the same, but for some reason the two lines below run 10% faster*/
+            if (!ucvector_resize(out, (*pos) + 1)) ERROR_BREAK(83 /*alloc fail*/);
+            out->data[*pos] = (unsigned char)code_ll;
+            ++(*pos);
+        }
+        else if (code_ll >= FIRST_LENGTH_CODE_INDEX && code_ll <= LAST_LENGTH_CODE_INDEX) /*length code*/
+        {
+            unsigned code_d, distance;
+            unsigned numextrabits_l, numextrabits_d; /*extra bits for length and distance*/
+            size_t start, forward, backward, length;
+
+            /*part 1: get length base*/
+            length = LENGTHBASE[code_ll - FIRST_LENGTH_CODE_INDEX];
+
+            /*part 2: get extra bits and add the value of that to length*/
+            numextrabits_l = LENGTHEXTRA[code_ll - FIRST_LENGTH_CODE_INDEX];
+            if ((*bp + numextrabits_l) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+            length += readBitsFromStream(bp, in, numextrabits_l);
+
+            /*part 3: get distance code*/
+            code_d = huffmanDecodeSymbol(in, bp, &tree_d, inbitlength);
+            if (code_d > 29)
+            {
+                if (code_d == (unsigned)(-1)) /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+                {
+                    /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+                    (10=no endcode, 11=wrong jump outside of tree)*/
+                    error = (*bp) > inbitlength ? 10 : 11;
+                }
+                else error = 18; /*error: invalid distance code (30-31 are never used)*/
+                break;
+            }
+            distance = DISTANCEBASE[code_d];
+
+            /*part 4: get extra bits from distance*/
+            numextrabits_d = DISTANCEEXTRA[code_d];
+            if ((*bp + numextrabits_d) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+            distance += readBitsFromStream(bp, in, numextrabits_d);
+
+            /*part 5: fill in all the out[n] values based on the length and dist*/
+            start = (*pos);
+            if (distance > start) ERROR_BREAK(52); /*too long backward distance*/
+            backward = start - distance;
+
+            if (!ucvector_resize(out, (*pos) + length)) ERROR_BREAK(83 /*alloc fail*/);
+            if (distance < length)
+            {
+                /*source and destination overlap: copy byte by byte, so that bytes written
+                earlier in this very copy are picked up again*/
+                for (forward = 0; forward < length; ++forward)
+                {
+                    out->data[(*pos)++] = out->data[backward++];
+                }
+            }
+            else
+            {
+                /*no overlap, a block copy is safe*/
+                memcpy(out->data + *pos, out->data + backward, length);
+                *pos += length;
+            }
+        }
+        else if (code_ll == 256)
+        {
+            break; /*end code, break the loop*/
+        }
+        else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+        {
+            /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+            (10=no endcode, 11=wrong jump outside of tree)*/
+            error = ((*bp) > inbitlength) ? 10 : 11;
+            break;
+        }
+    }
+
+    HuffmanTree_cleanup(&tree_ll);
+    HuffmanTree_cleanup(&tree_d);
+
+    return error;
+}
+
+/*
+inflate a stored (uncompressed) block: skips to the next byte boundary, reads and checks
+LEN and NLEN, then copies LEN bytes from in to out at *pos. *bp is left at the first bit
+after the block and *pos after the copied bytes.
+return value is error: 0 = ok, 52 = the 4 header bytes are not inside the input,
+21 = NLEN is not the one's complement of LEN, 23 = the LEN data bytes are not inside the
+input, 83 = alloc fail
+*/
+static unsigned inflateNoCompression(ucvector* out, const unsigned char* in, size_t* bp, size_t* pos, size_t inlength)
+{
+    size_t p;
+    unsigned LEN, NLEN, n;
+
+    /*go to first boundary of byte*/
+    while (((*bp) & 0x7) != 0) ++(*bp);
+    p = (*bp) / 8; /*byte position*/
+
+    /*read LEN (2 bytes) and NLEN (2 bytes). in[p] up to in[p + 3] are read, so p + 4 may be
+    equal to inlength: an empty stored block that ends exactly at the end of the input is valid*/
+    if (p + 4 > inlength) return 52; /*error, bit pointer will jump past memory*/
+    LEN = in[p] + 256u * in[p + 1]; p += 2;
+    NLEN = in[p] + 256u * in[p + 1]; p += 2;
+
+    /*check if 16-bit NLEN is really the one's complement of LEN*/
+    if (LEN + NLEN != 65535) return 21; /*error: NLEN is not one's complement of LEN*/
+
+    /*validate the input range before growing the output, so that a truncated block
+    does not change the size of out*/
+    if (p + LEN > inlength) return 23; /*error: reading outside of in buffer*/
+    if (!ucvector_resize(out, (*pos) + LEN)) return 83; /*alloc fail*/
+
+    /*read the literal data: LEN bytes are now stored in the out buffer*/
+    for (n = 0; n < LEN; ++n) out->data[(*pos)++] = in[p++];
+
+    (*bp) = p * 8;
+
+    return 0;
+}
+
+/*
+decodes the deflate stream in "in" (insize bytes) block by block into out, writing from
+byte 0 of out, until a block flagged as final has been processed.
+return value is error: 0 = ok, 52 = input ends before a block header, 20 = invalid block
+type 3, or the error of the block decoder. settings is not used in here.
+*/
+static unsigned lodepng_inflatev(ucvector* out,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    /*bit position in the "in" data, current byte is bitpos >> 3, current bit is bitpos & 0x7 (from lsb to msb of the byte)*/
+    size_t bitpos = 0;
+    size_t outpos = 0; /*byte position in the out buffer*/
+    unsigned is_final;
+    unsigned status = 0;
+
+    (void)settings;
+
+    do
+    {
+        unsigned type;
+        if (bitpos + 2 >= insize * 8) return 52; /*error, bit pointer will jump past memory*/
+        is_final = readBitFromStream(&bitpos, in);
+        type = 1u * readBitFromStream(&bitpos, in);
+        type += 2u * readBitFromStream(&bitpos, in);
+
+        switch (type)
+        {
+        case 0: /*no compression*/
+            status = inflateNoCompression(out, in, &bitpos, &outpos, insize);
+            break;
+        case 3:
+            return 20; /*error: invalid BTYPE*/
+        default: /*compression, BTYPE 01 or 10*/
+            status = inflateHuffmanBlock(out, in, &bitpos, &outpos, insize, type);
+            break;
+        }
+
+        if (status) return status;
+    } while (!is_final);
+
+    return status;
+}
+
+/*inflates "in" (insize bytes) into the buffer passed in through *out and *outsize; both are
+updated to the resulting buffer and size, also when an error is returned*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    ucvector buffer;
+    unsigned status;
+
+    ucvector_init_buffer(&buffer, *out, *outsize);
+    status = lodepng_inflatev(&buffer, in, insize, settings);
+    *outsize = buffer.size;
+    *out = buffer.data;
+    return status;
+}
+
+/*inflates with the custom_inflate callback of settings when one is set, with lodepng_inflate otherwise*/
+static unsigned inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    if (!settings->custom_inflate) return lodepng_inflate(out, outsize, in, insize, settings);
+    return settings->custom_inflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflator (Compressor)                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static const size_t MAX_SUPPORTED_DEFLATE_LENGTH = 258; /*also the longest zero run countZeros reports; Hash.headz has one entry per run length 0..258*/
+
+/*bitlen is the size in bits of the code
+thin wrapper: all four arguments are forwarded unchanged to addBitsToStreamReversed*/
+static void addHuffmanSymbol(size_t* bp, ucvector* compressed, unsigned code, unsigned bitlen)
+{
+    addBitsToStreamReversed(bp, compressed, code, bitlen);
+}
+
+/*returns the index of the largest element of array that is smaller than or equal to value.
+array must be sorted ascending and array_size must be at least 1. Index 0 is never compared
+against: it is the result whenever no element from index 1 on is <= value*/
+static size_t searchCodeIndex(const unsigned* array, size_t array_size, size_t value)
+{
+    /*binary search (only small gain over linear) for the first index in [1, array_size)
+    whose element is >= value; hi ends up as array_size if there is none*/
+    size_t lo = 1;
+    size_t hi = array_size;
+
+    while (lo < hi)
+    {
+        size_t mid = lo + (hi - lo) / 2;
+        if (array[mid] < value) lo = mid + 1;
+        else hi = mid;
+    }
+    /*step back one unless the element found is an exact match*/
+    if (lo < array_size && array[lo] <= value) return lo;
+    return lo - 1;
+}
+
+/*appends one length/distance pair to values as four entries: length code, extra length bits,
+distance code, extra distance bits*/
+static void addLengthDistance(uivector* values, size_t length, size_t distance)
+{
+    /*values in encoded vector are those used by deflate:
+    0-255: literal bytes
+    256: end
+    257-285: length/distance pair (length code, followed by extra length bits, distance code, extra distance bits)
+    286-287: invalid*/
+
+    const unsigned len_sym = (unsigned)searchCodeIndex(LENGTHBASE, 29, length);
+    const unsigned len_extra = (unsigned)(length - LENGTHBASE[len_sym]);
+    const unsigned dist_sym = (unsigned)searchCodeIndex(DISTANCEBASE, 30, distance);
+    const unsigned dist_extra = (unsigned)(distance - DISTANCEBASE[dist_sym]);
+
+    uivector_push_back(values, FIRST_LENGTH_CODE_INDEX + len_sym);
+    uivector_push_back(values, len_extra);
+    uivector_push_back(values, dist_sym);
+    uivector_push_back(values, dist_extra);
+}
+
+/*3 bytes of data get encoded into two bytes. The hash cannot use more than 3
+bytes as input because 3 is the minimum match length for deflate.
+HASH_NUM_VALUES is the number of entries of Hash.head; getHash masks its result with HASH_BIT_MASK*/
+static const unsigned HASH_NUM_VALUES = 65536;
+static const unsigned HASH_BIT_MASK = 65535; /*HASH_NUM_VALUES - 1, but C90 does not like that as initializer*/
+
+typedef struct Hash
+{
+    int* head; /*hash value to head circular pos - can be outdated if went around window; -1 = no position yet*/
+    /*circular pos to prev circular pos; an entry equal to its own index indicates uninitialized*/
+    unsigned short* chain;
+    int* val; /*circular pos to hash value; -1 = not set yet*/
+
+    /*TODO: do this not only for zeros but for any repeated byte. However for PNG
+    it's always going to be the zeros that dominate, so not important for PNG*/
+    int* headz; /*similar to head, but for chainz; indexed by number of zeros, MAX_SUPPORTED_DEFLATE_LENGTH + 1 entries*/
+    unsigned short* chainz; /*those with same amount of zeros; an entry equal to its own index indicates uninitialized*/
+    unsigned short* zeros; /*length of zeros streak, used as a second hash chain*/
+} Hash;
+
+/*
+allocates and initializes the tables of hash for a window of windowsize positions.
+return value is error: 0 = ok, 83 = alloc fail. On failure every table that was obtained
+is freed again and all six pointers are set to null, so nothing leaks and a later
+hash_cleanup on the same struct is harmless.
+*/
+static unsigned hash_init(Hash* hash, unsigned windowsize)
+{
+    unsigned i;
+    hash->head = (int*)lodepng_malloc(sizeof(int) * HASH_NUM_VALUES);
+    hash->val = (int*)lodepng_malloc(sizeof(int) * windowsize);
+    hash->chain = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+    hash->zeros = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+    hash->headz = (int*)lodepng_malloc(sizeof(int) * (MAX_SUPPORTED_DEFLATE_LENGTH + 1));
+    hash->chainz = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+    if (!hash->head || !hash->chain || !hash->val || !hash->headz || !hash->chainz || !hash->zeros)
+    {
+        /*some of the six may have succeeded: release them (freeing a null pointer is fine)*/
+        lodepng_free(hash->head); hash->head = 0;
+        lodepng_free(hash->val); hash->val = 0;
+        lodepng_free(hash->chain); hash->chain = 0;
+        lodepng_free(hash->zeros); hash->zeros = 0;
+        lodepng_free(hash->headz); hash->headz = 0;
+        lodepng_free(hash->chainz); hash->chainz = 0;
+        return 83; /*alloc fail*/
+    }
+
+    /*initialize hash table*/
+    for (i = 0; i != HASH_NUM_VALUES; ++i) hash->head[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->val[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->chain[i] = i; /*same value as index indicates uninitialized*/
+
+    for (i = 0; i <= MAX_SUPPORTED_DEFLATE_LENGTH; ++i) hash->headz[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->chainz[i] = i; /*same value as index indicates uninitialized*/
+
+    return 0;
+}
+
+/*Release the six tables owned by the hash. The Hash object itself is not freed.*/
+static void hash_cleanup(Hash* hash)
+{
+    /*the two tables that are not indexed by window position*/
+    lodepng_free(hash->head);
+    lodepng_free(hash->headz);
+    /*the four tables with one entry per window position*/
+    lodepng_free(hash->val); lodepng_free(hash->zeros);
+    lodepng_free(hash->chain); lodepng_free(hash->chainz);
+}
+
+
+
+/*Hash of the (up to) three bytes starting at data[pos], masked with HASH_BIT_MASK. When fewer than
+three bytes are left only those are hashed; a pos at or past size gives 0.*/
+static unsigned getHash(const unsigned char* data, size_t size, size_t pos)
+{
+    unsigned mixed = 0;
+    size_t remaining, k;
+
+    if (pos + 2 < size)
+    {
+        /*Plain shift-and-xor. PNG data is dominated by zeroes due to the filters, so a better
+        hash would cost more time to calculate than it saves in traversing the chain.*/
+        mixed = (unsigned)(data[pos] << 0u) ^ (unsigned)(data[pos + 1] << 4u) ^ (unsigned)(data[pos + 2] << 8u);
+        return mixed & HASH_BIT_MASK;
+    }
+    if (pos >= size) return 0;
+
+    /*tail of the data: fold in the one or two bytes that remain, 8 bits apart*/
+    remaining = size - pos;
+    for (k = 0; k < remaining; ++k) mixed ^= (unsigned)(data[pos + k] << (k * 8u));
+    return mixed & HASH_BIT_MASK;
+}
+
+/*Number of consecutive zero bytes from data[pos] on; counting stops at MAX_SUPPORTED_DEFLATE_LENGTH or at data + size.*/
+static unsigned countZeros(const unsigned char* data, size_t size, size_t pos)
+{
+    const unsigned char* const first = data + pos;
+    const unsigned char* stop = first + MAX_SUPPORTED_DEFLATE_LENGTH;
+    const unsigned char* cursor = first;
+    if (stop > data + size) stop = data + size;
+    while (cursor != stop && *cursor == 0) ++cursor;
+    return (unsigned)(cursor - first); /*pointer difference as 32-bit number, at most MAX_SUPPORTED_DEFLATE_LENGTH*/
+}
+
+/*wpos = pos & (windowsize - 1). Makes wpos the newest entry for hashval and for numzeros.*/
+static void updateHashChain(Hash* hash, size_t wpos, unsigned hashval, unsigned short numzeros)
+{
+    const int older = hash->head[hashval], olderz = hash->headz[numzeros];
+    hash->val[wpos] = (int)hashval;
+    hash->zeros[wpos] = numzeros;
+    if (older != -1) hash->chain[wpos] = older; /*link to the previous position with this hash value*/
+    if (olderz != -1) hash->chainz[wpos] = olderz; /*link to the previous position with this zeros streak*/
+    hash->head[hashval] = wpos;
+    hash->headz[numzeros] = wpos;
+}
+
+/*
+LZ77-encode in[inpos..insize) and append the result to out. Return value is error code (0 = ok).
+The input are raw bytes, the output is in the form of unsigned integers with codes representing
+for example literal bytes, or length/distance pairs (the latter are added by addLengthDistance).
+It uses a hash table technique to let it encode faster. When doing LZ77 encoding, a
+sliding window (of windowsize, a power of two of at most 32768) is used, and all past bytes in that
+window can be used as the "dictionary". A brute force search through all possible distances would be
+slow, and this hash technique is one out of several ways to speed this up.
+*/
+static unsigned encodeLZ77(uivector* out, Hash* hash,
+    const unsigned char* in, size_t inpos, size_t insize, unsigned windowsize,
+    unsigned minmatch, unsigned nicematch, unsigned lazymatching)
+{
+    size_t pos;
+    unsigned i, error = 0;
+    /*for large window lengths, assume the user wants no compression loss. Otherwise, max hash chain length speedup.*/
+    unsigned maxchainlength = windowsize >= 8192 ? windowsize : windowsize / 8;
+    unsigned maxlazymatch = windowsize >= 8192 ? MAX_SUPPORTED_DEFLATE_LENGTH : 64;
+
+    unsigned usezeros = 1; /*not sure if setting it to false for windowsize < 8192 is better or worse*/
+    unsigned numzeros = 0; /*length of the zeros streak at the current position, 0 if not tracked*/
+
+    unsigned offset; /*the offset represents the distance in LZ77 terminology*/
+    unsigned length;
+    unsigned lazy = 0; /*set while the match found at the previous position is being held back*/
+    unsigned lazylength = 0, lazyoffset = 0; /*the held back match*/
+    unsigned hashval;
+    unsigned current_offset, current_length;
+    unsigned prev_offset;
+    const unsigned char *lastptr, *foreptr, *backptr;
+    unsigned hashpos;
+
+    if (windowsize == 0 || windowsize > 32768) return 60; /*error: windowsize smaller/larger than allowed*/
+    if ((windowsize & (windowsize - 1)) != 0) return 90; /*error: must be power of two*/
+
+    if (nicematch > MAX_SUPPORTED_DEFLATE_LENGTH) nicematch = MAX_SUPPORTED_DEFLATE_LENGTH;
+
+    for (pos = inpos; pos < insize; ++pos)
+    {
+        size_t wpos = pos & (windowsize - 1); /*position for in 'circular' hash buffers*/
+        unsigned chainlength = 0;
+
+        hashval = getHash(in, insize, pos);
+
+        if (usezeros && hashval == 0)
+        {
+            if (numzeros == 0) numzeros = countZeros(in, insize, pos);
+            else if (pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+        }
+        else
+        {
+            numzeros = 0;
+        }
+
+        updateHashChain(hash, wpos, hashval, numzeros);
+
+        /*the length and offset found for the current position*/
+        length = 0;
+        offset = 0;
+
+        hashpos = hash->chain[wpos];
+
+        lastptr = &in[insize < pos + MAX_SUPPORTED_DEFLATE_LENGTH ? insize : pos + MAX_SUPPORTED_DEFLATE_LENGTH];
+
+        /*search for the longest string*/
+        prev_offset = 0;
+        for (;;)
+        {
+            if (chainlength++ >= maxchainlength) break;
+            current_offset = hashpos <= wpos ? wpos - hashpos : wpos - hashpos + windowsize;
+
+            if (current_offset < prev_offset) break; /*stop when went completely around the circular buffer*/
+            prev_offset = current_offset;
+            if (current_offset > 0)
+            {
+                /*test the next characters*/
+                foreptr = &in[pos];
+                backptr = &in[pos - current_offset];
+
+                /*common case in PNGs is lots of zeros. Quickly skip over them as a speedup*/
+                if (numzeros >= 3)
+                {
+                    unsigned skip = hash->zeros[hashpos];
+                    if (skip > numzeros) skip = numzeros;
+                    backptr += skip;
+                    foreptr += skip;
+                }
+
+                while (foreptr != lastptr && *backptr == *foreptr) /*maximum supported length by deflate is max length*/
+                {
+                    ++backptr;
+                    ++foreptr;
+                }
+                current_length = (unsigned)(foreptr - &in[pos]);
+
+                if (current_length > length)
+                {
+                    length = current_length; /*the longest length*/
+                    offset = current_offset; /*the offset that is related to this longest length*/
+                    /*jump out once a length of max length is found (speed gain). This also jumps
+                    out if length is MAX_SUPPORTED_DEFLATE_LENGTH*/
+                    if (current_length >= nicematch) break;
+                }
+            }
+
+            if (hashpos == hash->chain[hashpos]) break; /*entry points at itself: uninitialized, the chain ends here*/
+
+            if (numzeros >= 3 && length > numzeros)
+            {
+                hashpos = hash->chainz[hashpos];
+                if (hash->zeros[hashpos] != numzeros) break;
+            }
+            else
+            {
+                hashpos = hash->chain[hashpos];
+                /*outdated hash value, happens if particular value was not encountered in whole last window*/
+                if (hash->val[hashpos] != (int)hashval) break;
+            }
+        }
+
+        if (lazymatching)
+        {
+            if (!lazy && length >= 3 && length <= maxlazymatch && length < MAX_SUPPORTED_DEFLATE_LENGTH)
+            {
+                lazy = 1; /*hold this match back and first look at what the next position offers*/
+                lazylength = length;
+                lazyoffset = offset;
+                continue; /*try the next byte*/
+            }
+            if (lazy)
+            {
+                lazy = 0;
+                if (pos == 0) ERROR_BREAK(81); /*both branches below refer to the position before pos*/
+                if (length > lazylength + 1)
+                {
+                    /*push the previous character as literal*/
+                    if (!uivector_push_back(out, in[pos - 1])) ERROR_BREAK(83 /*alloc fail*/);
+                }
+                else
+                {
+                    length = lazylength;
+                    offset = lazyoffset;
+                    hash->head[hashval] = -1; /*the same hashchain update will be done, this ensures no wrong alteration*/
+                    hash->headz[numzeros] = -1; /*idem*/
+                    --pos; /*go back so that the held back match is emitted for the previous position*/
+                }
+            }
+        }
+        if (length >= 3 && offset > windowsize) ERROR_BREAK(86 /*too big (or overflown negative) offset*/);
+
+        /*encode it as length/distance pair or literal value*/
+        if (length < 3) /*only lengths of 3 or higher are supported as length/distance pair*/
+        {
+            if (!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        else if (length < minmatch || (length == 3 && offset > 4096))
+        {
+            /*compensate for the fact that longer offsets have more extra bits, a
+            length of only 3 may be not worth it then*/
+            if (!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        else
+        {
+            addLengthDistance(out, length, offset);
+            for (i = 1; i < length; ++i) /*also enter the positions covered by the match into the hash*/
+            {
+                ++pos;
+                wpos = pos & (windowsize - 1);
+                hashval = getHash(in, insize, pos);
+                if (usezeros && hashval == 0)
+                {
+                    if (numzeros == 0) numzeros = countZeros(in, insize, pos);
+                    else if (pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+                }
+                else
+                {
+                    numzeros = 0;
+                }
+                updateHashChain(hash, wpos, hashval, numzeros);
+            }
+        }
+    } /*end of the loop through each character of input*/
+
+    return error;
+}
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*Append data as non compressed deflate blocks of at most 65535 bytes each. Per block:
+1 bit BFINAL, 2 bits BTYPE, (5 bits): it jumps to start of next byte, 2 bytes LEN, 2 bytes NLEN,
+LEN bytes literal DATA. Returns 0 on success, 83 if out could not grow. Empty input writes nothing.*/
+static unsigned deflateNoCompression(ucvector* out, const unsigned char* data, size_t datasize)
+{
+    size_t i, j, numdeflateblocks = (datasize + 65534) / 65535;
+    size_t datapos = 0; /*size_t, not unsigned: the position must not wrap for inputs of 4 GiB or more*/
+    for (i = 0; i != numdeflateblocks; ++i)
+    {
+        unsigned BFINAL, BTYPE, LEN, NLEN;
+        unsigned char firstbyte;
+
+        BFINAL = (i == numdeflateblocks - 1);
+        BTYPE = 0;
+
+        firstbyte = (unsigned char)(BFINAL + ((BTYPE & 1) << 1) + ((BTYPE & 2) << 1));
+        if (!ucvector_push_back(out, firstbyte)) return 83; /*alloc fail*/
+
+        LEN = 65535;
+        if (datasize - datapos < 65535) LEN = (unsigned)(datasize - datapos);
+        NLEN = 65535 - LEN;
+
+        if (!ucvector_push_back(out, (unsigned char)(LEN & 255))) return 83; /*alloc fail*/
+        if (!ucvector_push_back(out, (unsigned char)(LEN >> 8))) return 83; /*alloc fail*/
+        if (!ucvector_push_back(out, (unsigned char)(NLEN & 255))) return 83; /*alloc fail*/
+        if (!ucvector_push_back(out, (unsigned char)(NLEN >> 8))) return 83; /*alloc fail*/
+
+        /*Decompressed data: exactly the LEN bytes announced above*/
+        for (j = 0; j != LEN; ++j)
+        {
+            if (!ucvector_push_back(out, data[datapos++])) return 83; /*alloc fail*/
+        }
+    }
+
+    return 0;
+}
+
+/*
+Write the lz77-encoded data, which has lit, len and dist codes, to compressed stream using huffman trees.
+tree_ll: the tree for lit and len codes. tree_d: the tree for distance codes.
+A length code (a symbol above 256) takes four consecutive entries of lz77_encoded:
+the length code itself, its extra bits, the distance code and the distance extra bits.
+*/
+static void writeLZ77data(size_t* bp, ucvector* out, const uivector* lz77_encoded,
+    const HuffmanTree* tree_ll, const HuffmanTree* tree_d)
+{
+    size_t next = 0; /*index of the next unread entry*/
+
+    while (next != lz77_encoded->size)
+    {
+        unsigned len_extra, dist_code, dist_extra;
+        const unsigned symbol = lz77_encoded->data[next++];
+
+        addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_ll, symbol), HuffmanTree_getLength(tree_ll, symbol));
+        if (symbol <= 256) continue; /*literal or end code: nothing more to write*/
+
+        /*fetch the three entries that accompany a length code*/
+        len_extra = lz77_encoded->data[next];
+        dist_code = lz77_encoded->data[next + 1];
+        dist_extra = lz77_encoded->data[next + 2];
+        next += 3;
+
+        /*length extra bits, then the distance symbol, then the distance extra bits*/
+        addBitsToStream(bp, out, len_extra, LENGTHEXTRA[symbol - FIRST_LENGTH_CODE_INDEX]);
+        addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_d, dist_code),
+            HuffmanTree_getLength(tree_d, dist_code));
+        addBitsToStream(bp, out, dist_extra, DISTANCEEXTRA[dist_code]);
+    }
+}
+
+/*Deflate data[datapos..dataend) as a block of type "dynamic", that is, with freely, optimally, created huffman trees. final is written as BFINAL. Returns 0 or an error code.*/
+static unsigned deflateDynamic(ucvector* out, size_t* bp, Hash* hash,
+    const unsigned char* data, size_t datapos, size_t dataend,
+    const LodePNGCompressSettings* settings, unsigned final)
+{
+    unsigned error = 0;
+
+    /*
+    A block is compressed as follows: The PNG data is lz77 encoded, resulting in
+    literal bytes and length/distance pairs. This is then huffman compressed with
+    two huffman trees. One huffman tree is used for the lit and len values ("ll"),
+    another huffman tree is used for the dist values ("d"). These two trees are
+    stored using their code lengths, and to compress even more these code lengths
+    are also run-length encoded and huffman compressed. This gives a huffman tree
+    of code lengths "cl". The code lengths used to describe this third tree are
+    the code length code lengths ("clcl").
+    */
+
+    /*The lz77 encoded data, represented with integers since there will also be length and distance codes in it*/
+    uivector lz77_encoded;
+    HuffmanTree tree_ll; /*tree for lit,len values*/
+    HuffmanTree tree_d; /*tree for distance codes*/
+    HuffmanTree tree_cl; /*tree for encoding the code lengths representing tree_ll and tree_d*/
+    uivector frequencies_ll; /*frequency of lit,len codes*/
+    uivector frequencies_d; /*frequency of dist codes*/
+    uivector frequencies_cl; /*frequency of code length codes*/
+    uivector bitlen_lld; /*lit,len,dist code lengths (in bits), literally (without repeat codes).*/
+    uivector bitlen_lld_e; /*bitlen_lld encoded with repeat codes (this is a rudimentary run length compression)*/
+    /*bitlen_cl is the code length code lengths ("clcl"). The bit lengths of codes to represent tree_cl
+    (these are written as is in the file, it would be crazy to compress these using yet another huffman
+    tree that needs to be represented by yet another set of code lengths)*/
+    uivector bitlen_cl;
+    size_t datasize = dataend - datapos;
+
+    /*
+    Due to the huffman compression of huffman tree representations ("two levels"), there are some analogies:
+    bitlen_lld is to tree_cl what data is to tree_ll and tree_d.
+    bitlen_lld_e is to bitlen_lld what lz77_encoded is to data.
+    bitlen_cl is to bitlen_lld_e what bitlen_lld is to lz77_encoded.
+    */
+
+    unsigned BFINAL = final;
+    size_t numcodes_ll, numcodes_d, i;
+    unsigned HLIT, HDIST, HCLEN;
+
+    uivector_init(&lz77_encoded);
+    HuffmanTree_init(&tree_ll);
+    HuffmanTree_init(&tree_d);
+    HuffmanTree_init(&tree_cl);
+    uivector_init(&frequencies_ll);
+    uivector_init(&frequencies_d);
+    uivector_init(&frequencies_cl);
+    uivector_init(&bitlen_lld);
+    uivector_init(&bitlen_lld_e);
+    uivector_init(&bitlen_cl);
+
+    /*This while loop never loops due to a break at the end, it is here to
+    allow breaking out of it to the cleanup phase on error conditions.*/
+    while (!error)
+    {
+        if (settings->use_lz77)
+        {
+            error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+                settings->minmatch, settings->nicematch, settings->lazymatching);
+            if (error) break;
+        }
+        else
+        {
+            if (!uivector_resize(&lz77_encoded, datasize)) ERROR_BREAK(83 /*alloc fail*/);
+            for (i = datapos; i < dataend; ++i) lz77_encoded.data[i - datapos] = data[i]; /*no LZ77, but still will be Huffman compressed*/
+        }
+
+        if (!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83 /*alloc fail*/);
+        if (!uivector_resizev(&frequencies_d, 30, 0)) ERROR_BREAK(83 /*alloc fail*/);
+
+        /*Count the frequencies of lit, len and dist codes*/
+        for (i = 0; i != lz77_encoded.size; ++i)
+        {
+            unsigned symbol = lz77_encoded.data[i];
+            ++frequencies_ll.data[symbol];
+            if (symbol > 256)
+            {
+                unsigned dist = lz77_encoded.data[i + 2]; /*the distance code is the third of the four entries*/
+                ++frequencies_d.data[dist];
+                i += 3; /*skip the three entries that accompany a length code*/
+            }
+        }
+        frequencies_ll.data[256] = 1; /*there will be exactly 1 end code, at the end of the block*/
+
+        /*Make both huffman trees, one for the lit and len codes, one for the dist codes*/
+        error = HuffmanTree_makeFromFrequencies(&tree_ll, frequencies_ll.data, 257, frequencies_ll.size, 15);
+        if (error) break;
+        /*2, not 1, is chosen for mincodes: some buggy PNG decoders require at least 2 symbols in the dist tree*/
+        error = HuffmanTree_makeFromFrequencies(&tree_d, frequencies_d.data, 2, frequencies_d.size, 15);
+        if (error) break;
+
+        numcodes_ll = tree_ll.numcodes; if (numcodes_ll > 286) numcodes_ll = 286;
+        numcodes_d = tree_d.numcodes; if (numcodes_d > 30) numcodes_d = 30;
+        /*store the code lengths of both generated trees in bitlen_lld (NOTE(review): push_back results are not checked here)*/
+        for (i = 0; i != numcodes_ll; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_ll, (unsigned)i));
+        for (i = 0; i != numcodes_d; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_d, (unsigned)i));
+
+        /*run-length compress bitlen_lld into bitlen_lld_e by using repeat codes 16 (copy length 3-6 times),
+        17 (3-10 zeroes), 18 (11-138 zeroes)*/
+        for (i = 0; i != (unsigned)bitlen_lld.size; ++i)
+        {
+            unsigned j = 0; /*amount of repetitions*/
+            while (i + j + 1 < (unsigned)bitlen_lld.size && bitlen_lld.data[i + j + 1] == bitlen_lld.data[i]) ++j;
+
+            if (bitlen_lld.data[i] == 0 && j >= 2) /*repeat code for zeroes*/
+            {
+                ++j; /*include the first zero*/
+                if (j <= 10) /*repeat code 17 supports max 10 zeroes*/
+                {
+                    uivector_push_back(&bitlen_lld_e, 17);
+                    uivector_push_back(&bitlen_lld_e, j - 3);
+                }
+                else /*repeat code 18 supports max 138 zeroes*/
+                {
+                    if (j > 138) j = 138;
+                    uivector_push_back(&bitlen_lld_e, 18);
+                    uivector_push_back(&bitlen_lld_e, j - 11);
+                }
+                i += (j - 1);
+            }
+            else if (j >= 3) /*repeat code for value other than zero*/
+            {
+                size_t k;
+                unsigned num = j / 6, rest = j % 6;
+                uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+                for (k = 0; k < num; ++k)
+                {
+                    uivector_push_back(&bitlen_lld_e, 16);
+                    uivector_push_back(&bitlen_lld_e, 6 - 3);
+                }
+                if (rest >= 3)
+                {
+                    uivector_push_back(&bitlen_lld_e, 16);
+                    uivector_push_back(&bitlen_lld_e, rest - 3);
+                }
+                else j -= rest; /*fewer than 3 left over: leave them for the next iterations*/
+                i += j;
+            }
+            else /*too short to benefit from repeat code*/
+            {
+                uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+            }
+        }
+
+        /*generate tree_cl, the huffmantree of huffmantrees*/
+
+        if (!uivector_resizev(&frequencies_cl, NUM_CODE_LENGTH_CODES, 0)) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != bitlen_lld_e.size; ++i)
+        {
+            ++frequencies_cl.data[bitlen_lld_e.data[i]];
+            /*after a repeat code come the bits that specify the number of repetitions,
+            those don't need to be in the frequencies_cl calculation*/
+            if (bitlen_lld_e.data[i] >= 16) ++i;
+        }
+
+        error = HuffmanTree_makeFromFrequencies(&tree_cl, frequencies_cl.data,
+            frequencies_cl.size, frequencies_cl.size, 7);
+        if (error) break;
+
+        if (!uivector_resize(&bitlen_cl, tree_cl.numcodes)) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != tree_cl.numcodes; ++i)
+        {
+            /*lengths of code length tree is in the order as specified by deflate*/
+            bitlen_cl.data[i] = HuffmanTree_getLength(&tree_cl, CLCL_ORDER[i]);
+        }
+        while (bitlen_cl.data[bitlen_cl.size - 1] == 0 && bitlen_cl.size > 4)
+        {
+            /*remove zeros at the end, but minimum size must be 4*/
+            if (!uivector_resize(&bitlen_cl, bitlen_cl.size - 1)) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        if (error) break; /*ERROR_BREAK above only left the inner while loop*/
+
+        /*
+        Write everything into the output
+
+        After the BFINAL and BTYPE, the dynamic block consists out of the following:
+        - 5 bits HLIT, 5 bits HDIST, 4 bits HCLEN
+        - (HCLEN+4)*3 bits code lengths of code length alphabet
+        - HLIT + 257 code lengths of lit/length alphabet (encoded using the code length
+        alphabet, + possible repetition codes 16, 17, 18)
+        - HDIST + 1 code lengths of distance alphabet (encoded using the code length
+        alphabet, + possible repetition codes 16, 17, 18)
+        - compressed data
+        - 256 (end code)
+        */
+
+        /*Write block type*/
+        addBitToStream(bp, out, BFINAL);
+        addBitToStream(bp, out, 0); /*first bit of BTYPE "dynamic"*/
+        addBitToStream(bp, out, 1); /*second bit of BTYPE "dynamic"*/
+
+        /*write the HLIT, HDIST and HCLEN values*/
+        HLIT = (unsigned)(numcodes_ll - 257);
+        HDIST = (unsigned)(numcodes_d - 1);
+        HCLEN = (unsigned)bitlen_cl.size - 4;
+        /*trim zeroes for HCLEN. HLIT and HDIST were already trimmed at tree creation*/
+        while (!bitlen_cl.data[HCLEN + 4 - 1] && HCLEN > 0) --HCLEN;
+        addBitsToStream(bp, out, HLIT, 5);
+        addBitsToStream(bp, out, HDIST, 5);
+        addBitsToStream(bp, out, HCLEN, 4);
+
+        /*write the code lengths of the code length alphabet*/
+        for (i = 0; i != HCLEN + 4; ++i) addBitsToStream(bp, out, bitlen_cl.data[i], 3);
+
+        /*write the lengths of the lit/len AND the dist alphabet*/
+        for (i = 0; i != bitlen_lld_e.size; ++i)
+        {
+            addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_cl, bitlen_lld_e.data[i]),
+                HuffmanTree_getLength(&tree_cl, bitlen_lld_e.data[i]));
+            /*extra bits of repeat codes*/
+            if (bitlen_lld_e.data[i] == 16) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 2);
+            else if (bitlen_lld_e.data[i] == 17) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 3);
+            else if (bitlen_lld_e.data[i] == 18) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 7);
+        }
+
+        /*write the compressed data symbols*/
+        writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+        /*error: the length of the end code 256 must be larger than 0*/
+        if (HuffmanTree_getLength(&tree_ll, 256) == 0) ERROR_BREAK(64);
+
+        /*write the end code*/
+        addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+
+        break; /*end of error-while*/
+    }
+
+    /*cleanup*/
+    uivector_cleanup(&lz77_encoded);
+    HuffmanTree_cleanup(&tree_ll);
+    HuffmanTree_cleanup(&tree_d);
+    HuffmanTree_cleanup(&tree_cl);
+    uivector_cleanup(&frequencies_ll);
+    uivector_cleanup(&frequencies_d);
+    uivector_cleanup(&frequencies_cl);
+    uivector_cleanup(&bitlen_lld_e);
+    uivector_cleanup(&bitlen_lld);
+    uivector_cleanup(&bitlen_cl);
+
+    return error;
+}
+
+/*Deflate data[datapos..dataend) as one block of type "fixed", that is, coded with the predefined
+huffman trees, after LZ77 matching if settings->use_lz77 is set. final is written as the BFINAL bit.
+Returns 0, or the error code of encodeLZ77; in that case the end code is not written.*/
+static unsigned deflateFixed(ucvector* out, size_t* bp, Hash* hash,
+    const unsigned char* data,
+    size_t datapos, size_t dataend,
+    const LodePNGCompressSettings* settings, unsigned final)
+{
+    HuffmanTree tree_ll; /*tree for literal values and length codes*/
+    HuffmanTree tree_d; /*tree for distance codes*/
+    unsigned error = 0;
+    size_t pos;
+
+    HuffmanTree_init(&tree_ll);
+    HuffmanTree_init(&tree_d);
+    generateFixedLitLenTree(&tree_ll);
+    generateFixedDistanceTree(&tree_d);
+
+    /*block header: BFINAL, then the two BTYPE bits*/
+    addBitToStream(bp, out, final);
+    addBitToStream(bp, out, 1); /*first bit of BTYPE*/
+    addBitToStream(bp, out, 0); /*second bit of BTYPE*/
+
+    if (!settings->use_lz77)
+    {
+        /*no LZ77, but still will be Huffman compressed: each byte goes out as a literal*/
+        for (pos = datapos; pos < dataend; ++pos)
+        {
+            addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, data[pos]), HuffmanTree_getLength(&tree_ll, data[pos]));
+        }
+    }
+    else
+    {
+        uivector lz77_encoded;
+        uivector_init(&lz77_encoded);
+        error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+            settings->minmatch, settings->nicematch, settings->lazymatching);
+        if (error == 0) writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+        uivector_cleanup(&lz77_encoded);
+    }
+
+    /*add END code*/
+    if (error == 0) addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+    HuffmanTree_cleanup(&tree_ll);
+    HuffmanTree_cleanup(&tree_d);
+    return error;
+}
+
+/*Deflate in[0..insize) and append the result to out. settings->btype selects the block type:
+0 = stored, 1 = fixed huffman trees, 2 = dynamic huffman trees. Returns 0 or an error code (61: bad btype).*/
+static unsigned lodepng_deflatev(ucvector* out, const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings)
+{
+    unsigned error = 0;
+    size_t i, blocksize, numdeflateblocks;
+    size_t bp = 0; /*the bit pointer*/
+    Hash hash;
+
+    if (settings->btype > 2) return 61;
+    else if (settings->btype == 0) return deflateNoCompression(out, in, insize);
+    else if (settings->btype == 1) blocksize = insize;
+    else /*if(settings->btype == 2)*/
+    {
+        /*on PNGs, deflate blocks of 65-262k seem to give most dense encoding*/
+        blocksize = insize / 8 + 8;
+        if (blocksize < 65536) blocksize = 65536;
+        if (blocksize > 262144) blocksize = 262144;
+    }
+
+    /*Empty input still gets one (empty) block. Testing insize first also avoids dividing by
+    blocksize when it is 0, which is the case for btype 1 with empty input.*/
+    numdeflateblocks = insize ? (insize + blocksize - 1) / blocksize : 1;
+
+    /*on failure the loop is skipped and hash_cleanup frees what hash_init did allocate (it sets all six pointers first)*/
+    error = hash_init(&hash, settings->windowsize);
+    for (i = 0; i != numdeflateblocks && !error; ++i)
+    {
+        unsigned final = (i == numdeflateblocks - 1);
+        size_t start = i * blocksize;
+        size_t end = start + blocksize;
+        if (end > insize) end = insize;
+
+        if (settings->btype == 1) error = deflateFixed(out, &bp, &hash, in, start, end, settings, final);
+        else if (settings->btype == 2) error = deflateDynamic(out, &bp, &hash, in, start, end, settings, final);
+    }
+    hash_cleanup(&hash);
+    return error;
+}
+
+/*Deflate in[0..insize) into the buffer given by *out and *outsize, both updated on return. Returns 0 or an error code.*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize, const LodePNGCompressSettings* settings)
+{
+    ucvector buffer;
+    unsigned result;
+    ucvector_init_buffer(&buffer, *out, *outsize);
+    result = lodepng_deflatev(&buffer, in, insize, settings);
+    *outsize = buffer.size;
+    *out = buffer.data;
+    return result;
+}
+
+/*Deflate with the custom_deflate function from the settings when one is set, else with lodepng_deflate.*/
+static unsigned deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings)
+{
+    if (!settings->custom_deflate)
+    {
+        /*default path: the deflate implementation of this file*/
+        return lodepng_deflate(out, outsize, in, insize, settings);
+    }
+    /*user hook: gets exactly the arguments passed to this function*/
+    return settings->custom_deflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Adler32                                                                  */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Continue an Adler-32 checksum with the bytes data[0..len-1]. adler is the checksum so far (1 to
+start a new one). Returns the new checksum: second sum in the upper 16 bits, first sum in the lower.*/
+static unsigned update_adler32(unsigned adler, const unsigned char* data, unsigned len)
+{
+    unsigned low = adler & 0xffff;
+    unsigned high = (adler >> 16) & 0xffff;
+
+    while (len != 0)
+    {
+        /*at least 5550 sums can be done before the sums overflow, saving a lot of modulo divisions*/
+        unsigned chunk = len < 5550 ? len : 5550;
+        len -= chunk;
+        for (; chunk != 0; --chunk)
+        {
+            low += *data++;
+            high += low;
+        }
+        low %= 65521;
+        high %= 65521;
+    }
+    return (high << 16) | low;
+}
+
+/*Adler-32 of the bytes data[0..len-1], computed by starting update_adler32 from the value 1*/
+static unsigned adler32(const unsigned char* data, unsigned len)
+{
+    return update_adler32(1u, data, len);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Zlib                                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*Decompress the zlib stream in[0..insize) into *out / *outsize: check the 2-byte header, inflate the
+deflate data and, unless settings->ignore_adler32 is set, verify the ADLER32 trailer.
+Returns 0 on success, else an error code (53, 24, 25, 26, 58 as explained below, or the inflate error).*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    unsigned error = 0;
+    unsigned CM, CINFO, FDICT;
+
+    if (insize < 2) return 53; /*error, size of zlib data too small*/
+
+    /*read information from zlib header*/
+    /*error: 256 * in[0] + in[1] must be a multiple of 31, the FCHECK value is supposed to be made that way*/
+    if ((in[0] * 256 + in[1]) % 31 != 0) return 24;
+
+    CM = in[0] & 15;
+    CINFO = (in[0] >> 4) & 15;
+    /*FCHECK = in[1] & 31;*/ /*FCHECK is already tested above*/
+    FDICT = (in[1] >> 5) & 1;
+    /*FLEVEL = (in[1] >> 6) & 3;*/ /*FLEVEL is not used here*/
+
+    /*error: only compression method 8: inflate with sliding window of 32k is supported by the PNG spec*/
+    if (CM != 8 || CINFO > 7) return 25;
+
+    /*error: the specification of PNG says about the zlib stream:
+    "The additional flags shall not specify a preset dictionary."*/
+    if (FDICT != 0) return 26;
+
+    error = inflate(out, outsize, in + 2, insize - 2, settings);
+    if (error) return error;
+
+    if (!settings->ignore_adler32)
+    {
+        unsigned ADLER32, checksum;
+        /*The trailer is the last 4 bytes and must lie behind the 2 header bytes. Without this check
+        a shorter input would make insize - 4 wrap around, or read header bytes as the checksum.*/
+        if (insize < 6) return 53; /*error, size of zlib data too small*/
+        ADLER32 = lodepng_read32bitInt(&in[insize - 4]);
+        checksum = adler32(*out, (unsigned)(*outsize));
+        if (checksum != ADLER32) return 58; /*error, adler checksum not correct, data must be corrupted*/
+    }
+
+    return 0; /*no error*/
+}
+
+/*decompress using the custom zlib function from the settings when one is set, else the built-in one*/
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    if (!settings->custom_zlib)
+    {
+        return lodepng_zlib_decompress(out, outsize, in, insize, settings);
+    }
+
+    /*user hook: gets exactly the arguments passed to this function*/
+    return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*Compress in[0..insize) into zlib data: 1 byte CMF (CM+CINFO), 1 byte FLG, deflate data, 4 byte ADLER32
+checksum of the Decompressed data. Initially, *out must be NULL and outsize 0, if you just give some random
+*out that's pointing to a non allocated buffer, this'll crash. Returns 0 or the error code of deflate.*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    ucvector outv;
+    size_t i;
+    unsigned error;
+    unsigned char* deflatedata = 0;
+    size_t deflatesize = 0;
+
+    unsigned CMF = 120; /*0b01111000: CM 8, CINFO 7. With CINFO 7, any window size up to 32768 can be used.*/
+    unsigned FLEVEL = 0;
+    unsigned FDICT = 0;
+    unsigned CMFFLG = 256 * CMF + FDICT * 32 + FLEVEL * 64;
+    unsigned FCHECK = 31 - CMFFLG % 31;
+    CMFFLG += FCHECK;
+
+    /*ucvector-controlled version of the output buffer, for dynamic array*/
+    ucvector_init_buffer(&outv, *out, *outsize);
+
+    ucvector_push_back(&outv, (unsigned char)(CMFFLG >> 8));
+    ucvector_push_back(&outv, (unsigned char)(CMFFLG & 255));
+
+    error = deflate(&deflatedata, &deflatesize, in, insize, settings);
+
+    if (!error)
+    {
+        unsigned ADLER32 = adler32(in, (unsigned)insize);
+        for (i = 0; i != deflatesize; ++i) ucvector_push_back(&outv, deflatedata[i]);
+        lodepng_add32bitInt(&outv, ADLER32);
+    }
+    /*free the temporary deflate buffer on every path: deflate may have allocated it before failing*/
+    lodepng_free(deflatedata);
+
+    *out = outv.data;
+    *outsize = outv.size;
+    return error;
+}
+
+/*Compress to zlib format: the user-supplied settings->custom_zlib callback takes precedence,
+the built-in lodepng_zlib_compress is the fallback. The chosen function's error code is returned.*/
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    if (!settings->custom_zlib)
+    {
+        return lodepng_zlib_compress(out, outsize, in, insize, settings);
+    }
+    return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#else /*no LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Variant for builds without LODEPNG_COMPILE_ZLIB: only a user-supplied custom_zlib
+callback can decompress; error 87 is returned when none is set.*/
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    if (settings->custom_zlib)
+    {
+        return settings->custom_zlib(out, outsize, in, insize, settings);
+    }
+    return 87; /*no custom zlib function provided */
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Variant for builds without LODEPNG_COMPILE_ZLIB: only a user-supplied custom_zlib
+callback can compress; error 87 is returned when none is set.*/
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    if (settings->custom_zlib)
+    {
+        return settings->custom_zlib(out, outsize, in, insize, settings);
+    }
+    return 87; /*no custom zlib function provided */
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*Default value for the `windowsize` compress setting, used by both
+lodepng_compress_settings_init and lodepng_default_compress_settings.
+This is a good tradeoff between speed and compression ratio.*/
+#define DEFAULT_WINDOWSIZE 2048
+
+/*Fill `settings` with the default compression parameters and clear all custom hooks.*/
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings)
+{
+    /*no user-supplied replacements for zlib/deflate, and no context for them*/
+    settings->custom_context = 0;
+    settings->custom_deflate = 0;
+    settings->custom_zlib = 0;
+
+    /*LZ77 matching parameters*/
+    settings->lazymatching = 1;
+    settings->nicematch = 128;
+    settings->minmatch = 3;
+    settings->windowsize = DEFAULT_WINDOWSIZE;
+    settings->use_lz77 = 1;
+
+    /*compress with dynamic huffman tree (not in the mathematical sense, just not the predefined one)*/
+    settings->btype = 2;
+}
+
+/*Statically initialised default compress settings. The values are the ones
+lodepng_compress_settings_init assigns (2, 1, DEFAULT_WINDOWSIZE, 3, 128, 1, then three
+zeroed custom hooks). NOTE(review): the initialiser is positional, so it relies on the
+member order of LodePNGCompressSettings - confirm against the struct declaration.*/
+const LodePNGCompressSettings lodepng_default_compress_settings = { 2, 1, DEFAULT_WINDOWSIZE, 3, 128, 1, 0, 0, 0 };
+
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*Fill `settings` with the default decompression parameters: the ADLER32 check stays
+enabled and no custom zlib/inflate hooks are installed.*/
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings)
+{
+    settings->custom_context = 0;
+    settings->custom_inflate = 0;
+    settings->custom_zlib = 0;
+
+    settings->ignore_adler32 = 0;
+}
+
+/*Statically initialised default decompress settings: every member is zero, which is
+what lodepng_decompress_settings_init assigns to the four fields it sets.*/
+const LodePNGDecompressSettings lodepng_default_decompress_settings = { 0, 0, 0, 0 };
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of Zlib related code. Begin of PNG related code.                 // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / CRC32                                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+
+#ifndef LODEPNG_NO_COMPILE_CRC
+/* CRC polynomial: 0xedb88320 */
+/* Precomputed 256-entry lookup table; lodepng_crc32 indexes it with the low byte of
+(running CRC xor input byte) to process one input byte per step. */
+static unsigned lodepng_crc32_table[256] = {
+    0u, 1996959894u, 3993919788u, 2567524794u,  124634137u, 1886057615u, 3915621685u, 2657392035u,
+    249268274u, 2044508324u, 3772115230u, 2547177864u,  162941995u, 2125561021u, 3887607047u, 2428444049u,
+    498536548u, 1789927666u, 4089016648u, 2227061214u,  450548861u, 1843258603u, 4107580753u, 2211677639u,
+    325883990u, 1684777152u, 4251122042u, 2321926636u,  335633487u, 1661365465u, 4195302755u, 2366115317u,
+    997073096u, 1281953886u, 3579855332u, 2724688242u, 1006888145u, 1258607687u, 3524101629u, 2768942443u,
+    901097722u, 1119000684u, 3686517206u, 2898065728u,  853044451u, 1172266101u, 3705015759u, 2882616665u,
+    651767980u, 1373503546u, 3369554304u, 3218104598u,  565507253u, 1454621731u, 3485111705u, 3099436303u,
+    671266974u, 1594198024u, 3322730930u, 2970347812u,  795835527u, 1483230225u, 3244367275u, 3060149565u,
+    1994146192u,   31158534u, 2563907772u, 4023717930u, 1907459465u,  112637215u, 2680153253u, 3904427059u,
+    2013776290u,  251722036u, 2517215374u, 3775830040u, 2137656763u,  141376813u, 2439277719u, 3865271297u,
+    1802195444u,  476864866u, 2238001368u, 4066508878u, 1812370925u,  453092731u, 2181625025u, 4111451223u,
+    1706088902u,  314042704u, 2344532202u, 4240017532u, 1658658271u,  366619977u, 2362670323u, 4224994405u,
+    1303535960u,  984961486u, 2747007092u, 3569037538u, 1256170817u, 1037604311u, 2765210733u, 3554079995u,
+    1131014506u,  879679996u, 2909243462u, 3663771856u, 1141124467u,  855842277u, 2852801631u, 3708648649u,
+    1342533948u,  654459306u, 3188396048u, 3373015174u, 1466479909u,  544179635u, 3110523913u, 3462522015u,
+    1591671054u,  702138776u, 2966460450u, 3352799412u, 1504918807u,  783551873u, 3082640443u, 3233442989u,
+    3988292384u, 2596254646u,   62317068u, 1957810842u, 3939845945u, 2647816111u,   81470997u, 1943803523u,
+    3814918930u, 2489596804u,  225274430u, 2053790376u, 3826175755u, 2466906013u,  167816743u, 2097651377u,
+    4027552580u, 2265490386u,  503444072u, 1762050814u, 4150417245u, 2154129355u,  426522225u, 1852507879u,
+    4275313526u, 2312317920u,  282753626u, 1742555852u, 4189708143u, 2394877945u,  397917763u, 1622183637u,
+    3604390888u, 2714866558u,  953729732u, 1340076626u, 3518719985u, 2797360999u, 1068828381u, 1219638859u,
+    3624741850u, 2936675148u,  906185462u, 1090812512u, 3747672003u, 2825379669u,  829329135u, 1181335161u,
+    3412177804u, 3160834842u,  628085408u, 1382605366u, 3423369109u, 3138078467u,  570562233u, 1426400815u,
+    3317316542u, 2998733608u,  733239954u, 1555261956u, 3268935591u, 3050360625u,  752459403u, 1541320221u,
+    2607071920u, 3965973030u, 1969922972u,   40735498u, 2617837225u, 3943577151u, 1913087877u,   83908371u,
+    2512341634u, 3803740692u, 2075208622u,  213261112u, 2463272603u, 3855990285u, 2094854071u,  198958881u,
+    2262029012u, 4057260610u, 1759359992u,  534414190u, 2176718541u, 4139329115u, 1873836001u,  414664567u,
+    2282248934u, 4279200368u, 1711684554u,  285281116u, 2405801727u, 4167216745u, 1634467795u,  376229701u,
+    2685067896u, 3608007406u, 1308918612u,  956543938u, 2808555105u, 3495958263u, 1231636301u, 1047427035u,
+    2932959818u, 3654703836u, 1088359270u,  936918000u, 2847714899u, 3736837829u, 1202900863u,  817233897u,
+    3183342108u, 3401237130u, 1404277552u,  615818150u, 3134207493u, 3453421203u, 1423857449u,  601450431u,
+    3009837614u, 3294710456u, 1567103746u,  711928724u, 3020668471u, 3272380065u, 1510334235u,  755167117u
+};
+
+/*Compute the CRC of data[0..length-1] with the table-driven method: the running value
+starts as all ones, each byte is folded in through lodepng_crc32_table, and the result
+is inverted (xor with all ones) before being returned.*/
+unsigned lodepng_crc32(const unsigned char* data, size_t length)
+{
+    unsigned crc = 0xffffffffu;
+    size_t pos = 0;
+    while (pos < length)
+    {
+        unsigned slot = (crc ^ data[pos]) & 0xff;
+        crc = (crc >> 8) ^ lodepng_crc32_table[slot];
+        ++pos;
+    }
+    return crc ^ 0xffffffffu;
+}
+#else /* !LODEPNG_NO_COMPILE_CRC */
+/*Declaration only: when LODEPNG_NO_COMPILE_CRC is defined the built-in table and
+implementation are left out, so a definition has to come from elsewhere.*/
+unsigned lodepng_crc32(const unsigned char* data, size_t length);
+#endif /* !LODEPNG_NO_COMPILE_CRC */
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Reading and writing single bits and bytes from/to stream for LodePNG   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Read the bit at position *bitpointer, counting bits from the most significant bit of
+each byte, and advance *bitpointer by one. Returns 0 or 1.*/
+static unsigned char readBitFromReversedStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+    const size_t pos = *bitpointer;
+    const size_t shift = 7 - (pos & 0x7); /*bit 0 of a byte is its MSB*/
+    *bitpointer = pos + 1;
+    return (unsigned char)((bitstream[pos >> 3] >> shift) & 1);
+}
+
+/*Read nbits consecutive bits starting at *bitpointer; the first bit read ends up as the
+most significant bit of the result. *bitpointer is advanced by nbits.*/
+static unsigned readBitsFromReversedStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+    unsigned value = 0;
+    size_t remaining;
+    for (remaining = nbits; remaining != 0; --remaining)
+    {
+        value = (value << 1) | (unsigned)readBitFromReversedStream(bitpointer, bitstream);
+    }
+    return value;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Write `bit` at position *bitpointer (MSB-first within each byte) and advance *bitpointer.
+Only ever ORs into the byte, so the target bit must currently be 0 for this to work.*/
+static void setBitOfReversedStream0(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+    const size_t pos = *bitpointer;
+    *bitpointer = pos + 1;
+    if (!bit) return; /*a 0 bit needs no write because the target is already 0*/
+    /*earlier bit of huffman code is in a lesser significant bit of an earlier byte*/
+    bitstream[pos >> 3] |= (bit << (7 - (pos & 0x7)));
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/*Write `bit` at position *bitpointer (MSB-first within each byte) and advance *bitpointer.
+Unlike setBitOfReversedStream0 this clears the target bit when `bit` is 0, so the
+current bit in bitstream may be 0 or 1.*/
+static void setBitOfReversedStream(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+    const size_t pos = *bitpointer;
+    const unsigned char mask = (unsigned char)(1 << (7 - (pos & 0x7)));
+    if (bit) bitstream[pos >> 3] |= mask;
+    else bitstream[pos >> 3] &= (unsigned char)(~mask);
+    *bitpointer = pos + 1;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG chunks                                                             / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Return the length field of a chunk: the 32-bit integer that lodepng_read32bitInt
+decodes from the first 4 bytes at `chunk`. The other chunk helpers treat this value as
+the size of the data area only, adding 12 for the surrounding fields.*/
+unsigned lodepng_chunk_length(const unsigned char* chunk)
+{
+    return lodepng_read32bitInt(&chunk[0]);
+}
+
+/*Copy the 4-byte chunk type stored at chunk[4..7] into `type` and null-terminate it,
+so `type` must have room for 5 chars.*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk)
+{
+    const unsigned char* name = chunk + 4;
+    type[0] = (char)name[0];
+    type[1] = (char)name[1];
+    type[2] = (char)name[2];
+    type[3] = (char)name[3];
+    type[4] = 0; /*null termination char*/
+}
+
+/*Return 1 if the chunk's 4-byte type at chunk[4..7] equals the string `type`, 0 otherwise.
+A `type` that is not exactly 4 characters long never matches.*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type)
+{
+    const unsigned char* name = chunk + 4;
+    unsigned k;
+    if (strlen(type) != 4) return 0;
+    for (k = 0; k != 4; ++k)
+    {
+        if (name[k] != type[k]) return 0;
+    }
+    return 1;
+}
+
+/*Return 1 if bit 5 (value 32) of the first type byte, chunk[4], is set, else 0.*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk)
+{
+    const unsigned char first_letter = chunk[4];
+    return (unsigned char)((first_letter >> 5) & 1);
+}
+
+/*Return 1 if bit 5 (value 32) of chunk[6] is set, else 0. This follows the original
+code, which tests chunk[6], i.e. the third letter of the chunk type.*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk)
+{
+    const unsigned char letter = chunk[6];
+    return (unsigned char)((letter >> 5) & 1);
+}
+
+/*Return 1 if bit 5 (value 32) of the last type byte, chunk[7], is set, else 0.*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk)
+{
+    const unsigned char last_letter = chunk[7];
+    return (unsigned char)((last_letter >> 5) & 1);
+}
+
+/*Return a pointer to the chunk's data area, which starts 8 bytes into the chunk
+(after the 4-byte length and the 4-byte type).*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk)
+{
+    return chunk + 8;
+}
+
+/*Const variant of lodepng_chunk_data: pointer to the data area 8 bytes into the chunk.*/
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk)
+{
+    return chunk + 8;
+}
+
+/*Verify the CRC stored at the end of the chunk. Returns 0 if it matches the CRC computed
+over the chunk, 1 if it does not.*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk)
+{
+    const unsigned data_length = lodepng_chunk_length(chunk);
+    const unsigned stored = lodepng_read32bitInt(&chunk[data_length + 8]);
+    /*the CRC is taken of the data and the 4 chunk type letters, not the length*/
+    const unsigned computed = lodepng_crc32(&chunk[4], data_length + 4);
+    return (stored != computed) ? 1 : 0;
+}
+
+/*Compute the CRC over the chunk's 4 type letters plus its data and store it in the
+4 bytes that follow the data (offset 8 + length).*/
+void lodepng_chunk_generate_crc(unsigned char* chunk)
+{
+    const unsigned data_length = lodepng_chunk_length(chunk);
+    unsigned char* crc_field = chunk + 8 + data_length;
+    lodepng_set32bitInt(crc_field, lodepng_crc32(&chunk[4], data_length + 4));
+}
+
+/*Return a pointer just past this chunk, i.e. where the next chunk would start. The size
+skipped is the length field's value plus 12 bytes (4 length + 4 type + 4 CRC). No bounds
+checking is done here.*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk)
+{
+    const unsigned skip = lodepng_chunk_length(chunk) + 12;
+    return chunk + skip;
+}
+
+/*Const variant of lodepng_chunk_next: pointer just past this chunk (length field value
+plus 12 bytes for the length, type and CRC fields). No bounds checking is done here.*/
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk)
+{
+    const unsigned skip = lodepng_chunk_length(chunk) + 12;
+    return chunk + skip;
+}
+
+/*
+Append a complete, already-formed chunk (length + type + data + CRC) to the buffer *out.
+*out is grown with lodepng_realloc and *outlength is updated to the new total size.
+Returns 0 on success, 77 on size overflow, 83 if the reallocation fails (in which case
+*out and *outlength are left untouched).
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk)
+{
+    const unsigned total_chunk_length = lodepng_chunk_length(chunk) + 12;
+    const size_t old_length = *outlength;
+    const size_t new_length = old_length + total_chunk_length;
+    unsigned char* grown;
+    unsigned pos;
+
+    if (new_length < total_chunk_length || new_length < old_length) return 77; /*integer overflow happened*/
+
+    grown = (unsigned char*)lodepng_realloc(*out, new_length);
+    if (!grown) return 83; /*alloc fail*/
+    *out = grown;
+    *outlength = new_length;
+
+    /*the new chunk goes right after the previous contents*/
+    for (pos = 0; pos != total_chunk_length; ++pos) grown[old_length + pos] = chunk[pos];
+
+    return 0;
+}
+
+/*
+Build a new chunk from its parts and append it to the buffer *out.
+length: number of bytes in `data`; type: the 4 chunk type letters (only type[0..3] are read).
+*out is grown with lodepng_realloc and *outlength is updated to the new total size.
+Returns 0 on success, 77 on size overflow, 83 if the reallocation fails.
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+    const char* type, const unsigned char* data)
+{
+    const size_t old_length = *outlength;
+    const size_t new_length = old_length + length + 12;
+    unsigned char* grown;
+    unsigned char* chunk;
+    unsigned k;
+
+    if (new_length < length + 12 || new_length < old_length) return 77; /*integer overflow happened*/
+    grown = (unsigned char*)lodepng_realloc(*out, new_length);
+    if (!grown) return 83; /*alloc fail*/
+    *out = grown;
+    *outlength = new_length;
+    chunk = grown + old_length; /*the new chunk starts where the old contents ended*/
+
+    /*1: length*/
+    lodepng_set32bitInt(chunk, (unsigned)length);
+
+    /*2: chunk name (4 letters)*/
+    for (k = 0; k != 4; ++k) chunk[4 + k] = (unsigned char)type[k];
+
+    /*3: the data*/
+    for (k = 0; k != length; ++k) chunk[8 + k] = data[k];
+
+    /*4: CRC (of the chunkname characters and the data)*/
+    lodepng_chunk_generate_crc(chunk);
+
+    return 0;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Color types and such                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Check whether bit depth `bd` is allowed for `colortype`.
+Returns a LodePNG error code: 0 if the combination is allowed, 37 if the bit depth is not
+allowed for this color type, 31 if the color type itself is not one of 0, 2, 3, 4, 6.*/
+static unsigned checkColorValidity(LodePNGColorType colortype, unsigned bd) /*bd = bitdepth*/
+{
+    const unsigned sub_byte_or_8 = (bd == 1 || bd == 2 || bd == 4 || bd == 8);
+    const unsigned is_8_or_16 = (bd == 8 || bd == 16);
+    unsigned allowed;
+    switch (colortype)
+    {
+    case 0: allowed = (sub_byte_or_8 || bd == 16); break; /*grey*/
+    case 3: allowed = sub_byte_or_8; break; /*palette*/
+    case 2: /*RGB*/
+    case 4: /*grey + alpha*/
+    case 6: allowed = is_8_or_16; break; /*RGBA*/
+    default: return 31;
+    }
+    return allowed ? 0 : 37;
+}
+
+/*Return the number of channels per pixel for a color type, or 0 for an unexisting one.*/
+static unsigned getNumColorChannels(LodePNGColorType colortype)
+{
+    unsigned channels = 0; /*stays 0 for an unexisting color type*/
+    switch (colortype)
+    {
+    case 0: /*grey*/
+    case 3: channels = 1; break; /*palette*/
+    case 4: channels = 2; break; /*grey + alpha*/
+    case 2: channels = 3; break; /*RGB*/
+    case 6: channels = 4; break; /*RGBA*/
+    default: break;
+    }
+    return channels;
+}
+
+/*Bits per pixel for a color type / bit depth pair: channel count times bits per channel.
+Yields 0 for an unexisting color type, because getNumColorChannels returns 0 for it.*/
+static unsigned lodepng_get_bpp_lct(LodePNGColorType colortype, unsigned bitdepth)
+{
+    const unsigned channels = getNumColorChannels(colortype);
+    return channels * bitdepth;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Initialise a color mode to the defaults: 8-bit RGBA, no color key, no palette.*/
+void lodepng_color_mode_init(LodePNGColorMode* info)
+{
+    info->colortype = LCT_RGBA;
+    info->bitdepth = 8;
+
+    info->palettesize = 0;
+    info->palette = 0;
+
+    info->key_defined = 0;
+    info->key_r = 0;
+    info->key_g = 0;
+    info->key_b = 0;
+}
+
+/*Release the resources owned by a color mode. The palette is the only thing released
+here; lodepng_palette_clear frees it and resets palette / palettesize to 0.*/
+void lodepng_color_mode_cleanup(LodePNGColorMode* info)
+{
+    lodepng_palette_clear(info);
+}
+
+/*
+Make `dest` a deep copy of `source`. The old palette of dest is released first, all
+fields are copied, and if source has a palette a fresh 1024-byte buffer is allocated for
+dest and the source's palettesize * 4 bytes are copied into it.
+Returns 0 on success, 83 if the palette allocation fails while source has entries.
+*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source)
+{
+    size_t pos, nbytes;
+    lodepng_color_mode_cleanup(dest);
+    *dest = *source;
+    if (!source->palette) return 0; /*nothing more to duplicate*/
+
+    /*dest must own its own buffer instead of sharing the pointer copied above*/
+    dest->palette = (unsigned char*)lodepng_malloc(1024);
+    if (!dest->palette && source->palettesize) return 83; /*alloc fail*/
+    nbytes = source->palettesize * 4;
+    for (pos = 0; pos != nbytes; ++pos) dest->palette[pos] = source->palette[pos];
+    return 0;
+}
+
+/*
+Return 1 if two color modes are identical, 0 otherwise. Compared are: color type, bit
+depth, whether a color key is defined (and its r/g/b when it is), the palette size and
+every palette byte. Palettes are always compared; an empty palette on one side is NOT
+treated as a wildcard.
+*/
+static int lodepng_color_mode_equal(const LodePNGColorMode* a, const LodePNGColorMode* b)
+{
+    size_t pos, nbytes;
+    if (a->colortype != b->colortype || a->bitdepth != b->bitdepth) return 0;
+    if (a->key_defined != b->key_defined) return 0;
+    if (a->key_defined
+        && (a->key_r != b->key_r || a->key_g != b->key_g || a->key_b != b->key_b))
+    {
+        return 0;
+    }
+    if (a->palettesize != b->palettesize) return 0;
+    nbytes = a->palettesize * 4; /*4 bytes per palette entry*/
+    for (pos = 0; pos != nbytes; ++pos)
+    {
+        if (a->palette[pos] != b->palette[pos]) return 0;
+    }
+    return 1;
+}
+
+/*Free the palette buffer (if any) and reset the color mode to "no palette".*/
+void lodepng_palette_clear(LodePNGColorMode* info)
+{
+    unsigned char* old_palette = info->palette;
+    if (old_palette) lodepng_free(old_palette);
+    info->palettesize = 0;
+    info->palette = 0;
+}
+
+/*
+Append one RGBA color to the palette of `info`.
+The palette buffer is allocated on first use with a fixed size of 1024 bytes, i.e. room
+for 256 colors with 4 bytes each; it is never grown afterwards.
+Returns 0 on success, 83 if the allocation fails, 38 if the palette already holds 256
+colors (NOTE(review): 38 is assumed to be this file's "palette too big" code - confirm
+against lodepng_error_text).
+*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    unsigned char* data;
+    if (!info->palette) /*allocate palette if empty*/
+    {
+        /*room for 256 colors with 4 bytes each*/
+        data = (unsigned char*)lodepng_realloc(info->palette, 1024);
+        if (!data) return 83; /*alloc fail*/
+        else info->palette = data;
+    }
+    /*the buffer holds exactly 256 entries: refuse to write past its end*/
+    if (info->palettesize >= 256) return 38;
+    info->palette[4 * info->palettesize + 0] = r;
+    info->palette[4 * info->palettesize + 1] = g;
+    info->palette[4 * info->palettesize + 2] = b;
+    info->palette[4 * info->palettesize + 3] = a;
+    ++info->palettesize;
+    return 0;
+}
+
+/*Bits per pixel of a color mode: forwards its colortype and bitdepth to
+lodepng_get_bpp_lct, which multiplies the channel count by the bit depth.*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info)
+{
+    /*calculate bits per pixel out of colortype and bitdepth*/
+    return lodepng_get_bpp_lct(info->colortype, info->bitdepth);
+}
+
+/*Number of channels per pixel of a color mode (see getNumColorChannels); 0 for an
+unexisting color type.*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info)
+{
+    return getNumColorChannels(info->colortype);
+}
+
+/*Return 1 if the color type is grey or grey + alpha, 0 otherwise.*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info)
+{
+    if (info->colortype == LCT_GREY) return 1;
+    if (info->colortype == LCT_GREY_ALPHA) return 1;
+    return 0;
+}
+
+/*Return 1 if the color type has its own alpha channel, 0 otherwise. This is a bit test:
+of the valid color types (0, 2, 3, 4, 6) only 4 (grey + alpha) and 6 (RGBA) have bit
+value 4 set.*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info)
+{
+    return (info->colortype & 4) != 0; /*4 or 6*/
+}
+
+/*Return 1 if the color type is LCT_PALETTE, 0 otherwise.*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info)
+{
+    return info->colortype == LCT_PALETTE;
+}
+
+/*Return 1 if at least one palette entry has an alpha byte below 255, 0 otherwise
+(including when the palette is empty).*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info)
+{
+    size_t entry = 0;
+    while (entry != info->palettesize)
+    {
+        /*alpha is the 4th byte of each 4-byte entry*/
+        if (info->palette[entry * 4 + 3] != 255) return 1;
+        ++entry;
+    }
+    return 0;
+}
+
+/*Return 1 if pixels in this color mode can be non-opaque: a color key is defined, the
+color type has an alpha channel, or the palette contains an alpha value below 255.
+The checks are made in that order and stop at the first hit.*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info)
+{
+    if (info->key_defined) return 1;
+    if (lodepng_is_alpha_type(info)) return 1;
+    return lodepng_has_palette_alpha(info) ? 1 : 0;
+}
+
+/*
+Size in bytes of a raw image of w x h pixels in the given color mode, with the total
+number of bits rounded up to whole bytes (rows are not padded individually).
+The pixel count is computed in size_t, so w * h no longer wraps at the range of
+unsigned; the result can still overflow if the byte count itself exceeds size_t.
+*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+    size_t bpp = lodepng_get_bpp(color);
+    /*widen before multiplying: w * h evaluated in unsigned would wrap first*/
+    size_t n = (size_t)w * (size_t)h;
+    /*split into whole groups of 8 pixels plus remainder to avoid computing n * bpp directly*/
+    return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+/*
+Size in bytes of a raw image of w x h pixels for a color type / bit depth pair, with the
+total number of bits rounded up to whole bytes (rows are not padded individually).
+The pixel count is computed in size_t, so w * h no longer wraps at the range of
+unsigned; the result can still overflow if the byte count itself exceeds size_t.
+*/
+size_t lodepng_get_raw_size_lct(unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+    size_t bpp = lodepng_get_bpp_lct(colortype, bitdepth);
+    /*widen before multiplying: w * h evaluated in unsigned would wrap first*/
+    size_t n = (size_t)w * (size_t)h;
+    /*split into whole groups of 8 pixels plus remainder to avoid computing n * bpp directly*/
+    return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_DECODER
+/*Size in bytes of w x h pixels when every scanline is rounded up to a whole number of
+bytes: in an idat chunk, each scanline is a multiple of 8 bits, unlike the lodepng
+output buffer.*/
+static size_t lodepng_get_raw_size_idat(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+    /*will not overflow for any color type if roughly w * h < 268435455*/
+    const size_t bpp = lodepng_get_bpp(color);
+    const size_t whole_groups = (w / 8) * bpp; /*bytes for complete groups of 8 pixels*/
+    const size_t tail_bytes = ((w & 7) * bpp + 7) / 8; /*remaining pixels, rounded up*/
+    const size_t line = whole_groups + tail_bytes;
+    return h * line;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*Reset the 3 unknown-chunk slots of `info` to "no data": null pointer and size 0.*/
+static void LodePNGUnknownChunks_init(LodePNGInfo* info)
+{
+    unsigned slot;
+    for (slot = 0; slot != 3; ++slot)
+    {
+        info->unknown_chunks_data[slot] = 0;
+        info->unknown_chunks_size[slot] = 0;
+    }
+}
+
+/*Free the data buffers of the 3 unknown-chunk slots. The pointers and sizes are not
+reset here.*/
+static void LodePNGUnknownChunks_cleanup(LodePNGInfo* info)
+{
+    unsigned slot = 0;
+    while (slot != 3)
+    {
+        lodepng_free(info->unknown_chunks_data[slot]);
+        ++slot;
+    }
+}
+
+/*
+Replace the unknown-chunk data of `dest` with a deep copy of that of `src`.
+The old buffers of dest are freed first; then for each of the 3 slots a buffer of the
+source size is allocated and filled byte by byte.
+Returns 0 on success, 83 if an allocation for a non-empty slot fails (slots after the
+failing one are not touched).
+*/
+static unsigned LodePNGUnknownChunks_copy(LodePNGInfo* dest, const LodePNGInfo* src)
+{
+    unsigned slot;
+
+    LodePNGUnknownChunks_cleanup(dest);
+
+    for (slot = 0; slot != 3; ++slot)
+    {
+        size_t pos = 0;
+        dest->unknown_chunks_size[slot] = src->unknown_chunks_size[slot];
+        dest->unknown_chunks_data[slot] = (unsigned char*)lodepng_malloc(src->unknown_chunks_size[slot]);
+        /*a null result is only an error when there is something to copy*/
+        if (!dest->unknown_chunks_data[slot] && dest->unknown_chunks_size[slot]) return 83; /*alloc fail*/
+        while (pos < src->unknown_chunks_size[slot])
+        {
+            dest->unknown_chunks_data[slot][pos] = src->unknown_chunks_data[slot][pos];
+            ++pos;
+        }
+    }
+
+    return 0;
+}
+
+/******************************************************************************/
+
+/*Reset the text key/string arrays of `info` to empty: no arrays, zero entries.*/
+static void LodePNGText_init(LodePNGInfo* info)
+{
+    info->text_strings = NULL;
+    info->text_keys = NULL;
+    info->text_num = 0;
+}
+
+/*Free every text key and string of `info` and then the two arrays holding them.
+text_num and the array pointers are not reset here.*/
+static void LodePNGText_cleanup(LodePNGInfo* info)
+{
+    size_t entry = 0;
+    while (entry != info->text_num)
+    {
+        string_cleanup(&info->text_keys[entry]);
+        string_cleanup(&info->text_strings[entry]);
+        ++entry;
+    }
+    lodepng_free(info->text_keys);
+    lodepng_free(info->text_strings);
+}
+
+/*Rebuild the text entries of `dest` as copies of those in `source`. The text fields of
+dest are overwritten with "empty" without being freed, then each source entry is added
+with lodepng_add_text. An error from lodepng_add_text is handled by CERROR_TRY_RETURN;
+0 is returned once every entry has been added.*/
+static unsigned LodePNGText_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    size_t entry;
+    dest->text_num = 0;
+    dest->text_strings = 0;
+    dest->text_keys = 0;
+    for (entry = 0; entry != source->text_num; ++entry)
+    {
+        CERROR_TRY_RETURN(lodepng_add_text(dest, source->text_keys[entry], source->text_strings[entry]));
+    }
+    return 0;
+}
+
+/*Remove all text entries from `info`: free them and reset the text fields to the empty
+state, so that `info` can safely receive new entries via lodepng_add_text or be cleaned
+up again afterwards.*/
+void lodepng_clear_text(LodePNGInfo* info)
+{
+    LodePNGText_cleanup(info);
+    /*cleanup leaves text_num and the freed array pointers in place; reset them so that
+    later use does not touch or free the released memory a second time*/
+    LodePNGText_init(info);
+}
+
+/*
+Append one key / string pair to the text entries of `info`.
+Both pointer arrays are grown by one slot with lodepng_realloc and the new slots are
+filled via string_init + string_set with `key` and `str`.
+Returns 0 on success, 83 if either array could not be grown; in that case text_num is
+unchanged and `info` still owns valid arrays.
+*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str)
+{
+    char** new_keys = (char**)(lodepng_realloc(info->text_keys, sizeof(char*) * (info->text_num + 1)));
+    char** new_strings = (char**)(lodepng_realloc(info->text_strings, sizeof(char*) * (info->text_num + 1)));
+
+    /*a successful realloc may have moved or released the old block, so adopt each new
+    pointer right away; a failed realloc leaves the old block valid and still owned by info.
+    Freeing the new pointers here would leave info pointing at released memory.*/
+    if (new_keys) info->text_keys = new_keys;
+    if (new_strings) info->text_strings = new_strings;
+    if (!new_keys || !new_strings) return 83; /*alloc fail*/
+
+    ++info->text_num;
+
+    string_init(&info->text_keys[info->text_num - 1]);
+    string_set(&info->text_keys[info->text_num - 1], key);
+
+    string_init(&info->text_strings[info->text_num - 1]);
+    string_set(&info->text_strings[info->text_num - 1], str);
+
+    return 0;
+}
+
+/******************************************************************************/
+
+/*Reset the four international-text arrays of `info` to empty: no arrays, zero entries.*/
+static void LodePNGIText_init(LodePNGInfo* info)
+{
+    info->itext_strings = NULL;
+    info->itext_transkeys = NULL;
+    info->itext_langtags = NULL;
+    info->itext_keys = NULL;
+    info->itext_num = 0;
+}
+
+/*Free every international-text key, language tag, translated key and string of `info`
+and then the four arrays holding them. itext_num and the array pointers are not reset here.*/
+static void LodePNGIText_cleanup(LodePNGInfo* info)
+{
+    size_t entry = 0;
+    while (entry != info->itext_num)
+    {
+        string_cleanup(&info->itext_keys[entry]);
+        string_cleanup(&info->itext_langtags[entry]);
+        string_cleanup(&info->itext_transkeys[entry]);
+        string_cleanup(&info->itext_strings[entry]);
+        ++entry;
+    }
+    lodepng_free(info->itext_keys);
+    lodepng_free(info->itext_langtags);
+    lodepng_free(info->itext_transkeys);
+    lodepng_free(info->itext_strings);
+}
+
+/*Rebuild the international-text entries of `dest` as copies of those in `source`. The
+itext fields of dest are overwritten with "empty" without being freed, then each source
+entry is added with lodepng_add_itext. An error from lodepng_add_itext is handled by
+CERROR_TRY_RETURN; 0 is returned once every entry has been added.*/
+static unsigned LodePNGIText_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    size_t entry;
+    dest->itext_num = 0;
+    dest->itext_strings = 0;
+    dest->itext_transkeys = 0;
+    dest->itext_langtags = 0;
+    dest->itext_keys = 0;
+    for (entry = 0; entry != source->itext_num; ++entry)
+    {
+        CERROR_TRY_RETURN(lodepng_add_itext(dest, source->itext_keys[entry], source->itext_langtags[entry],
+            source->itext_transkeys[entry], source->itext_strings[entry]));
+    }
+    return 0;
+}
+
+/*Remove all international-text entries from `info`: free them and reset the itext
+fields to the empty state, so that `info` can safely receive new entries via
+lodepng_add_itext or be cleaned up again afterwards.*/
+void lodepng_clear_itext(LodePNGInfo* info)
+{
+    LodePNGIText_cleanup(info);
+    /*cleanup leaves itext_num and the freed array pointers in place; reset them so that
+    later use does not touch or free the released memory a second time*/
+    LodePNGIText_init(info);
+}
+
+/*
+Append one international-text entry (key, language tag, translated key, string) to `info`.
+All four pointer arrays are grown by one slot with lodepng_realloc and the new slots are
+filled via string_init + string_set.
+Returns 0 on success, 83 if any array could not be grown; in that case itext_num is
+unchanged and `info` still owns valid arrays.
+*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+    const char* transkey, const char* str)
+{
+    char** new_keys = (char**)(lodepng_realloc(info->itext_keys, sizeof(char*) * (info->itext_num + 1)));
+    char** new_langtags = (char**)(lodepng_realloc(info->itext_langtags, sizeof(char*) * (info->itext_num + 1)));
+    char** new_transkeys = (char**)(lodepng_realloc(info->itext_transkeys, sizeof(char*) * (info->itext_num + 1)));
+    char** new_strings = (char**)(lodepng_realloc(info->itext_strings, sizeof(char*) * (info->itext_num + 1)));
+
+    /*a successful realloc may have moved or released the old block, so adopt each new
+    pointer right away; a failed realloc leaves the old block valid and still owned by info.
+    Freeing the new pointers here would leave info pointing at released memory.*/
+    if (new_keys) info->itext_keys = new_keys;
+    if (new_langtags) info->itext_langtags = new_langtags;
+    if (new_transkeys) info->itext_transkeys = new_transkeys;
+    if (new_strings) info->itext_strings = new_strings;
+    if (!new_keys || !new_langtags || !new_transkeys || !new_strings) return 83; /*alloc fail*/
+
+    ++info->itext_num;
+
+    string_init(&info->itext_keys[info->itext_num - 1]);
+    string_set(&info->itext_keys[info->itext_num - 1], key);
+
+    string_init(&info->itext_langtags[info->itext_num - 1]);
+    string_set(&info->itext_langtags[info->itext_num - 1], langtag);
+
+    string_init(&info->itext_transkeys[info->itext_num - 1]);
+    string_set(&info->itext_transkeys[info->itext_num - 1], transkey);
+
+    string_init(&info->itext_strings[info->itext_num - 1]);
+    string_set(&info->itext_strings[info->itext_num - 1], str);
+
+    return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Initialise a LodePNGInfo: default color mode, all method fields 0 and, when ancillary
+chunks are compiled in, no background / time / phys information and empty text, itext
+and unknown-chunk storage.*/
+void lodepng_info_init(LodePNGInfo* info)
+{
+    info->filter_method = 0;
+    info->compression_method = 0;
+    info->interlace_method = 0;
+    lodepng_color_mode_init(&info->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    LodePNGUnknownChunks_init(info);
+    LodePNGIText_init(info);
+    LodePNGText_init(info);
+
+    info->phys_defined = 0;
+    info->time_defined = 0;
+
+    info->background_defined = 0;
+    info->background_r = 0;
+    info->background_g = 0;
+    info->background_b = 0;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+/*Release everything a LodePNGInfo owns: the color mode's palette and, when ancillary
+chunks are compiled in, the text, itext and unknown-chunk buffers.*/
+void lodepng_info_cleanup(LodePNGInfo* info)
+{
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    LodePNGUnknownChunks_cleanup(info);
+    LodePNGIText_cleanup(info);
+    LodePNGText_cleanup(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    lodepng_color_mode_cleanup(&info->color);
+}
+
+/*
+Make `dest` a deep copy of `source`. dest must already be initialised, because it is
+cleaned up first. Returns 0 on success; a failing sub-copy is handled by CERROR_TRY_RETURN.
+The statement order matters: the struct assignment copies source's pointers into dest,
+so every owned member is reset afterwards and then rebuilt as a separate copy.
+*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    lodepng_info_cleanup(dest);
+    /*shallow copy of all plain fields; pointer members now alias source's buffers*/
+    *dest = *source;
+    /*drop the aliased palette pointer before the deep copy, which cleans up dest->color*/
+    lodepng_color_mode_init(&dest->color);
+    CERROR_TRY_RETURN(lodepng_color_mode_copy(&dest->color, &source->color));
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*both copy helpers zero dest's aliased array pointers themselves before re-adding entries*/
+    CERROR_TRY_RETURN(LodePNGText_copy(dest, source));
+    CERROR_TRY_RETURN(LodePNGIText_copy(dest, source));
+
+    /*reset the aliased unknown-chunk pointers first: the copy helper frees dest's slots*/
+    LodePNGUnknownChunks_init(dest);
+    CERROR_TRY_RETURN(LodePNGUnknownChunks_copy(dest, source));
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    return 0;
+}
+
+/*Exchange the contents of two LodePNGInfo structs by plain struct assignment; owned
+buffers change owner along with their pointers, nothing is allocated or freed.*/
+void lodepng_info_swap(LodePNGInfo* a, LodePNGInfo* b)
+{
+    const LodePNGInfo saved_a = *a;
+    *a = *b;
+    *b = saved_a;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Store a sub-byte value into a packed array, first bitgroup in the most significant bits.
+index: bitgroup index, bits: bitgroup size (1, 2 or 4), in: bitgroup value (only its
+lowest `bits` bits are used), out: octet array to add bits to.
+The first bitgroup of each byte overwrites the whole byte; later ones are ORed in, so
+bitgroups of a byte are expected to be written in ascending index order.*/
+static void addColorBits(unsigned char* out, size_t index, unsigned bits, unsigned in)
+{
+    unsigned last_slot; /*8 / bits - 1*/
+    unsigned slot;      /*position of this bitgroup inside its byte, 0 = first*/
+    unsigned value;
+    unsigned char* cell = &out[index * bits / 8];
+
+    if (bits == 1) last_slot = 7;
+    else if (bits == 2) last_slot = 3;
+    else last_slot = 1;
+
+    slot = index & last_slot;
+    value = in & ((1u << bits) - 1u); /*filter out any other bits of the input value*/
+    value <<= bits * (last_slot - slot);
+
+    if (slot == 0) *cell = value;
+    else *cell |= value;
+}
+
+typedef struct ColorTree ColorTree;
+
+/*
+One node of a color tree
+This is the data structure used to count the number of unique colors and to get a palette
+index for a color. It's like an octree, but because the alpha channel is used too, each
+node has 16 instead of 8 children.
+Each level consumes one bit of each of r, g, b and a (see color_tree_get / color_tree_add),
+so a full color is stored 8 levels below the root.
+*/
+struct ColorTree
+{
+    ColorTree* children[16]; /*up to 16 pointers to ColorTree of next level*/
+    int index; /*the payload. Only has a meaningful value if this is in the last level; -1 after color_tree_init*/
+};
+
+/*Initialise a node as an empty leaf: no children and index -1 ("no color stored").*/
+static void color_tree_init(ColorTree* tree)
+{
+    int child = 0;
+    tree->index = -1;
+    while (child != 16)
+    {
+        tree->children[child] = 0;
+        ++child;
+    }
+}
+
+/*Recursively free all descendants of `tree`. The node `tree` itself is not freed, and
+its child pointers are not reset.*/
+static void color_tree_cleanup(ColorTree* tree)
+{
+    int child;
+    for (child = 0; child != 16; ++child)
+    {
+        ColorTree* node = tree->children[child];
+        if (!node) continue;
+        color_tree_cleanup(node); /*release the subtree before the node that owns it*/
+        lodepng_free(node);
+    }
+}
+
+/*Look up an RGBA color: returns -1 if color not present, its index otherwise.
+The tree is walked 8 levels deep; at each level the child number is formed from one bit
+of each channel, starting with the least significant bits.*/
+static int color_tree_get(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    int level;
+    for (level = 0; level < 8; ++level)
+    {
+        int child = 8 * ((r >> level) & 1) + 4 * ((g >> level) & 1) + 2 * ((b >> level) & 1) + 1 * ((a >> level) & 1);
+        ColorTree* next = tree->children[child];
+        if (!next) return -1;
+        tree = next;
+    }
+    return tree->index;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Return nonzero if the RGBA color is present in the tree, i.e. color_tree_get finds an
+index >= 0 for it.*/
+static int color_tree_has(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    return color_tree_get(tree, r, g, b, a) >= 0;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*Store `index` for an RGBA color, creating the missing nodes on the 8-level path.
+color is not allowed to already exist.
+Index should be >= 0 (it's signed to be compatible with using -1 for "doesn't exist").
+If a node cannot be allocated the function returns early without storing the index, so
+the color simply stays absent: color_tree_get keeps returning -1 for it.*/
+static void color_tree_add(ColorTree* tree,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a, unsigned index)
+{
+    int bit;
+    for (bit = 0; bit < 8; ++bit)
+    {
+        int i = 8 * ((r >> bit) & 1) + 4 * ((g >> bit) & 1) + 2 * ((b >> bit) & 1) + 1 * ((a >> bit) & 1);
+        if (!tree->children[i])
+        {
+            ColorTree* node = (ColorTree*)lodepng_malloc(sizeof(ColorTree));
+            /*alloc fail: bail out instead of dereferencing NULL in color_tree_init.
+            Nodes created so far stay in the tree with index -1 and are released by
+            color_tree_cleanup.*/
+            if (!node) return;
+            color_tree_init(node);
+            tree->children[i] = node;
+        }
+        tree = tree->children[i];
+    }
+    tree->index = (int)index;
+}
+
+/*
+Put a pixel, given its 8-bit RGBA color, into an image of any color type.
+out: destination pixel buffer, i: pixel index, mode: color type and bit depth of `out`,
+tree: color tree used to look up the palette index (only read for LCT_PALETTE).
+Grey output takes the red channel as the grey value. For 16-bit output each 8-bit value
+is written to both bytes of the sample.
+Returns 0 on success, 82 if the mode is LCT_PALETTE and the color is not in the tree.
+*/
+static unsigned rgba8ToPixel(unsigned char* out, size_t i,
+    const LodePNGColorMode* mode, ColorTree* tree /*for palette*/,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+    {
+        unsigned char grey = r; /*((unsigned short)r + g + b) / 3*/
+        if (mode->bitdepth == 8)
+        {
+            out[i] = grey;
+        }
+        else if (mode->bitdepth == 16)
+        {
+            out[i * 2 + 1] = grey;
+            out[i * 2 + 0] = grey;
+        }
+        else
+        {
+            /*take the most significant bits of grey*/
+            grey = (grey >> (8 - mode->bitdepth)) & ((1 << mode->bitdepth) - 1);
+            addColorBits(out, i, mode->bitdepth, grey);
+        }
+        break;
+    }
+    case LCT_RGB:
+    {
+        if (mode->bitdepth == 8)
+        {
+            unsigned char* px = out + i * 3;
+            px[0] = r;
+            px[1] = g;
+            px[2] = b;
+        }
+        else
+        {
+            unsigned char* px = out + i * 6;
+            px[0] = px[1] = r;
+            px[2] = px[3] = g;
+            px[4] = px[5] = b;
+        }
+        break;
+    }
+    case LCT_PALETTE:
+    {
+        int index = color_tree_get(tree, r, g, b, a);
+        if (index < 0) return 82; /*color not in palette*/
+        if (mode->bitdepth == 8) out[i] = index;
+        else addColorBits(out, i, mode->bitdepth, (unsigned)index);
+        break;
+    }
+    case LCT_GREY_ALPHA:
+    {
+        unsigned char grey = r; /*((unsigned short)r + g + b) / 3*/
+        if (mode->bitdepth == 8)
+        {
+            out[i * 2 + 0] = grey;
+            out[i * 2 + 1] = a;
+        }
+        else if (mode->bitdepth == 16)
+        {
+            unsigned char* px = out + i * 4;
+            px[0] = px[1] = grey;
+            px[2] = px[3] = a;
+        }
+        /*any other bit depth: nothing is written*/
+        break;
+    }
+    case LCT_RGBA:
+    {
+        if (mode->bitdepth == 8)
+        {
+            unsigned char* px = out + i * 4;
+            px[0] = r;
+            px[1] = g;
+            px[2] = b;
+            px[3] = a;
+        }
+        else
+        {
+            unsigned char* px = out + i * 8;
+            px[0] = px[1] = r;
+            px[2] = px[3] = g;
+            px[4] = px[5] = b;
+            px[6] = px[7] = a;
+        }
+        break;
+    }
+    default:
+        break; /*unknown color type: nothing is written*/
+    }
+
+    return 0; /*no error*/
+}
+
+/*
+Writes one pixel, given as 16-bit RGBA, at pixel position i of an image buffer
+whose color mode is a 16-bit-per-channel type. Every channel is stored as two
+bytes, high byte first. Greyscale outputs take their value from r only.
+Color types other than grey, RGB, grey+alpha and RGBA write nothing.
+*/
+static void rgba16ToPixel(unsigned char* out, size_t i,
+    const LodePNGColorMode* mode,
+    unsigned short r, unsigned short g, unsigned short b, unsigned short a)
+{
+    unsigned char* px;
+
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+        px = out + i * 2;
+        px[0] = (unsigned char)(r >> 8);
+        px[1] = (unsigned char)(r & 255);
+        break;
+    case LCT_RGB:
+        px = out + i * 6;
+        px[0] = (unsigned char)(r >> 8);
+        px[1] = (unsigned char)(r & 255);
+        px[2] = (unsigned char)(g >> 8);
+        px[3] = (unsigned char)(g & 255);
+        px[4] = (unsigned char)(b >> 8);
+        px[5] = (unsigned char)(b & 255);
+        break;
+    case LCT_GREY_ALPHA:
+        px = out + i * 4;
+        px[0] = (unsigned char)(r >> 8);
+        px[1] = (unsigned char)(r & 255);
+        px[2] = (unsigned char)(a >> 8);
+        px[3] = (unsigned char)(a & 255);
+        break;
+    case LCT_RGBA:
+        px = out + i * 8;
+        px[0] = (unsigned char)(r >> 8);
+        px[1] = (unsigned char)(r & 255);
+        px[2] = (unsigned char)(g >> 8);
+        px[3] = (unsigned char)(g & 255);
+        px[4] = (unsigned char)(b >> 8);
+        px[5] = (unsigned char)(b & 255);
+        px[6] = (unsigned char)(a >> 8);
+        px[7] = (unsigned char)(a & 255);
+        break;
+    default:
+        break;
+    }
+}
+
+/*
+Reads the pixel with index i (y * width + x) from a raw image in the given
+color mode and returns it as 8-bit RGBA through r, g, b and a.
+- 16-bit channels are reduced to their first (most significant) byte.
+- Greyscale values below 8 bits are scaled up to the 0..255 range.
+- For grey and RGB, alpha is 0 when the pixel equals the mode's color key (if
+  one is defined) and 255 otherwise.
+- A palette index at or beyond palettesize yields opaque black.
+*/
+static void getPixelColorRGBA8(unsigned char* r, unsigned char* g,
+    unsigned char* b, unsigned char* a,
+    const unsigned char* in, size_t i,
+    const LodePNGColorMode* mode)
+{
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+    {
+        if (mode->bitdepth == 8)
+        {
+            *r = *g = *b = in[i];
+            *a = (mode->key_defined && *r == mode->key_r) ? 0 : 255;
+        }
+        else if (mode->bitdepth == 16)
+        {
+            const unsigned char* px = in + i * 2;
+            *r = *g = *b = px[0];
+            /*the color key is compared against the full 16-bit value*/
+            *a = (mode->key_defined && 256U * px[0] + px[1] == mode->key_r) ? 0 : 255;
+        }
+        else
+        {
+            unsigned highest = ((1U << mode->bitdepth) - 1U); /*highest possible value for this bit depth*/
+            size_t bitpos = i * mode->bitdepth;
+            unsigned value = readBitsFromReversedStream(&bitpos, in, mode->bitdepth);
+            *r = *g = *b = (value * 255) / highest;
+            *a = (mode->key_defined && value == mode->key_r) ? 0 : 255;
+        }
+        break;
+    }
+    case LCT_RGB:
+    {
+        if (mode->bitdepth == 8)
+        {
+            const unsigned char* px = in + i * 3;
+            *r = px[0];
+            *g = px[1];
+            *b = px[2];
+            *a = (mode->key_defined && *r == mode->key_r && *g == mode->key_g && *b == mode->key_b) ? 0 : 255;
+        }
+        else
+        {
+            const unsigned char* px = in + i * 6;
+            *r = px[0];
+            *g = px[2];
+            *b = px[4];
+            *a = (mode->key_defined
+                && 256U * px[0] + px[1] == mode->key_r
+                && 256U * px[2] + px[3] == mode->key_g
+                && 256U * px[4] + px[5] == mode->key_b) ? 0 : 255;
+        }
+        break;
+    }
+    case LCT_PALETTE:
+    {
+        unsigned index;
+        if (mode->bitdepth == 8)
+        {
+            index = in[i];
+        }
+        else
+        {
+            size_t bitpos = i * mode->bitdepth;
+            index = readBitsFromReversedStream(&bitpos, in, mode->bitdepth);
+        }
+
+        if (index < mode->palettesize)
+        {
+            const unsigned char* entry = &mode->palette[index * 4];
+            *r = entry[0];
+            *g = entry[1];
+            *b = entry[2];
+            *a = entry[3];
+        }
+        else
+        {
+            /*This is an error according to the PNG spec, but common PNG decoders make it black instead.
+            Done here too, slightly faster due to no error handling needed.*/
+            *r = *g = *b = 0;
+            *a = 255;
+        }
+        break;
+    }
+    case LCT_GREY_ALPHA:
+    {
+        if (mode->bitdepth == 8)
+        {
+            const unsigned char* px = in + i * 2;
+            *r = *g = *b = px[0];
+            *a = px[1];
+        }
+        else
+        {
+            const unsigned char* px = in + i * 4;
+            *r = *g = *b = px[0];
+            *a = px[2];
+        }
+        break;
+    }
+    case LCT_RGBA:
+    {
+        if (mode->bitdepth == 8)
+        {
+            const unsigned char* px = in + i * 4;
+            *r = px[0];
+            *g = px[1];
+            *b = px[2];
+            *a = px[3];
+        }
+        else
+        {
+            const unsigned char* px = in + i * 8;
+            *r = px[0];
+            *g = px[2];
+            *b = px[4];
+            *a = px[6];
+        }
+        break;
+    }
+    default:
+        break; /*unknown color type: outputs are left untouched*/
+    }
+}
+
+/*
+Bulk version of getPixelColorRGBA8: the per-pixel loops live inside the color
+mode cases, which makes converting to RGBA or RGB with 8 bits per channel much
+faster.
+buffer receives numpixels output pixels and must have enough memory: 4 bytes
+per pixel (RGBA) if has_alpha is true, otherwise 3 bytes per pixel (RGB).
+mode is the color mode of the input buffer "in".
+*/
+static void getPixelColorsRGBA8(unsigned char* buffer, size_t numpixels,
+    unsigned has_alpha, const unsigned char* in,
+    const LodePNGColorMode* mode)
+{
+    const unsigned stride = has_alpha ? 4 : 3; /*bytes per output pixel*/
+    unsigned char* dst = buffer;
+    size_t i;
+
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+    {
+        if (mode->bitdepth == 8)
+        {
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                dst[0] = dst[1] = dst[2] = in[i];
+                if (has_alpha)
+                {
+                    if (mode->key_defined && in[i] == mode->key_r) dst[3] = 0;
+                    else dst[3] = 255;
+                }
+            }
+        }
+        else if (mode->bitdepth == 16)
+        {
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                dst[0] = dst[1] = dst[2] = in[i * 2];
+                if (has_alpha)
+                {
+                    if (mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r) dst[3] = 0;
+                    else dst[3] = 255;
+                }
+            }
+        }
+        else
+        {
+            unsigned highest = ((1U << mode->bitdepth) - 1U); /*highest possible value for this bit depth*/
+            size_t bitpos = 0; /*advanced by every read, so pixels are consumed in sequence*/
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                unsigned value = readBitsFromReversedStream(&bitpos, in, mode->bitdepth);
+                dst[0] = dst[1] = dst[2] = (value * 255) / highest;
+                if (has_alpha)
+                {
+                    if (mode->key_defined && value == mode->key_r) dst[3] = 0;
+                    else dst[3] = 255;
+                }
+            }
+        }
+        break;
+    }
+    case LCT_RGB:
+    {
+        if (mode->bitdepth == 8)
+        {
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                dst[0] = in[i * 3 + 0];
+                dst[1] = in[i * 3 + 1];
+                dst[2] = in[i * 3 + 2];
+                if (has_alpha)
+                {
+                    if (mode->key_defined && dst[0] == mode->key_r
+                        && dst[1] == mode->key_g && dst[2] == mode->key_b) dst[3] = 0;
+                    else dst[3] = 255;
+                }
+            }
+        }
+        else
+        {
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                dst[0] = in[i * 6 + 0];
+                dst[1] = in[i * 6 + 2];
+                dst[2] = in[i * 6 + 4];
+                if (has_alpha)
+                {
+                    /*the color key is compared against the full 16-bit channels*/
+                    if (mode->key_defined
+                        && 256U * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+                        && 256U * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+                        && 256U * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b) dst[3] = 0;
+                    else dst[3] = 255;
+                }
+            }
+        }
+        break;
+    }
+    case LCT_PALETTE:
+    {
+        unsigned index;
+        size_t bitpos = 0;
+        for (i = 0; i < numpixels; ++i, dst += stride)
+        {
+            if (mode->bitdepth == 8) index = in[i];
+            else index = readBitsFromReversedStream(&bitpos, in, mode->bitdepth);
+
+            if (index < mode->palettesize)
+            {
+                dst[0] = mode->palette[index * 4 + 0];
+                dst[1] = mode->palette[index * 4 + 1];
+                dst[2] = mode->palette[index * 4 + 2];
+                if (has_alpha) dst[3] = mode->palette[index * 4 + 3];
+            }
+            else
+            {
+                /*This is an error according to the PNG spec, but most PNG decoders make it black instead.
+                Done here too, slightly faster due to no error handling needed.*/
+                dst[0] = dst[1] = dst[2] = 0;
+                if (has_alpha) dst[3] = 255;
+            }
+        }
+        break;
+    }
+    case LCT_GREY_ALPHA:
+    {
+        if (mode->bitdepth == 8)
+        {
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                dst[0] = dst[1] = dst[2] = in[i * 2 + 0];
+                if (has_alpha) dst[3] = in[i * 2 + 1];
+            }
+        }
+        else
+        {
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                dst[0] = dst[1] = dst[2] = in[i * 4 + 0];
+                if (has_alpha) dst[3] = in[i * 4 + 2];
+            }
+        }
+        break;
+    }
+    case LCT_RGBA:
+    {
+        if (mode->bitdepth == 8)
+        {
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                dst[0] = in[i * 4 + 0];
+                dst[1] = in[i * 4 + 1];
+                dst[2] = in[i * 4 + 2];
+                if (has_alpha) dst[3] = in[i * 4 + 3];
+            }
+        }
+        else
+        {
+            for (i = 0; i < numpixels; ++i, dst += stride)
+            {
+                dst[0] = in[i * 8 + 0];
+                dst[1] = in[i * 8 + 2];
+                dst[2] = in[i * 8 + 4];
+                if (has_alpha) dst[3] = in[i * 8 + 6];
+            }
+        }
+        break;
+    }
+    default:
+        break; /*unknown color type: buffer is left untouched*/
+    }
+}
+
+/*
+Reads the pixel with index i (y * width + x) from a raw image and returns it as
+16-bit RGBA through r, g, b and a. The given color type must be 16-bit itself;
+every channel is read as two bytes, high byte first.
+For grey and RGB, alpha is 0 when the pixel equals the mode's color key (if one
+is defined) and 65535 otherwise. Other color types than grey, RGB, grey+alpha
+and RGBA leave the outputs untouched.
+*/
+static void getPixelColorRGBA16(unsigned short* r, unsigned short* g, unsigned short* b, unsigned short* a,
+    const unsigned char* in, size_t i, const LodePNGColorMode* mode)
+{
+    const unsigned char* px;
+    unsigned red, green, blue;
+
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+        px = in + i * 2;
+        red = 256u * px[0] + px[1];
+        *r = *g = *b = red;
+        *a = (mode->key_defined && red == mode->key_r) ? 0 : 65535;
+        break;
+    case LCT_RGB:
+        px = in + i * 6;
+        red = 256u * px[0] + px[1];
+        green = 256u * px[2] + px[3];
+        blue = 256u * px[4] + px[5];
+        *r = red;
+        *g = green;
+        *b = blue;
+        *a = (mode->key_defined
+            && red == mode->key_r
+            && green == mode->key_g
+            && blue == mode->key_b) ? 0 : 65535;
+        break;
+    case LCT_GREY_ALPHA:
+        px = in + i * 4;
+        *r = *g = *b = 256u * px[0] + px[1];
+        *a = 256u * px[2] + px[3];
+        break;
+    case LCT_RGBA:
+        px = in + i * 8;
+        *r = 256u * px[0] + px[1];
+        *g = 256u * px[2] + px[3];
+        *b = 256u * px[4] + px[5];
+        *a = 256u * px[6] + px[7];
+        break;
+    default:
+        break;
+    }
+}
+
+/*
+Converts the raw image "in" (w x h pixels in color mode mode_in) to color mode
+mode_out and writes the result to "out". The caller must provide an "out"
+buffer that is large enough for the output mode.
+- If both modes are equal the raw bytes are copied unchanged.
+- If the output is a palette without entries, the palette of mode_in is used
+  instead; no new palette is ever created here.
+- 16-bit to 16-bit conversions keep the full 16-bit channel values; all other
+  conversions go through 8-bit RGBA.
+Returns 0 on success, or 82 when a pixel color is not in the output palette.
+*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+    const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+    unsigned w, unsigned h)
+{
+    size_t i;
+    ColorTree tree;
+    /*widen before multiplying: w * h in unsigned arithmetic can wrap around*/
+    size_t numpixels = (size_t)w * (size_t)h;
+    unsigned error = 0;
+
+    if (lodepng_color_mode_equal(mode_out, mode_in))
+    {
+        size_t numbytes = lodepng_get_raw_size(w, h, mode_in);
+        for (i = 0; i != numbytes; ++i) out[i] = in[i];
+        return 0;
+    }
+
+    if (mode_out->colortype == LCT_PALETTE)
+    {
+        size_t palettesize = mode_out->palettesize;
+        const unsigned char* palette = mode_out->palette;
+        size_t palsize = 1u << mode_out->bitdepth; /*most entries the output bit depth can index*/
+        /*if the user specified output palette but did not give the values, assume
+        they want the values of the input color type (assuming that one is palette).
+        Note that we never create a new palette ourselves.*/
+        if (palettesize == 0)
+        {
+            palettesize = mode_in->palettesize;
+            palette = mode_in->palette;
+        }
+        if (palettesize < palsize) palsize = palettesize;
+        /*build the color -> index lookup used by rgba8ToPixel*/
+        color_tree_init(&tree);
+        for (i = 0; i != palsize; ++i)
+        {
+            const unsigned char* p = &palette[i * 4];
+            color_tree_add(&tree, p[0], p[1], p[2], p[3], (unsigned)i);
+        }
+    }
+
+    if (mode_in->bitdepth == 16 && mode_out->bitdepth == 16)
+    {
+        for (i = 0; i != numpixels; ++i)
+        {
+            unsigned short r = 0, g = 0, b = 0, a = 0;
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+            rgba16ToPixel(out, i, mode_out, r, g, b, a);
+        }
+    }
+    else if (mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGBA)
+    {
+        /*fast path: whole image straight to 8-bit RGBA*/
+        getPixelColorsRGBA8(out, numpixels, 1, in, mode_in);
+    }
+    else if (mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGB)
+    {
+        /*fast path: whole image straight to 8-bit RGB*/
+        getPixelColorsRGBA8(out, numpixels, 0, in, mode_in);
+    }
+    else
+    {
+        unsigned char r = 0, g = 0, b = 0, a = 0;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode_in);
+            error = rgba8ToPixel(out, i, mode_out, &tree, r, g, b, a);
+            if (error) break;
+        }
+    }
+
+    /*the tree is only initialized for palette output, so only clean it up then*/
+    if (mode_out->colortype == LCT_PALETTE)
+    {
+        color_tree_cleanup(&tree);
+    }
+
+    return error;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*Resets a color profile to its starting state: no colored pixels, no color key
+(key flag and key values zero), no alpha, zero counted colors and a required
+bit depth of 1. The palette entries of the profile are not touched.*/
+void lodepng_color_profile_init(LodePNGColorProfile* profile)
+{
+    profile->numcolors = 0;
+    profile->alpha = 0;
+    profile->colored = 0;
+
+    profile->key = 0;
+    profile->key_r = 0;
+    profile->key_g = 0;
+    profile->key_b = 0;
+
+    profile->bits = 1;
+}
+
+/*function used for debug purposes with C++*/
+/*void printColorProfile(LodePNGColorProfile* p)
+{
+std::cout << "colored: " << (int)p->colored << ", ";
+std::cout << "key: " << (int)p->key << ", ";
+std::cout << "key_r: " << (int)p->key_r << ", ";
+std::cout << "key_g: " << (int)p->key_g << ", ";
+std::cout << "key_b: " << (int)p->key_b << ", ";
+std::cout << "alpha: " << (int)p->alpha << ", ";
+std::cout << "numcolors: " << (int)p->numcolors << ", ";
+std::cout << "bits: " << (int)p->bits << std::endl;
+}*/
+
+/*Returns the smallest bit depth (1, 2, 4 or 8) whose upscaled values can
+represent the given 8-bit value exactly.
+1-bit values scale to 0 and 255, 2-bit values to multiples of 85 and 4-bit
+values to multiples of 17; everything else needs the full 8 bits.*/
+static unsigned getValueRequiredBits(unsigned char value)
+{
+    unsigned needed = 8;
+    if (value % 255 == 0) needed = 1; /*only 0 and 255*/
+    else if (value % 85 == 0) needed = 2;
+    else if (value % 17 == 0) needed = 4;
+    return needed;
+}
+
+/*
+Collects statistics about the pixels of the raw image "in" (w x h pixels in
+color mode "mode") into profile: whether any pixel is colored, whether a color
+key or a real alpha channel is needed, the distinct colors seen (candidate
+palette entries) and the bits per channel required.
+profile must already have been inited (see lodepng_color_profile_init).
+It's ok to set some parameters of profile to done already.
+The key_r, key_g and key_b values of the profile are expressed as 16-bit
+values when this function returns.
+Returns the error variable, which this implementation never sets (always 0).
+*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+    const unsigned char* in, unsigned w, unsigned h,
+    const LodePNGColorMode* mode)
+{
+    unsigned error = 0;
+    size_t i;
+    ColorTree tree;
+    /*widen before multiplying: w * h in unsigned arithmetic can wrap around*/
+    size_t numpixels = (size_t)w * (size_t)h;
+
+    /*each *_done flag becomes 1 once the matching property can no longer change*/
+    unsigned colored_done = lodepng_is_greyscale_type(mode) ? 1 : 0;
+    unsigned alpha_done = lodepng_can_have_alpha(mode) ? 0 : 1;
+    unsigned numcolors_done = 0;
+    unsigned bpp = lodepng_get_bpp(mode);
+    unsigned bits_done = bpp == 1 ? 1 : 0;
+    unsigned maxnumcolors = 257;
+    unsigned sixteen = 0;
+    if (bpp <= 8) maxnumcolors = bpp == 1 ? 2 : (bpp == 2 ? 4 : (bpp == 4 ? 16 : 256));
+
+    color_tree_init(&tree);
+
+    /*Check if the 16-bit input is truly 16-bit*/
+    if (mode->bitdepth == 16)
+    {
+        unsigned short r, g, b, a;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+            if ((r & 255) != ((r >> 8) & 255) || (g & 255) != ((g >> 8) & 255) ||
+                (b & 255) != ((b >> 8) & 255) || (a & 255) != ((a >> 8) & 255)) /*first and second byte differ*/
+            {
+                sixteen = 1;
+                break;
+            }
+        }
+    }
+
+    if (sixteen)
+    {
+        unsigned short r = 0, g = 0, b = 0, a = 0;
+        profile->bits = 16;
+        bits_done = numcolors_done = 1; /*counting colors no longer useful, palette doesn't support 16-bit*/
+
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+
+            if (!colored_done && (r != g || r != b))
+            {
+                profile->colored = 1;
+                colored_done = 1;
+            }
+
+            if (!alpha_done)
+            {
+                unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+                if (a != 65535 && (a != 0 || (profile->key && !matchkey)))
+                {
+                    /*translucent pixel, or a second distinct transparent color: needs real alpha*/
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                }
+                else if (a == 0 && !profile->alpha && !profile->key)
+                {
+                    /*first fully transparent pixel: remember its color as key candidate*/
+                    profile->key = 1;
+                    profile->key_r = r;
+                    profile->key_g = g;
+                    profile->key_b = b;
+                }
+                else if (a == 65535 && profile->key && matchkey)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                }
+            }
+            if (alpha_done && numcolors_done && colored_done && bits_done) break;
+        }
+
+        /*pixels seen before the key was chosen were not compared with it: check them all*/
+        if (profile->key && !profile->alpha)
+        {
+            for (i = 0; i != numpixels; ++i)
+            {
+                getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+                if (a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    break; /*the outcome cannot change anymore*/
+                }
+            }
+        }
+    }
+    else /* < 16-bit */
+    {
+        unsigned char r = 0, g = 0, b = 0, a = 0;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+
+            if (!bits_done && profile->bits < 8)
+            {
+                /*only r is checked, < 8 bits is only relevant for greyscale*/
+                unsigned bits = getValueRequiredBits(r);
+                if (bits > profile->bits) profile->bits = bits;
+            }
+            bits_done = (profile->bits >= bpp);
+
+            if (!colored_done && (r != g || r != b))
+            {
+                profile->colored = 1;
+                colored_done = 1;
+                if (profile->bits < 8) profile->bits = 8; /*PNG has no colored modes with less than 8-bit per channel*/
+            }
+
+            if (!alpha_done)
+            {
+                unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+                if (a != 255 && (a != 0 || (profile->key && !matchkey)))
+                {
+                    /*translucent pixel, or a second distinct transparent color: needs real alpha*/
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                }
+                else if (a == 0 && !profile->alpha && !profile->key)
+                {
+                    /*first fully transparent pixel: remember its color as key candidate*/
+                    profile->key = 1;
+                    profile->key_r = r;
+                    profile->key_g = g;
+                    profile->key_b = b;
+                }
+                else if (a == 255 && profile->key && matchkey)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                }
+            }
+
+            if (!numcolors_done)
+            {
+                if (!color_tree_has(&tree, r, g, b, a))
+                {
+                    color_tree_add(&tree, r, g, b, a, profile->numcolors);
+                    /*only the first 256 distinct colors are stored as palette candidates*/
+                    if (profile->numcolors < 256)
+                    {
+                        unsigned char* p = profile->palette;
+                        unsigned n = profile->numcolors;
+                        p[n * 4 + 0] = r;
+                        p[n * 4 + 1] = g;
+                        p[n * 4 + 2] = b;
+                        p[n * 4 + 3] = a;
+                    }
+                    ++profile->numcolors;
+                    numcolors_done = profile->numcolors >= maxnumcolors;
+                }
+            }
+
+            if (alpha_done && numcolors_done && colored_done && bits_done) break;
+        }
+
+        /*pixels seen before the key was chosen were not compared with it: check them all*/
+        if (profile->key && !profile->alpha)
+        {
+            for (i = 0; i != numpixels; ++i)
+            {
+                getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+                if (a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                    break; /*the outcome cannot change anymore*/
+                }
+            }
+        }
+
+        /*make the profile's key always 16-bit for consistency - repeat each byte twice*/
+        profile->key_r += (profile->key_r << 8);
+        profile->key_g += (profile->key_g << 8);
+        profile->key_b += (profile->key_b << 8);
+    }
+
+    color_tree_cleanup(&tree);
+    return error;
+}
+
+/*
+Automatically chooses the color type that gives the smallest amount of bits in
+the output image, e.g. grey if there are only greyscale pixels, palette if
+there are at most 256 colors, ...
+image is the raw input of w x h pixels in color mode mode_in.
+mode_out should contain the user chosen color model, but is overwritten with
+the newly chosen, potentially smaller one (including palette or color key).
+Returns 0 on success, or the error reported while profiling the image or while
+adding palette entries.
+*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in)
+{
+    LodePNGColorProfile prof;
+    unsigned error = 0;
+    unsigned i, n, palettebits, palette_ok;
+    /*widen before multiplying: w * h in unsigned arithmetic can wrap around and
+    make a huge image look like one with only a few pixels*/
+    size_t numpixels = (size_t)w * (size_t)h;
+
+    lodepng_color_profile_init(&prof);
+    error = lodepng_get_color_profile(&prof, image, w, h, mode_in);
+    if (error) return error;
+    mode_out->key_defined = 0;
+
+    if (prof.key && numpixels <= 16)
+    {
+        prof.alpha = 1; /*too few pixels to justify tRNS chunk overhead*/
+        prof.key = 0;
+        if (prof.bits < 8) prof.bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+    }
+    n = prof.numcolors;
+    /*smallest palette bit depth that can index n colors*/
+    palettebits = n <= 2 ? 1 : (n <= 4 ? 2 : (n <= 16 ? 4 : 8));
+    palette_ok = n <= 256 && prof.bits <= 8;
+    if (numpixels < n * 2) palette_ok = 0; /*don't add palette overhead if image has only a few pixels*/
+    if (!prof.colored && prof.bits <= palettebits) palette_ok = 0; /*grey is less overhead*/
+
+    if (palette_ok)
+    {
+        unsigned char* p = prof.palette;
+        lodepng_palette_clear(mode_out); /*remove potential earlier palette*/
+        for (i = 0; i != prof.numcolors; ++i)
+        {
+            error = lodepng_palette_add(mode_out, p[i * 4 + 0], p[i * 4 + 1], p[i * 4 + 2], p[i * 4 + 3]);
+            if (error) break;
+        }
+
+        mode_out->colortype = LCT_PALETTE;
+        mode_out->bitdepth = palettebits;
+
+        if (mode_in->colortype == LCT_PALETTE && mode_in->palettesize >= mode_out->palettesize
+            && mode_in->bitdepth == mode_out->bitdepth)
+        {
+            /*If input should have same palette colors, keep original to preserve its order and prevent conversion*/
+            lodepng_color_mode_cleanup(mode_out);
+            lodepng_color_mode_copy(mode_out, mode_in);
+        }
+    }
+    else /*8-bit or 16-bit per channel*/
+    {
+        mode_out->bitdepth = prof.bits;
+        mode_out->colortype = prof.alpha ? (prof.colored ? LCT_RGBA : LCT_GREY_ALPHA)
+            : (prof.colored ? LCT_RGB : LCT_GREY);
+
+        if (prof.key)
+        {
+            unsigned mask = (1u << mode_out->bitdepth) - 1u; /*profile always uses 16-bit, mask converts it*/
+            mode_out->key_r = prof.key_r & mask;
+            mode_out->key_g = prof.key_g & mask;
+            mode_out->key_b = prof.key_b & mask;
+            mode_out->key_defined = 1;
+        }
+    }
+
+    return error;
+}
+
+#endif /* #ifdef LODEPNG_COMPILE_ENCODER */
+
+/*
+Paeth predictor, used by PNG filter type 4.
+Returns whichever of a, b and c is closest to the estimate a + b - c; on equal
+distances a is preferred over b, and b over c.
+The parameters are of type short, but should come from unsigned chars; the
+shorts are only needed to make the Paeth calculation correct.
+*/
+static unsigned char paethPredictor(short a, short b, short c)
+{
+    /*distance of each candidate to the estimate a + b - c*/
+    short dist_a = abs((a + b - c) - a);
+    short dist_b = abs((a + b - c) - b);
+    short dist_c = abs((a + b - c) - c);
+
+    if (dist_c < dist_a && dist_c < dist_b) return (unsigned char)c;
+    return (unsigned char)(dist_b < dist_a ? b : a);
+}
+
+/*shared values used by multiple Adam7 related functions.
+Each table has 7 entries, one per Adam7 pass. Adam7_getpassvalues derives the
+width and height of every pass from the start and delta values below.*/
+
+static const unsigned ADAM7_IX[7] = { 0, 4, 0, 2, 0, 1, 0 }; /*x start values*/
+static const unsigned ADAM7_IY[7] = { 0, 0, 4, 0, 2, 0, 1 }; /*y start values*/
+static const unsigned ADAM7_DX[7] = { 8, 8, 4, 4, 2, 2, 1 }; /*x delta values*/
+static const unsigned ADAM7_DY[7] = { 8, 8, 8, 4, 4, 2, 2 }; /*y delta values*/
+
+/*
+Outputs various dimensions and positions in the image related to the Adam7 reduced images.
+passw: output containing the width of the 7 passes
+passh: output containing the height of the 7 passes
+filter_passstart: output containing the index of the start and end of each
+ reduced image with filter bytes
+padded_passstart: output containing the index of the start and end of each
+ reduced image when without filter bytes but with padded scanlines
+passstart: output containing the index of the start and end of each reduced
+ image without padding between scanlines, but still padding between the images
+w, h: width and height of non-interlaced image
+bpp: bits per pixel
+"padded" is only relevant if bpp is less than 8 and a scanline or image does not
+end at a full byte
+The byte offsets are computed in size_t, so that large passes do not wrap
+around in unsigned arithmetic before being added to the size_t outputs.
+*/
+static void Adam7_getpassvalues(unsigned passw[7], unsigned passh[7], size_t filter_passstart[8],
+    size_t padded_passstart[8], size_t passstart[8], unsigned w, unsigned h, unsigned bpp)
+{
+    /*the passstart values have 8 values: the 8th one indicates the byte after the end of the 7th (= last) pass*/
+    unsigned i;
+
+    /*calculate width and height in pixels of each pass*/
+    for (i = 0; i != 7; ++i)
+    {
+        passw[i] = (w + ADAM7_DX[i] - ADAM7_IX[i] - 1) / ADAM7_DX[i];
+        passh[i] = (h + ADAM7_DY[i] - ADAM7_IY[i] - 1) / ADAM7_DY[i];
+        /*a pass without columns has no rows either, and the other way around*/
+        if (passw[i] == 0) passh[i] = 0;
+        if (passh[i] == 0) passw[i] = 0;
+    }
+
+    filter_passstart[0] = padded_passstart[0] = passstart[0] = 0;
+    for (i = 0; i != 7; ++i)
+    {
+        /*bytes of one scanline, padded if needed to fill a full byte at its end*/
+        size_t linebytes = ((size_t)passw[i] * bpp + 7) / 8;
+        /*if passw[i] is 0, it's 0 bytes, not 1 (no filtertype-byte)*/
+        filter_passstart[i + 1] = filter_passstart[i]
+            + ((passw[i] && passh[i]) ? (size_t)passh[i] * (1 + linebytes) : 0);
+        /*bits padded if needed to fill full byte at end of each scanline*/
+        padded_passstart[i + 1] = padded_passstart[i] + (size_t)passh[i] * linebytes;
+        /*only padded at end of reduced image*/
+        passstart[i + 1] = passstart[i] + ((size_t)passh[i] * passw[i] * bpp + 7) / 8;
+    }
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Decoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Reads the information from the PNG header (signature plus IHDR chunk) in "in"
+and stores it in state->info_png; the image dimensions are returned through w
+and h. Any info left in the state from a previous decode is reset first.
+The return value is the error code (0 when the header is valid); it is also
+stored in state->error.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h, LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    static const unsigned char png_signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+    LodePNGInfo* info = &state->info_png;
+    const unsigned char* ihdr_chunk;
+    const unsigned char* ihdr_data;
+    unsigned k;
+
+    if (in == 0 || insize == 0)
+    {
+        CERROR_RETURN_ERROR(state->error, 48); /*error: the given data is empty*/
+    }
+    if (insize < 33)
+    {
+        CERROR_RETURN_ERROR(state->error, 27); /*error: the data length is smaller than the length of a PNG header*/
+    }
+
+    /*when decoding a new PNG image, make sure all parameters created after previous decoding are reset*/
+    lodepng_info_cleanup(info);
+    lodepng_info_init(info);
+
+    for (k = 0; k != 8; ++k)
+    {
+        if (in[k] != png_signature[k])
+        {
+            CERROR_RETURN_ERROR(state->error, 28); /*error: the first 8 bytes are not the correct PNG signature*/
+        }
+    }
+
+    ihdr_chunk = in + 8;  /*length field of the first chunk*/
+    ihdr_data = in + 16;  /*chunk data, after the 4 length and 4 type bytes*/
+
+    if (lodepng_chunk_length(ihdr_chunk) != 13)
+    {
+        CERROR_RETURN_ERROR(state->error, 94); /*error: header size must be 13 bytes*/
+    }
+    if (!lodepng_chunk_type_equals(ihdr_chunk, "IHDR"))
+    {
+        CERROR_RETURN_ERROR(state->error, 29); /*error: it doesn't start with a IHDR chunk!*/
+    }
+
+    /*read the values given in the header*/
+    *w = lodepng_read32bitInt(ihdr_data);
+    *h = lodepng_read32bitInt(ihdr_data + 4);
+    info->color.bitdepth = ihdr_data[8];
+    info->color.colortype = (LodePNGColorType)ihdr_data[9];
+    info->compression_method = ihdr_data[10];
+    info->filter_method = ihdr_data[11];
+    info->interlace_method = ihdr_data[12];
+
+    if (*w == 0 || *h == 0)
+    {
+        CERROR_RETURN_ERROR(state->error, 93); /*error: width or height is 0*/
+    }
+
+    if (!state->decoder.ignore_crc)
+    {
+        /*the CRC covers the 4 type bytes and the 13 data bytes of the chunk*/
+        unsigned stored_crc = lodepng_read32bitInt(ihdr_data + 13);
+        unsigned computed_crc = lodepng_crc32(in + 12, 17);
+        if (stored_crc != computed_crc)
+        {
+            CERROR_RETURN_ERROR(state->error, 57); /*invalid CRC*/
+        }
+    }
+
+    /*error: only compression method 0 is allowed in the specification*/
+    if (info->compression_method != 0)
+    {
+        CERROR_RETURN_ERROR(state->error, 32);
+    }
+    /*error: only filter method 0 is allowed in the specification*/
+    if (info->filter_method != 0)
+    {
+        CERROR_RETURN_ERROR(state->error, 33);
+    }
+    /*error: only interlace methods 0 and 1 exist in the specification*/
+    if (info->interlace_method > 1)
+    {
+        CERROR_RETURN_ERROR(state->error, 34);
+    }
+
+    state->error = checkColorValidity(info->color.colortype, info->color.bitdepth);
+    return state->error;
+}
+
+static unsigned unfilterScanline(unsigned char* recon, const unsigned char* scanline, const unsigned char* precon,
+    size_t bytewidth, unsigned char filterType, size_t length)
+{
+    /*
+    Reverses PNG filter method 0 for one scanline.
+    recon: receives the unfiltered bytes; scanline: the filtered bytes, WITHOUT the leading filter type
+    byte (that one is passed separately in filterType)
+    precon: the previous unfiltered scanline, or 0 if there is none (it is then treated as all zeroes)
+    bytewidth: distance in bytes to the corresponding byte of the pixel to the left; the filter works
+    byte per byte (bytewidth = 1) when pixels are smaller than 1 byte
+    length: number of bytes in the scanline
+    recon and scanline MAY be the same memory address! precon must be disjoint.
+    returns 0 on success, 36 if filterType is not one of 0..4
+    */
+
+    size_t pos;
+
+    if (filterType > 4) return 36; /*error: unexisting filter type given*/
+
+    switch (filterType)
+    {
+    case 0: /*None: plain copy*/
+        for (pos = 0; pos != length; ++pos) recon[pos] = scanline[pos];
+        break;
+    case 1: /*Sub: add the byte of the pixel to the left; the first pixel has no left neighbor*/
+        for (pos = 0; pos != bytewidth; ++pos) recon[pos] = scanline[pos];
+        for (; pos < length; ++pos) recon[pos] = scanline[pos] + recon[pos - bytewidth];
+        break;
+    case 2: /*Up: add the byte above*/
+        for (pos = 0; pos != length; ++pos)
+        {
+            recon[pos] = precon ? scanline[pos] + precon[pos] : scanline[pos];
+        }
+        break;
+    case 3: /*Average: add the truncated mean of the left byte and the byte above*/
+        for (pos = 0; pos != bytewidth; ++pos)
+        {
+            recon[pos] = precon ? scanline[pos] + (precon[pos] >> 1) : scanline[pos];
+        }
+        for (; pos < length; ++pos)
+        {
+            recon[pos] = precon ? scanline[pos] + ((recon[pos - bytewidth] + precon[pos]) >> 1)
+                                : scanline[pos] + (recon[pos - bytewidth] >> 1);
+        }
+        break;
+    default: /*4, Paeth: add the Paeth prediction from left, above and upper-left*/
+        for (pos = 0; pos != bytewidth; ++pos)
+        {
+            /*paethPredictor(0, precon[pos], 0) is always precon[pos]*/
+            recon[pos] = precon ? (scanline[pos] + precon[pos]) : scanline[pos];
+        }
+        if (precon)
+        {
+            for (; pos < length; ++pos)
+            {
+                recon[pos] = (scanline[pos]
+                    + paethPredictor(recon[pos - bytewidth], precon[pos], precon[pos - bytewidth]));
+            }
+        }
+        else
+        {
+            for (; pos < length; ++pos)
+            {
+                /*paethPredictor(recon[pos - bytewidth], 0, 0) is always recon[pos - bytewidth]*/
+                recon[pos] = (scanline[pos] + recon[pos - bytewidth]);
+            }
+        }
+        break;
+    }
+    return 0;
+}
+
+static unsigned unfilter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    /*
+    For PNG filter method 0
+    this function unfilters a single image (e.g. without interlacing this is called once, with Adam7 seven times)
+    out must have enough bytes allocated already, in must have the scanlines + 1 filtertype byte per scanline
+    w and h are image dimensions or dimensions of reduced image, bpp is bits per pixel
+    in and out are allowed to be the same memory address (but aren't the same size since in has the extra filter bytes)
+    return value is error: 0 on success, otherwise the error reported by unfilterScanline
+    */
+
+    unsigned y;
+    unsigned char* prevline = 0; /*previous unfiltered scanline; there is none for the first row*/
+
+    /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+    size_t bytewidth = (bpp + 7) / 8;
+    /*bytes per scanline, excluding the filter type byte. The product is computed in size_t because
+    w * bpp does not always fit in 32 bits for very wide rows with many bits per pixel.*/
+    size_t linebytes = ((size_t)w * bpp + 7) / 8;
+
+    for (y = 0; y < h; ++y)
+    {
+        size_t outindex = linebytes * y;
+        size_t inindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+        unsigned char filterType = in[inindex];
+
+        CERROR_TRY_RETURN(unfilterScanline(&out[outindex], &in[inindex + 1], prevline, bytewidth, filterType, linebytes));
+
+        prevline = &out[outindex];
+    }
+
+    return 0;
+}
+
+/*
+Reorders the pixels of an Adam7 interlaced image into a regular non-interlaced image of size w*h.
+in: the reduced images one after another, with no padding bits between scanlines, but with padding
+between reduced images so that each reduced image starts at a byte.
+out: receives the same pixels in non-interlaced order; its size in bits is w * h * bpp.
+bpp: bits per pixel
+in is possibly bigger than out due to the padding bits between reduced images.
+out must be big enough AND must be 0 everywhere if bpp < 8, because the bit writer used in that
+case assumes a zeroed output buffer.
+NOTE: the remarks about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_deinterlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned pass;
+
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    if (bpp < 8)
+    {
+        /*pixels smaller than a byte: a bit trickier, copy bit by bit with bit pointers*/
+        unsigned olinebits = bpp * w;
+        for (pass = 0; pass != 7; ++pass)
+        {
+            unsigned ilinebits = bpp * passw[pass];
+            unsigned x, y, b;
+            for (y = 0; y < passh[pass]; ++y)
+            {
+                for (x = 0; x < passw[pass]; ++x)
+                {
+                    /*bit pointers into the in and out buffer for this pixel*/
+                    size_t ibp = (8 * passstart[pass]) + (y * ilinebits + x * bpp);
+                    size_t obp = (ADAM7_IY[pass] + y * ADAM7_DY[pass]) * olinebits + (ADAM7_IX[pass] + x * ADAM7_DX[pass]) * bpp;
+                    for (b = 0; b < bpp; ++b)
+                    {
+                        unsigned char bit = readBitFromReversedStream(&ibp, in);
+                        /*note that this function assumes the out buffer is completely 0, use setBitOfReversedStream otherwise*/
+                        setBitOfReversedStream0(&obp, out, bit);
+                    }
+                }
+            }
+        }
+        return;
+    }
+
+    /*bpp >= 8: every pixel is a whole number of bytes, so copy byte by byte*/
+    {
+        size_t bytewidth = bpp / 8;
+        for (pass = 0; pass != 7; ++pass)
+        {
+            unsigned x, y, b;
+            for (y = 0; y < passh[pass]; ++y)
+            {
+                for (x = 0; x < passw[pass]; ++x)
+                {
+                    /*position of the pixel in its reduced image, and in the full image*/
+                    size_t pixelinstart = passstart[pass] + (y * passw[pass] + x) * bytewidth;
+                    size_t pixeloutstart = ((ADAM7_IY[pass] + y * ADAM7_DY[pass]) * w + ADAM7_IX[pass] + x * ADAM7_DX[pass]) * bytewidth;
+                    for (b = 0; b < bytewidth; ++b)
+                    {
+                        out[pixeloutstart + b] = in[pixelinstart + b];
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void removePaddingBits(unsigned char* out, const unsigned char* in,
+    size_t olinebits, size_t ilinebits, unsigned h)
+{
+    /*
+    Copies h scanlines of olinebits bits each from in to out, skipping the (ilinebits - olinebits)
+    padding bits that follow every input scanline.
+    After filtering there are still padding bits if scanlines have non multiple of 8 bit amounts. They need
+    to be removed (except at last scanline of (Adam7-reduced) image) before working with pure image buffers
+    for the Adam7 code, the color convert code and the output to the user.
+    in and out are allowed to be the same buffer, in may also be higher but still overlapping; in must
+    have >= ilinebits*h bits, out must have >= olinebits*h bits, olinebits must be <= ilinebits
+    also used to move bits after earlier such operations happened, e.g. in a sequence of reduced images from Adam7
+    only useful if (ilinebits - olinebits) is a value in the range 1..7
+    */
+    size_t padding = ilinebits - olinebits;
+    size_t readpos = 0, writepos = 0; /*input and output bit pointers*/
+    unsigned row;
+
+    for (row = 0; row != h; ++row)
+    {
+        size_t remaining = olinebits;
+        while (remaining != 0)
+        {
+            unsigned char bit = readBitFromReversedStream(&readpos, in);
+            setBitOfReversedStream(&writepos, out, bit);
+            --remaining;
+        }
+        readpos += padding; /*jump over the padding at the end of this input scanline*/
+    }
+}
+
+/*
+Turns the decompressed IDAT data into the final pixel buffer.
+out must be a buffer big enough to contain the full image, and in must contain the full decompressed
+data from the IDAT chunks (with filter index bytes and possible padding bits).
+return value is error
+*/
+static unsigned postProcessScanlines(unsigned char* out, unsigned char* in,
+    unsigned w, unsigned h, const LodePNGInfo* info_png)
+{
+    /*
+    This function converts the filtered-padded-interlaced data into pure 2D image buffer with the PNG's colortype.
+    Steps:
+    *) if no Adam7: 1) unfilter 2) remove padding bits (= possible extra bits per scanline if bpp < 8)
+    *) if adam7: 1) 7x unfilter 2) 7x remove padding bits 3) Adam7_deinterlace
+    NOTE: the in buffer will be overwritten with intermediate data!
+    */
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned pass;
+    unsigned bpp = lodepng_get_bpp(&info_png->color);
+
+    if (bpp == 0) return 31; /*error: invalid colortype*/
+
+    if (info_png->interlace_method == 0)
+    {
+        unsigned linebits = w * bpp;
+        unsigned paddedbits = ((linebits + 7) / 8) * 8; /*scanline size rounded up to whole bytes*/
+
+        if (bpp < 8 && linebits != paddedbits)
+        {
+            /*scanlines end with padding bits: unfilter in place, then pack the bits into out*/
+            CERROR_TRY_RETURN(unfilter(in, in, w, h, bpp));
+            removePaddingBits(out, in, linebits, paddedbits, h);
+        }
+        else
+        {
+            /*we can immediately filter into the out buffer, no other steps needed*/
+            CERROR_TRY_RETURN(unfilter(out, in, w, h, bpp));
+        }
+        return 0;
+    }
+
+    /*interlace_method is 1 (Adam7)*/
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    for (pass = 0; pass != 7; ++pass)
+    {
+        CERROR_TRY_RETURN(unfilter(&in[padded_passstart[pass]], &in[filter_passstart[pass]],
+            passw[pass], passh[pass], bpp));
+        /*TODO: possible efficiency improvement: if in this reduced image the bits fit nicely in 1 scanline,
+        move bytes instead of bits or move not at all*/
+        if (bpp < 8)
+        {
+            /*remove padding bits in scanlines; after this there still may be padding
+            bits between the different reduced images: each reduced image still starts nicely at a byte*/
+            removePaddingBits(&in[passstart[pass]], &in[padded_passstart[pass]], passw[pass] * bpp,
+                ((passw[pass] * bpp + 7) / 8) * 8, passh[pass]);
+        }
+    }
+
+    Adam7_deinterlace(out, in, w, h, bpp);
+    return 0;
+}
+
+/*
+palette chunk (PLTE): stores the chunk's RGB triplets as RGBA entries (alpha 255) in color->palette.
+data, chunkLength: the chunk's data bytes; chunkLength / 3 entries are read
+returns 0 on success, 38 if there are more than 256 entries, 83 if allocation fails.
+The entry count is validated before anything is freed or allocated, so an oversized chunk cannot
+trigger a large allocation and leaves the existing palette untouched.
+*/
+static unsigned readChunk_PLTE(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+    size_t entries = chunkLength / 3;
+    size_t pos = 0, i;
+
+    if (entries > 256) return 38; /*error: palette too big*/
+
+    if (color->palette) lodepng_free(color->palette);
+    color->palettesize = entries;
+    color->palette = (unsigned char*)lodepng_malloc(4 * color->palettesize);
+    if (!color->palette && color->palettesize)
+    {
+        color->palettesize = 0;
+        return 83; /*alloc fail*/
+    }
+
+    for (i = 0; i != color->palettesize; ++i)
+    {
+        color->palette[4 * i + 0] = data[pos++]; /*R*/
+        color->palette[4 * i + 1] = data[pos++]; /*G*/
+        color->palette[4 * i + 2] = data[pos++]; /*B*/
+        color->palette[4 * i + 3] = 255; /*alpha*/
+    }
+
+    return 0; /* OK */
+}
+
+/*
+transparency chunk (tRNS): for palette images it fills in the alpha values of the palette entries,
+for greyscale and RGB images it defines the transparent color key.
+returns 0 on success, otherwise an error code (38, 30, 41 or 42, see below)
+*/
+static unsigned readChunk_tRNS(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+    unsigned n;
+
+    switch (color->colortype)
+    {
+    case LCT_PALETTE:
+        /*error: more alpha values given than there are palette entries*/
+        if (chunkLength > color->palettesize) return 38;
+
+        /*one alpha byte per palette entry, in palette order; entries without one keep their alpha*/
+        for (n = 0; n != chunkLength; ++n) color->palette[4 * n + 3] = data[n];
+        return 0; /* OK */
+
+    case LCT_GREY:
+        /*error: this chunk must be 2 bytes for greyscale image*/
+        if (chunkLength != 2) return 30;
+
+        color->key_defined = 1;
+        /*a single 16-bit big endian value, used for all three key channels*/
+        color->key_r = color->key_g = color->key_b = 256u * data[0] + data[1];
+        return 0; /* OK */
+
+    case LCT_RGB:
+        /*error: this chunk must be 6 bytes for RGB image*/
+        if (chunkLength != 6) return 41;
+
+        color->key_defined = 1;
+        /*three 16-bit big endian values: R, G, B*/
+        color->key_r = 256u * data[0] + data[1];
+        color->key_g = 256u * data[2] + data[3];
+        color->key_b = 256u * data[4] + data[5];
+        return 0; /* OK */
+
+    default:
+        return 42; /*error: tRNS chunk not allowed for other color models*/
+    }
+}
+
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+background color chunk (bKGD): stores the background color in info->background_r/g/b and sets
+info->background_defined. The expected chunk size depends on the color type of the image.
+returns 0 on success (also for color types not handled here), or 43, 44 or 45 if the size is wrong
+*/
+static unsigned readChunk_bKGD(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    switch (info->color.colortype)
+    {
+    case LCT_PALETTE:
+        /*error: this chunk must be 1 byte for indexed color image*/
+        if (chunkLength != 1) return 43;
+
+        info->background_defined = 1;
+        /*the single byte is stored in all three fields*/
+        info->background_r = info->background_g = info->background_b = data[0];
+        break;
+
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+        /*error: this chunk must be 2 bytes for greyscale image*/
+        if (chunkLength != 2) return 44;
+
+        info->background_defined = 1;
+        /*one 16-bit big endian value, stored in all three fields*/
+        info->background_r = info->background_g = info->background_b = 256u * data[0] + data[1];
+        break;
+
+    case LCT_RGB:
+    case LCT_RGBA:
+        /*error: this chunk must be 6 bytes for RGB image*/
+        if (chunkLength != 6) return 45;
+
+        info->background_defined = 1;
+        /*three 16-bit big endian values: R, G, B*/
+        info->background_r = 256u * data[0] + data[1];
+        info->background_g = 256u * data[2] + data[3];
+        info->background_b = 256u * data[4] + data[5];
+        break;
+
+    default:
+        /*any other color type value: the chunk is ignored*/
+        break;
+    }
+
+    return 0; /* OK */
+}
+
+/*
+text chunk (tEXt): the data is a keyword, a null separator and then the text up to the end of the chunk.
+Both parts are copied into null terminated strings and handed to lodepng_add_text.
+returns 0 on success, 89 if the keyword is not 1..79 bytes long, 83 on allocation failure, or the
+error returned by lodepng_add_text
+*/
+static unsigned readChunk_tEXt(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    char* key = 0;
+    char* str = 0;
+
+    for (;;) /*not really a loop, runs once; only used to break on error*/
+    {
+        unsigned keylength = 0, textbegin, textlength, n;
+
+        /*the keyword runs up to the first null byte or the end of the chunk*/
+        while (keylength < chunkLength && data[keylength] != 0) ++keylength;
+        /*even though it's not allowed by the standard, no error is thrown if
+        there's no null termination char, if the text is empty*/
+        if (keylength < 1 || keylength > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+        key = (char*)lodepng_malloc(keylength + 1);
+        if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        for (n = 0; n != keylength; ++n) key[n] = (char)data[n];
+        key[keylength] = 0;
+
+        textbegin = keylength + 1; /*skip keyword null terminator*/
+
+        /*everything after the separator is the text; it is empty if the separator is missing*/
+        textlength = chunkLength < textbegin ? 0 : chunkLength - textbegin;
+        str = (char*)lodepng_malloc(textlength + 1);
+        if (!str) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        for (n = 0; n != textlength; ++n) str[n] = (char)data[textbegin + n];
+        str[textlength] = 0;
+
+        error = lodepng_add_text(info, key, str);
+
+        break;
+    }
+
+    /*the temporary strings are released on every path*/
+    lodepng_free(key);
+    lodepng_free(str);
+
+    return error;
+}
+
+/*
+compressed text chunk (zTXt): the data is a keyword, a null separator, one compression method byte
+(must be 0) and then the zlib compressed text.
+The text is decompressed, null terminated and handed to lodepng_add_text together with the keyword.
+returns 0 on success, 75 if the chunk is too short to hold the separator and method byte, 89 if the
+keyword is not 1..79 bytes long, 72 for a non-zero compression method, 83 on allocation failure, or
+the error returned by zlib_decompress or lodepng_add_text
+*/
+static unsigned readChunk_zTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+    const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    unsigned i;
+
+    unsigned length, string2_begin;
+    char *key = 0;
+    ucvector decoded;
+
+    ucvector_init(&decoded);
+
+    while (!error) /*not really a while loop, only used to break on error*/
+    {
+        /*the keyword runs up to the first null byte*/
+        for (length = 0; length < chunkLength && data[length] != 0; ++length);
+        if (length + 2 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+        if (length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+        key = (char*)lodepng_malloc(length + 1);
+        if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        key[length] = 0;
+        for (i = 0; i != length; ++i) key[i] = (char)data[i];
+
+        if (data[length + 1] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+        string2_begin = length + 2; /*skip the null separator and the compression method byte*/
+        if (string2_begin > chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+
+        length = chunkLength - string2_begin;
+        /*will fail if zlib error, e.g. if length is too small*/
+        error = zlib_decompress(&decoded.data, &decoded.size,
+            (unsigned char*)(&data[string2_begin]),
+            length, zlibsettings);
+        if (error) break;
+        /*the text is used as a C string below, so appending the terminator must not fail silently*/
+        if (!ucvector_push_back(&decoded, 0)) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        error = lodepng_add_text(info, key, (char*)decoded.data);
+
+        break;
+    }
+
+    lodepng_free(key);
+    ucvector_cleanup(&decoded);
+
+    return error;
+}
+
+/*
+international text chunk (iTXt): the data is a keyword, a null separator, a compression flag byte,
+a compression method byte (must be 0), a language tag, a null separator, a translated keyword,
+a null separator and then the text, which is zlib compressed if the compression flag is non-zero.
+All four strings are handed to lodepng_add_itext.
+returns 0 on success, 30 if the chunk is shorter than 5 bytes, 75 if the chunk is too short to hold
+the fields following the keyword, 89 if the keyword is not 1..79 bytes long, 72 for a non-zero
+compression method, 83 on allocation failure, or the error returned by zlib_decompress or
+lodepng_add_itext
+*/
+static unsigned readChunk_iTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+    const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    unsigned i;
+
+    unsigned length, begin, compressed;
+    char *key = 0, *langtag = 0, *transkey = 0;
+    ucvector decoded;
+    ucvector_init(&decoded);
+
+    while (!error) /*not really a while loop, only used to break on error*/
+    {
+        /*Quick check if the chunk length isn't too small. Even without check
+        it'd still fail with other error checks below if it's too short. This just gives a different error code.*/
+        if (chunkLength < 5) CERROR_BREAK(error, 30); /*iTXt chunk too short*/
+
+        /*read the key*/
+        for (length = 0; length < chunkLength && data[length] != 0; ++length);
+        if (length + 3 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination char, corrupt?*/
+        if (length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+        key = (char*)lodepng_malloc(length + 1);
+        if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        key[length] = 0;
+        for (i = 0; i != length; ++i) key[i] = (char)data[i];
+
+        /*read the compression method*/
+        compressed = data[length + 1];
+        if (data[length + 2] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+        /*even though it's not allowed by the standard, no error is thrown if
+        there's no null termination char, if the text is empty for the next 3 texts*/
+
+        /*read the langtag*/
+        begin = length + 3;
+        length = 0;
+        for (i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+        langtag = (char*)lodepng_malloc(length + 1);
+        if (!langtag) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        langtag[length] = 0;
+        for (i = 0; i != length; ++i) langtag[i] = (char)data[begin + i];
+
+        /*read the transkey*/
+        begin += length + 1;
+        length = 0;
+        for (i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+        transkey = (char*)lodepng_malloc(length + 1);
+        if (!transkey) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        transkey[length] = 0;
+        for (i = 0; i != length; ++i) transkey[i] = (char)data[begin + i];
+
+        /*read the actual text*/
+        begin += length + 1;
+
+        /*the text is empty if the preceding fields already used up the whole chunk*/
+        length = chunkLength < begin ? 0 : chunkLength - begin;
+
+        if (compressed)
+        {
+            /*will fail if zlib error, e.g. if length is too small*/
+            error = zlib_decompress(&decoded.data, &decoded.size,
+                (unsigned char*)(&data[begin]),
+                length, zlibsettings);
+            if (error) break;
+            if (decoded.allocsize < decoded.size) decoded.allocsize = decoded.size;
+            /*the text is used as a C string below, so appending the terminator must not fail silently*/
+            if (!ucvector_push_back(&decoded, 0)) CERROR_BREAK(error, 83); /*alloc fail*/
+        }
+        else
+        {
+            if (!ucvector_resize(&decoded, length + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+
+            decoded.data[length] = 0;
+            for (i = 0; i != length; ++i) decoded.data[i] = data[begin + i];
+        }
+
+        error = lodepng_add_itext(info, key, langtag, transkey, (char*)decoded.data);
+
+        break;
+    }
+
+    lodepng_free(key);
+    lodepng_free(langtag);
+    lodepng_free(transkey);
+    ucvector_cleanup(&decoded);
+
+    return error;
+}
+
+/*
+time chunk (tIME): reads the 7 data bytes into info->time and sets info->time_defined.
+returns 0 on success, 73 if the chunk does not have exactly 7 data bytes
+*/
+static unsigned readChunk_tIME(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    if (chunkLength == 7)
+    {
+        info->time_defined = 1;
+        /*the year is a 16-bit big endian value, every other field is a single byte*/
+        info->time.year = 256u * data[0] + data[1];
+        info->time.month = data[2];
+        info->time.day = data[3];
+        info->time.hour = data[4];
+        info->time.minute = data[5];
+        info->time.second = data[6];
+
+        return 0; /* OK */
+    }
+
+    return 73; /*invalid tIME chunk size*/
+}
+
+/*
+physical pixel dimensions chunk (pHYs): reads two 32-bit big endian values into info->phys_x and
+info->phys_y, one byte into info->phys_unit, and sets info->phys_defined.
+returns 0 on success, 74 if the chunk does not have exactly 9 data bytes
+*/
+static unsigned readChunk_pHYs(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    const unsigned char* xbytes = data;     /*bytes 0..3*/
+    const unsigned char* ybytes = data + 4; /*bytes 4..7*/
+
+    if (chunkLength != 9) return 74; /*invalid pHYs chunk size*/
+
+    info->phys_defined = 1;
+    info->phys_x = 16777216u * xbytes[0] + 65536u * xbytes[1] + 256u * xbytes[2] + xbytes[3];
+    info->phys_y = 16777216u * ybytes[0] + 65536u * ybytes[1] + 256u * ybytes[2] + ybytes[3];
+    info->phys_unit = data[8];
+
+    return 0; /* OK */
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+read a PNG, the result will be in the same color type as the PNG (hence "generic")
+out: receives a newly allocated buffer with the raw pixels, or 0 if decoding stopped before the
+     buffer was allocated. It may be non-zero even when an error is reported.
+w, h: receive the image dimensions from the header
+state: decoder settings are read from state->decoder, the header and chunk information is stored in
+       state->info_png, and the resulting error code (0 = success) is stored in state->error
+in, insize: the complete PNG file data
+*/
+static void decodeGeneric(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    unsigned char IEND = 0;
+    const unsigned char* chunk;
+    size_t i;
+    ucvector idat; /*the data from idat chunks*/
+    ucvector scanlines;
+    size_t predict;
+    size_t numpixels;
+    size_t outsize = 0;
+
+    /*for unknown chunk order*/
+    unsigned unknown = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    unsigned critical_pos = 1; /*1 = after IHDR, 2 = after PLTE, 3 = after IDAT*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+    /*provide some proper output values if error will happen*/
+    *out = 0;
+
+    state->error = lodepng_inspect(w, h, state, in, insize); /*reads header and resets other parameters in state->info_png*/
+    if (state->error) return;
+
+    numpixels = *w * *h;
+
+    /*multiplication overflow*/
+    if (*h != 0 && numpixels / *h != *w) CERROR_RETURN(state->error, 92);
+    /*multiplication overflow possible further below. Allows up to 2^31-1 pixel
+    bytes with 16-bit RGBA, the rest is room for filter bytes.*/
+    if (numpixels > 268435455) CERROR_RETURN(state->error, 92);
+
+    ucvector_init(&idat);
+    chunk = &in[33]; /*first byte of the first chunk after the header*/
+
+    /*loop through the chunks, ignoring unknown chunks and stopping at IEND chunk.
+    The data of all IDAT chunks is concatenated in the idat vector*/
+    while (!IEND && !state->error)
+    {
+        unsigned chunkLength;
+        const unsigned char* data; /*the data in the chunk*/
+
+        /*error: size of the in buffer too small to contain next chunk*/
+        if ((size_t)((chunk - in) + 12) > insize || chunk < in) CERROR_BREAK(state->error, 30);
+
+        /*length of the data of the chunk, excluding the length bytes, chunk type and CRC bytes*/
+        chunkLength = lodepng_chunk_length(chunk);
+        /*error: chunk length larger than the max PNG chunk size*/
+        if (chunkLength > 2147483647) CERROR_BREAK(state->error, 63);
+
+        if ((size_t)((chunk - in) + chunkLength + 12) > insize || (chunk + chunkLength + 12) < in)
+        {
+            CERROR_BREAK(state->error, 64); /*error: size of the in buffer too small to contain next chunk*/
+        }
+
+        data = lodepng_chunk_data_const(chunk);
+
+        /*The flag describes the current chunk only. Without this reset, one unknown chunk would
+        switch off the CRC check below for every known chunk that follows it.*/
+        unknown = 0;
+
+        /*IDAT chunk, containing compressed image data*/
+        if (lodepng_chunk_type_equals(chunk, "IDAT"))
+        {
+            size_t oldsize = idat.size;
+            if (!ucvector_resize(&idat, oldsize + chunkLength)) CERROR_BREAK(state->error, 83 /*alloc fail*/);
+            for (i = 0; i != chunkLength; ++i) idat.data[oldsize + i] = data[i];
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            critical_pos = 3;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+        /*IEND chunk*/
+        else if (lodepng_chunk_type_equals(chunk, "IEND"))
+        {
+            IEND = 1;
+        }
+        /*palette chunk (PLTE)*/
+        else if (lodepng_chunk_type_equals(chunk, "PLTE"))
+        {
+            state->error = readChunk_PLTE(&state->info_png.color, data, chunkLength);
+            if (state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            critical_pos = 2;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+        /*palette transparency chunk (tRNS)*/
+        else if (lodepng_chunk_type_equals(chunk, "tRNS"))
+        {
+            state->error = readChunk_tRNS(&state->info_png.color, data, chunkLength);
+            if (state->error) break;
+        }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*background color chunk (bKGD)*/
+        else if (lodepng_chunk_type_equals(chunk, "bKGD"))
+        {
+            state->error = readChunk_bKGD(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+        /*text chunk (tEXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "tEXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_tEXt(&state->info_png, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*compressed text chunk (zTXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "zTXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_zTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*international text chunk (iTXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "iTXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_iTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*time chunk (tIME)*/
+        else if (lodepng_chunk_type_equals(chunk, "tIME"))
+        {
+            state->error = readChunk_tIME(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+        /*physical pixel dimensions chunk (pHYs)*/
+        else if (lodepng_chunk_type_equals(chunk, "pHYs"))
+        {
+            state->error = readChunk_pHYs(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        else /*it's not an implemented chunk type, so ignore it: skip over the data*/
+        {
+            /*error: unknown critical chunk (5th bit of first byte of chunk type is 0)*/
+            if (!lodepng_chunk_ancillary(chunk)) CERROR_BREAK(state->error, 69);
+
+            unknown = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            if (state->decoder.remember_unknown_chunks)
+            {
+                /*keep the raw chunk, grouped by its position relative to the critical chunks*/
+                state->error = lodepng_chunk_append(&state->info_png.unknown_chunks_data[critical_pos - 1],
+                    &state->info_png.unknown_chunks_size[critical_pos - 1], chunk);
+                if (state->error) break;
+            }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+
+        if (!state->decoder.ignore_crc && !unknown) /*check CRC if wanted, only on known chunk types*/
+        {
+            if (lodepng_chunk_check_crc(chunk)) CERROR_BREAK(state->error, 57); /*invalid CRC*/
+        }
+
+        if (!IEND) chunk = lodepng_chunk_next_const(chunk);
+    }
+
+    ucvector_init(&scanlines);
+    /*predict output size, to allocate exact size for output buffer to avoid more dynamic allocation.
+    If the decompressed size does not match the prediction, the image must be corrupt.*/
+    if (state->info_png.interlace_method == 0)
+    {
+        /*The extra *h is added because this are the filter bytes every scanline starts with*/
+        predict = lodepng_get_raw_size_idat(*w, *h, &state->info_png.color) + *h;
+    }
+    else
+    {
+        /*Adam-7 interlaced: predicted size is the sum of the 7 sub-images sizes*/
+        const LodePNGColorMode* color = &state->info_png.color;
+        predict = 0;
+        predict += lodepng_get_raw_size_idat((*w + 7) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+        if (*w > 4) predict += lodepng_get_raw_size_idat((*w + 3) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+        predict += lodepng_get_raw_size_idat((*w + 3) >> 2, (*h + 3) >> 3, color) + ((*h + 3) >> 3);
+        if (*w > 2) predict += lodepng_get_raw_size_idat((*w + 1) >> 2, (*h + 3) >> 2, color) + ((*h + 3) >> 2);
+        predict += lodepng_get_raw_size_idat((*w + 1) >> 1, (*h + 1) >> 2, color) + ((*h + 1) >> 2);
+        if (*w > 1) predict += lodepng_get_raw_size_idat((*w + 0) >> 1, (*h + 1) >> 1, color) + ((*h + 1) >> 1);
+        predict += lodepng_get_raw_size_idat((*w + 0), (*h + 0) >> 1, color) + ((*h + 0) >> 1);
+    }
+    if (!state->error && !ucvector_reserve(&scanlines, predict)) state->error = 83; /*alloc fail*/
+    if (!state->error)
+    {
+        state->error = zlib_decompress(&scanlines.data, &scanlines.size, idat.data,
+            idat.size, &state->decoder.zlibsettings);
+        if (!state->error && scanlines.size != predict) state->error = 91; /*decompressed size doesn't match prediction*/
+    }
+    ucvector_cleanup(&idat); /*the compressed data is no longer needed*/
+
+    if (!state->error)
+    {
+        outsize = lodepng_get_raw_size(*w, *h, &state->info_png.color);
+        *out = (unsigned char*)lodepng_malloc(outsize);
+        if (!*out) state->error = 83; /*alloc fail*/
+    }
+    if (!state->error)
+    {
+        /*the output starts zeroed; Adam7_deinterlace relies on that when pixels are smaller than a byte*/
+        for (i = 0; i < outsize; i++) (*out)[i] = 0;
+        state->error = postProcessScanlines(*out, scanlines.data, *w, *h, &state->info_png);
+    }
+    ucvector_cleanup(&scanlines);
+}
+
+/*
+Decodes a PNG from memory, using and updating the given state.
+out: receives a newly allocated buffer with the raw pixels. It may be non-zero even when an error is
+     returned (the caller should free it in either case).
+w, h: receive the image dimensions
+state: state->decoder holds the settings; if state->decoder.color_convert is set the pixels are
+       converted to the color mode in state->info_raw, otherwise state->info_raw is overwritten with
+       the color mode of the PNG. The error code is also stored in state->error.
+in, insize: the complete PNG file data
+return value is error (0 = success)
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    *out = 0;
+    decodeGeneric(out, w, h, state, in, insize);
+    if (state->error) return state->error;
+    if (!state->decoder.color_convert || lodepng_color_mode_equal(&state->info_raw, &state->info_png.color))
+    {
+        /*same color type, no copying or converting of data needed*/
+        /*store the info_png color settings on the info_raw so that the info_raw still reflects what colortype
+        the raw image has to the end user*/
+        if (!state->decoder.color_convert)
+        {
+            state->error = lodepng_color_mode_copy(&state->info_raw, &state->info_png.color);
+            if (state->error) return state->error;
+        }
+    }
+    else
+    {
+        /*color conversion needed; sort of copy of the data*/
+        unsigned char* data = *out;
+        size_t outsize;
+
+        /*TODO: check if this works according to the statement in the documentation: "The converter can convert
+        from greyscale input color type, to 8-bit greyscale or greyscale with alpha"*/
+        if (!(state->info_raw.colortype == LCT_RGB || state->info_raw.colortype == LCT_RGBA)
+            && !(state->info_raw.bitdepth == 8))
+        {
+            /*store the code in the state as well, like every other failure path of this function,
+            so that state->error and the return value always agree*/
+            state->error = 56; /*unsupported color mode conversion*/
+            return state->error;
+        }
+
+        outsize = lodepng_get_raw_size(*w, *h, &state->info_raw);
+        *out = (unsigned char*)lodepng_malloc(outsize);
+        if (!(*out))
+        {
+            state->error = 83; /*alloc fail*/
+        }
+        else state->error = lodepng_convert(*out, data, &state->info_raw,
+            &state->info_png.color, *w, *h);
+        /*the buffer in the PNG's own color mode is no longer needed*/
+        lodepng_free(data);
+    }
+    return state->error;
+}
+
+/*
+Decodes the PNG in `in` (insize bytes) without a caller-provided state: a temporary LodePNGState is set up
+with the requested raw colortype and bitdepth, used for one lodepng_decode call and cleaned up again.
+Returns the error code of lodepng_decode.
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in,
+    size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+{
+    LodePNGState tmp;
+    unsigned result;
+
+    lodepng_state_init(&tmp);
+    /*the desired output format is requested through the raw color mode*/
+    tmp.info_raw.bitdepth = bitdepth;
+    tmp.info_raw.colortype = colortype;
+
+    result = lodepng_decode(out, w, h, &tmp, in, insize);
+    lodepng_state_cleanup(&tmp);
+
+    return result;
+}
+
+/*Decodes the PNG in `in` with raw color type LCT_RGBA and bit depth 8; wrapper around lodepng_decode_memory.*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+    const unsigned bitdepth = 8;
+    unsigned result = lodepng_decode_memory(out, w, h, in, insize, LCT_RGBA, bitdepth);
+    return result;
+}
+
+/*Decodes the PNG in `in` with raw color type LCT_RGB and bit depth 8; wrapper around lodepng_decode_memory.*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+    const unsigned bitdepth = 8;
+    unsigned result = lodepng_decode_memory(out, w, h, in, insize, LCT_RGB, bitdepth);
+    return result;
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Loads the file `filename` into memory with lodepng_load_file and decodes it with lodepng_decode_memory
+using the given raw colortype and bitdepth. The file buffer is always freed before returning.
+Returns the first error encountered, or 0.
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename,
+    LodePNGColorType colortype, unsigned bitdepth)
+{
+    unsigned char* filedata = 0;
+    size_t filesize;
+    unsigned status = lodepng_load_file(&filedata, &filesize, filename);
+
+    if (status == 0)
+    {
+        status = lodepng_decode_memory(out, w, h, filedata, filesize, colortype, bitdepth);
+    }
+
+    lodepng_free(filedata);
+    return status;
+}
+
+/*Decodes the PNG file `filename` with raw color type LCT_RGBA and bit depth 8; wrapper around lodepng_decode_file.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+    const unsigned bitdepth = 8;
+    unsigned result = lodepng_decode_file(out, w, h, filename, LCT_RGBA, bitdepth);
+    return result;
+}
+
+/*Decodes the PNG file `filename` with raw color type LCT_RGB and bit depth 8; wrapper around lodepng_decode_file.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+    const unsigned bitdepth = 8;
+    unsigned result = lodepng_decode_file(out, w, h, filename, LCT_RGB, bitdepth);
+    return result;
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/*
+Fills *settings with the decoder defaults: color_convert on, ignore_crc off, default decompression settings
+and, when ancillary chunk support is compiled in, read_text_chunks on and remember_unknown_chunks off.
+*/
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings)
+{
+    lodepng_decompress_settings_init(&settings->zlibsettings);
+    settings->ignore_crc = 0;
+    settings->color_convert = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    settings->remember_unknown_chunks = 0;
+    settings->read_text_chunks = 1;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+
+/*
+Initializes every part of *state: the decoder and/or encoder settings (whichever are compiled in), the raw
+color mode and the PNG info.
+*/
+void lodepng_state_init(LodePNGState* state)
+{
+    LodePNGColorMode* raw = &state->info_raw;
+    LodePNGInfo* png = &state->info_png;
+
+#ifdef LODEPNG_COMPILE_DECODER
+    lodepng_decoder_settings_init(&state->decoder);
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+    lodepng_encoder_settings_init(&state->encoder);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+    lodepng_color_mode_init(raw);
+    lodepng_info_init(png);
+
+    /*non-zero start value; NOTE(review): presumably marks "nothing done yet" - confirm against the error table*/
+    state->error = 1;
+}
+
+/*Releases what the raw color mode and the PNG info of *state own; the settings are left untouched.*/
+void lodepng_state_cleanup(LodePNGState* state)
+{
+    LodePNGColorMode* raw = &state->info_raw;
+    LodePNGInfo* png = &state->info_png;
+
+    lodepng_color_mode_cleanup(raw);
+    lodepng_info_cleanup(png);
+}
+
+/*
+Makes *dest a copy of *source. dest is cleaned up first, then all plain fields are copied by struct
+assignment, and finally the raw color mode and the PNG info are re-initialized and copied with their own
+copy functions. The result of those copies is stored in dest->error; copying stops at the first failure.
+*/
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source)
+{
+    lodepng_state_cleanup(dest);
+    *dest = *source;
+
+    /*the struct assignment above also copied the members of info_raw and info_png; reset them before copying*/
+    lodepng_color_mode_init(&dest->info_raw);
+    lodepng_info_init(&dest->info_png);
+
+    dest->error = lodepng_color_mode_copy(&dest->info_raw, &source->info_raw);
+    if (!dest->error)
+    {
+        dest->error = lodepng_info_copy(&dest->info_png, &source->info_png);
+    }
+}
+
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Encoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Appends one chunk with type chunkName and `length` bytes of payload from `data` to the end of out, using
+lodepng_chunk_create.
+chunkName must be string of 4 characters
+Returns 0 on success. CERROR_TRY_RETURN is a macro defined elsewhere; presumably it returns the error code
+of the wrapped call when that is non-zero.
+*/
+static unsigned addChunk(ucvector* out, const char* chunkName, const unsigned char* data, size_t length)
+{
+    /*NOTE(review): length is narrowed from size_t to unsigned here - assumes the payload size fits; confirm*/
+    CERROR_TRY_RETURN(lodepng_chunk_create(&out->data, &out->size, (unsigned)length, chunkName, data));
+    /*lodepng_chunk_create was handed out->data and out->size directly, so bring allocsize back in sync*/
+    out->allocsize = out->size; /*fix the allocsize again*/
+    return 0;
+}
+
+/*Appends the 8 bytes of the PNG signature, aka the magic bytes, to out.*/
+static void writeSignature(ucvector* out)
+{
+    static const unsigned char magic[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+    size_t n;
+
+    for (n = 0; n != sizeof(magic); ++n) ucvector_push_back(out, magic[n]);
+}
+
+/*
+Appends an IHDR chunk to out. The payload is: width and height as 32-bit integers, then one byte each for
+bit depth, color type, compression method (0), filter method (0) and interlace method.
+Returns the error code from addChunk.
+*/
+static unsigned addChunk_IHDR(ucvector* out, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth, unsigned interlace_method)
+{
+    ucvector ihdr;
+    unsigned result;
+
+    ucvector_init(&ihdr);
+
+    /*image dimensions*/
+    lodepng_add32bitInt(&ihdr, w);
+    lodepng_add32bitInt(&ihdr, h);
+
+    /*pixel format*/
+    ucvector_push_back(&ihdr, (unsigned char)bitdepth);
+    ucvector_push_back(&ihdr, (unsigned char)colortype);
+
+    /*compression method and filter method are always 0, followed by the interlace method*/
+    ucvector_push_back(&ihdr, 0);
+    ucvector_push_back(&ihdr, 0);
+    ucvector_push_back(&ihdr, interlace_method);
+
+    result = addChunk(out, "IHDR", ihdr.data, ihdr.size);
+    ucvector_cleanup(&ihdr);
+
+    return result;
+}
+
+/*
+Appends a PLTE chunk to out. info->palette holds 4 bytes per entry; the first three bytes of every entry
+are written and the fourth (alpha) is skipped. Returns the error code from addChunk.
+*/
+static unsigned addChunk_PLTE(ucvector* out, const LodePNGColorMode* info)
+{
+    ucvector rgb;
+    unsigned result;
+    size_t entry, channel;
+
+    ucvector_init(&rgb);
+    for (entry = 0; entry != info->palettesize; ++entry)
+    {
+        /*add all channels except alpha channel*/
+        for (channel = 0; channel != 3; ++channel)
+        {
+            ucvector_push_back(&rgb, info->palette[4 * entry + channel]);
+        }
+    }
+
+    result = addChunk(out, "PLTE", rgb.data, rgb.size);
+    ucvector_cleanup(&rgb);
+
+    return result;
+}
+
+/*
+Appends a tRNS chunk to out. The payload depends on info->colortype:
+- LCT_PALETTE: the alpha byte of each palette entry, without the trailing run of entries whose alpha is 255
+- LCT_GREY: key_r as two bytes (high byte first), only if key_defined
+- LCT_RGB: key_r, key_g and key_b as two bytes each (high byte first), only if key_defined
+- any other color type: empty payload
+The chunk is added even when the payload is empty. Returns the error code from addChunk.
+*/
+static unsigned addChunk_tRNS(ucvector* out, const LodePNGColorMode* info)
+{
+    ucvector alpha;
+    unsigned result;
+    size_t i;
+
+    ucvector_init(&alpha);
+
+    switch (info->colortype)
+    {
+    case LCT_PALETTE:
+    {
+        /*the tail of palette values that all have 255 as alpha, does not have to be encoded*/
+        size_t count = info->palettesize;
+        while (count != 0 && info->palette[4 * (count - 1) + 3] == 255) --count;
+
+        /*add only alpha channel*/
+        for (i = 0; i != count; ++i) ucvector_push_back(&alpha, info->palette[4 * i + 3]);
+        break;
+    }
+    case LCT_GREY:
+        if (info->key_defined)
+        {
+            ucvector_push_back(&alpha, (unsigned char)(info->key_r >> 8));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_r & 255));
+        }
+        break;
+    case LCT_RGB:
+        if (info->key_defined)
+        {
+            ucvector_push_back(&alpha, (unsigned char)(info->key_r >> 8));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_r & 255));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_g >> 8));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_g & 255));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_b >> 8));
+            ucvector_push_back(&alpha, (unsigned char)(info->key_b & 255));
+        }
+        break;
+    default:
+        break;
+    }
+
+    result = addChunk(out, "tRNS", alpha.data, alpha.size);
+    ucvector_cleanup(&alpha);
+
+    return result;
+}
+
+/*
+Compresses datasize bytes of `data` with zlib_compress and appends the result to out as one IDAT chunk.
+Returns the error from zlib_compress, or else the one from addChunk.
+*/
+static unsigned addChunk_IDAT(ucvector* out, const unsigned char* data, size_t datasize,
+    LodePNGCompressSettings* zlibsettings)
+{
+    ucvector packed;
+    unsigned result;
+
+    ucvector_init(&packed);
+
+    /*compress with the Zlib compressor*/
+    result = zlib_compress(&packed.data, &packed.size, data, datasize, zlibsettings);
+    if (result == 0)
+    {
+        result = addChunk(out, "IDAT", packed.data, packed.size);
+    }
+
+    ucvector_cleanup(&packed);
+    return result;
+}
+
+static unsigned addChunk_IEND(ucvector* out)
+{
+    unsigned error = 0;
+    error = addChunk(out, "IEND", 0, 0);
+    return error;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*
+Appends a tEXt chunk to out. The payload is the keyword, a 0 byte and then the text, both taken from
+0-terminated C strings.
+Returns 0 on success, 89 if the keyword is not 1 to 79 characters long, or the error code from addChunk.
+The temporary buffer is released on every path, including the invalid-keyword one.
+*/
+static unsigned addChunk_tEXt(ucvector* out, const char* keyword, const char* textstring)
+{
+    unsigned error = 0;
+    size_t i;
+    ucvector text;
+    ucvector_init(&text);
+    for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)keyword[i]);
+    if (i < 1 || i > 79) error = 89; /*error: invalid keyword size*/
+
+    if (!error)
+    {
+        ucvector_push_back(&text, 0); /*0 termination char*/
+        for (i = 0; textstring[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)textstring[i]);
+        error = addChunk(out, "tEXt", text.data, text.size);
+    }
+
+    /*single exit point so the keyword bytes pushed above are never leaked*/
+    ucvector_cleanup(&text);
+
+    return error;
+}
+
+/*
+Appends a zTXt chunk to out. The payload is the keyword, a 0 byte, the compression method byte (0) and
+then textstring compressed with zlib_compress.
+Returns 0 on success, 89 if the keyword is not 1 to 79 characters long, or the error code from
+zlib_compress or addChunk. Both temporary buffers are released on every path.
+*/
+static unsigned addChunk_zTXt(ucvector* out, const char* keyword, const char* textstring,
+    LodePNGCompressSettings* zlibsettings)
+{
+    unsigned error = 0;
+    ucvector data, compressed;
+    size_t i, textsize = strlen(textstring);
+
+    ucvector_init(&data);
+    ucvector_init(&compressed);
+    for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+    if (i < 1 || i > 79) error = 89; /*error: invalid keyword size*/
+
+    if (!error)
+    {
+        ucvector_push_back(&data, 0); /*0 termination char*/
+        ucvector_push_back(&data, 0); /*compression method: 0*/
+
+        error = zlib_compress(&compressed.data, &compressed.size,
+            (unsigned char*)textstring, textsize, zlibsettings);
+    }
+    if (!error)
+    {
+        for (i = 0; i != compressed.size; ++i) ucvector_push_back(&data, compressed.data[i]);
+        error = addChunk(out, "zTXt", data.data, data.size);
+    }
+
+    /*single exit point so the keyword bytes pushed above are never leaked*/
+    ucvector_cleanup(&compressed);
+    ucvector_cleanup(&data);
+    return error;
+}
+
+/*
+Appends an iTXt chunk to out. The payload is: keyword, 0 byte, compression flag (1 if `compressed` is
+non-zero, else 0), compression method (0), language tag, 0 byte, translated keyword, 0 byte, and finally the
+text, either compressed with zlib_compress or copied as is.
+Returns 0 on success, 89 if the keyword is not 1 to 79 characters long, or the error code from
+zlib_compress or addChunk. The temporary buffer is released on every path.
+*/
+static unsigned addChunk_iTXt(ucvector* out, unsigned compressed, const char* keyword, const char* langtag,
+    const char* transkey, const char* textstring, LodePNGCompressSettings* zlibsettings)
+{
+    unsigned error = 0;
+    ucvector data;
+    size_t i, textsize = strlen(textstring);
+
+    ucvector_init(&data);
+
+    for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+    if (i < 1 || i > 79) error = 89; /*error: invalid keyword size*/
+
+    if (!error)
+    {
+        ucvector_push_back(&data, 0); /*null termination char*/
+        ucvector_push_back(&data, compressed ? 1 : 0); /*compression flag*/
+        ucvector_push_back(&data, 0); /*compression method*/
+        for (i = 0; langtag[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)langtag[i]);
+        ucvector_push_back(&data, 0); /*null termination char*/
+        for (i = 0; transkey[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)transkey[i]);
+        ucvector_push_back(&data, 0); /*null termination char*/
+
+        if (compressed)
+        {
+            ucvector compressed_data;
+            ucvector_init(&compressed_data);
+            error = zlib_compress(&compressed_data.data, &compressed_data.size,
+                (unsigned char*)textstring, textsize, zlibsettings);
+            if (!error)
+            {
+                for (i = 0; i != compressed_data.size; ++i) ucvector_push_back(&data, compressed_data.data[i]);
+            }
+            ucvector_cleanup(&compressed_data);
+        }
+        else /*not compressed*/
+        {
+            for (i = 0; textstring[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)textstring[i]);
+        }
+    }
+
+    if (!error) error = addChunk(out, "iTXt", data.data, data.size);
+    /*single exit point so the keyword bytes pushed above are never leaked*/
+    ucvector_cleanup(&data);
+    return error;
+}
+
+/*
+Appends a bKGD chunk to out. The payload depends on info->color.colortype:
+- LCT_GREY, LCT_GREY_ALPHA: background_r as two bytes, high byte first
+- LCT_RGB, LCT_RGBA: background_r, background_g and background_b as two bytes each, high byte first
+- LCT_PALETTE: the low byte of background_r, used as palette index
+Returns the error code from addChunk.
+*/
+static unsigned addChunk_bKGD(ucvector* out, const LodePNGInfo* info)
+{
+    ucvector bg;
+    unsigned result;
+
+    ucvector_init(&bg);
+
+    switch (info->color.colortype)
+    {
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+        ucvector_push_back(&bg, (unsigned char)(info->background_r >> 8));
+        ucvector_push_back(&bg, (unsigned char)(info->background_r & 255));
+        break;
+    case LCT_RGB:
+    case LCT_RGBA:
+        ucvector_push_back(&bg, (unsigned char)(info->background_r >> 8));
+        ucvector_push_back(&bg, (unsigned char)(info->background_r & 255));
+        ucvector_push_back(&bg, (unsigned char)(info->background_g >> 8));
+        ucvector_push_back(&bg, (unsigned char)(info->background_g & 255));
+        ucvector_push_back(&bg, (unsigned char)(info->background_b >> 8));
+        ucvector_push_back(&bg, (unsigned char)(info->background_b & 255));
+        break;
+    case LCT_PALETTE:
+        ucvector_push_back(&bg, (unsigned char)(info->background_r & 255)); /*palette index*/
+        break;
+    default:
+        break;
+    }
+
+    result = addChunk(out, "bKGD", bg.data, bg.size);
+    ucvector_cleanup(&bg);
+
+    return result;
+}
+
+/*
+Appends a tIME chunk to out. The 7-byte payload is the year as two bytes (high byte first) followed by
+month, day, hour, minute and second as one byte each.
+Returns 83 if the temporary buffer cannot be allocated, otherwise the error code from addChunk.
+*/
+static unsigned addChunk_tIME(ucvector* out, const LodePNGTime* time)
+{
+    unsigned result;
+    unsigned char* payload = (unsigned char*)lodepng_malloc(7);
+
+    if (payload == 0) return 83; /*alloc fail*/
+
+    /*year, high byte first*/
+    payload[0] = (unsigned char)(time->year >> 8);
+    payload[1] = (unsigned char)(time->year & 255);
+    /*date*/
+    payload[2] = (unsigned char)time->month;
+    payload[3] = (unsigned char)time->day;
+    /*time of day*/
+    payload[4] = (unsigned char)time->hour;
+    payload[5] = (unsigned char)time->minute;
+    payload[6] = (unsigned char)time->second;
+
+    result = addChunk(out, "tIME", payload, 7);
+    lodepng_free(payload);
+
+    return result;
+}
+
+/*
+Appends a pHYs chunk to out. The payload is phys_x and phys_y as 32-bit integers followed by phys_unit as
+one byte. Returns the error code from addChunk.
+*/
+static unsigned addChunk_pHYs(ucvector* out, const LodePNGInfo* info)
+{
+    ucvector phys;
+    unsigned result;
+
+    ucvector_init(&phys);
+    lodepng_add32bitInt(&phys, info->phys_x);
+    lodepng_add32bitInt(&phys, info->phys_y);
+    ucvector_push_back(&phys, info->phys_unit);
+
+    result = addChunk(out, "pHYs", phys.data, phys.size);
+    ucvector_cleanup(&phys);
+
+    return result;
+}
+
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Filters one scanline for the encoder.
+out: receives `length` filtered bytes
+scanline: the unfiltered bytes of the current row
+prevline: the unfiltered bytes of the previous row, or 0 for the first row (treated as all zeroes)
+length: number of bytes in the scanline
+bytewidth: distance in bytes to the corresponding byte of the pixel to the left
+filterType: 0 None, 1 Sub, 2 Up, 3 Average, 4 Paeth; for any other value nothing is written
+*/
+static void filterScanline(unsigned char* out, const unsigned char* scanline, const unsigned char* prevline,
+    size_t length, size_t bytewidth, unsigned char filterType)
+{
+    size_t pos;
+
+    if (filterType == 0) /*None*/
+    {
+        for (pos = 0; pos != length; ++pos) out[pos] = scanline[pos];
+    }
+    else if (filterType == 1) /*Sub*/
+    {
+        /*the first pixel has no left neighbour*/
+        for (pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+        for (pos = bytewidth; pos < length; ++pos) out[pos] = scanline[pos] - scanline[pos - bytewidth];
+    }
+    else if (filterType == 2) /*Up*/
+    {
+        if (!prevline)
+        {
+            for (pos = 0; pos != length; ++pos) out[pos] = scanline[pos];
+        }
+        else
+        {
+            for (pos = 0; pos != length; ++pos) out[pos] = scanline[pos] - prevline[pos];
+        }
+    }
+    else if (filterType == 3) /*Average*/
+    {
+        if (!prevline)
+        {
+            for (pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+            for (pos = bytewidth; pos < length; ++pos) out[pos] = scanline[pos] - (scanline[pos - bytewidth] >> 1);
+        }
+        else
+        {
+            for (pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos] - (prevline[pos] >> 1);
+            for (pos = bytewidth; pos < length; ++pos)
+            {
+                out[pos] = scanline[pos] - ((scanline[pos - bytewidth] + prevline[pos]) >> 1);
+            }
+        }
+    }
+    else if (filterType == 4) /*Paeth*/
+    {
+        if (!prevline)
+        {
+            for (pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+            /*paethPredictor(scanline[pos - bytewidth], 0, 0) is always scanline[pos - bytewidth]*/
+            for (pos = bytewidth; pos < length; ++pos) out[pos] = (scanline[pos] - scanline[pos - bytewidth]);
+        }
+        else
+        {
+            /*paethPredictor(0, prevline[pos], 0) is always prevline[pos]*/
+            for (pos = 0; pos != bytewidth; ++pos) out[pos] = (scanline[pos] - prevline[pos]);
+            for (pos = bytewidth; pos < length; ++pos)
+            {
+                out[pos] = (scanline[pos] - paethPredictor(scanline[pos - bytewidth], prevline[pos], prevline[pos - bytewidth]));
+            }
+        }
+    }
+    /*unexisting filter type given: out is left untouched*/
+}
+
+/* log2 approximation. A slight bit faster than std::log. */
+static float flog2(float f)
+{
+    float whole = 0;
+
+    /*coarse steps first: every division by 16 accounts for 4 in the result*/
+    for (; f > 32; f /= 16) whole += 4;
+    /*then halve until f is at most 2, each halving accounting for 1*/
+    for (; f > 2; f /= 2) ++whole;
+
+    /*polynomial in the reduced f, scaled by 1.442695 (approximately 1 / ln(2))*/
+    return whole + 1.442695f * (f * f * f / 3 - 3 * f * f / 2 + 3 * f - 1.83333f);
+}
+
+/*
+Applies PNG filter method 0 to an image (or to one Adam7 pass) of w x h pixels.
+in: the unfiltered scanlines, (w * bpp + 7) / 8 bytes each, stored back to back
+out: receives for every scanline one filter type byte followed by the filtered bytes, so it must have room
+for h + h * ((w * bpp + 7) / 8) bytes
+info: color mode of the pixel data, used for the bits per pixel and for the palette/low bit depth rule
+settings: provides the filter strategy (and its parameters)
+Returns 0 on success, 31 if the color mode yields 0 bits per pixel, 83 on allocation failure and 88 for an
+unknown filter strategy. The scratch rows used by the adaptive strategies are released on every path.
+*/
+static unsigned filter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h,
+    const LodePNGColorMode* info, const LodePNGEncoderSettings* settings)
+{
+    unsigned bpp = lodepng_get_bpp(info);
+    /*the width of a scanline in bytes, not including the filter type*/
+    size_t linebytes = (w * bpp + 7) / 8;
+    /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+    size_t bytewidth = (bpp + 7) / 8;
+    const unsigned char* prevline = 0;
+    unsigned x, y;
+    unsigned error = 0;
+    LodePNGFilterStrategy strategy = settings->filter_strategy;
+
+    /*
+    There is a heuristic called the minimum sum of absolute differences heuristic, suggested by the PNG standard:
+    *  If the image type is Palette, or the bit depth is smaller than 8, then do not filter the image (i.e.
+    use fixed filtering, with the filter None).
+    * (The other case) If the image type is Grayscale or RGB (with or without Alpha), and the bit depth is
+    not smaller than 8, then use adaptive filtering heuristic as follows: independently for each row, apply
+    all five filters and select the filter that produces the smallest sum of absolute values per row.
+    This heuristic is used if filter strategy is LFS_MINSUM and filter_palette_zero is true.
+
+    If filter_palette_zero is true and filter_strategy is not LFS_MINSUM, the above heuristic is followed,
+    but for "the other case", whatever strategy filter_strategy is set to instead of the minimum sum
+    heuristic is used.
+    */
+    if (settings->filter_palette_zero &&
+        (info->colortype == LCT_PALETTE || info->bitdepth < 8)) strategy = LFS_ZERO;
+
+    if (bpp == 0) return 31; /*error: invalid color type*/
+
+    if (strategy == LFS_ZERO)
+    {
+        for (y = 0; y != h; ++y)
+        {
+            size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+            size_t inindex = linebytes * y;
+            out[outindex] = 0; /*filter type byte*/
+            filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, 0);
+            prevline = &in[inindex];
+        }
+    }
+    else if (strategy == LFS_MINSUM)
+    {
+        /*adaptive filtering*/
+        size_t sum[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        size_t smallest = 0;
+        unsigned char type, bestType = 0;
+
+        for (type = 0; type != 5; ++type)
+        {
+            attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+            /*do not return here: every slot must hold a pointer or 0 so that all of them can be freed below*/
+            if (!attempt[type]) error = 83; /*alloc fail*/
+        }
+
+        if (!error)
+        {
+            for (y = 0; y != h; ++y)
+            {
+                /*try the 5 filter types*/
+                for (type = 0; type != 5; ++type)
+                {
+                    filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+
+                    /*calculate the sum of the result*/
+                    sum[type] = 0;
+                    if (type == 0)
+                    {
+                        for (x = 0; x != linebytes; ++x) sum[type] += (unsigned char)(attempt[type][x]);
+                    }
+                    else
+                    {
+                        for (x = 0; x != linebytes; ++x)
+                        {
+                            /*For differences, each byte should be treated as signed, values above 127 are negative
+                            (converted to signed char). Filtertype 0 isn't a difference though, so use unsigned there.
+                            This means filtertype 0 is almost never chosen, but that is justified.*/
+                            unsigned char s = attempt[type][x];
+                            sum[type] += s < 128 ? s : (255U - s);
+                        }
+                    }
+
+                    /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+                    if (type == 0 || sum[type] < smallest)
+                    {
+                        bestType = type;
+                        smallest = sum[type];
+                    }
+                }
+
+                prevline = &in[y * linebytes];
+
+                /*now fill the out values*/
+                out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+                for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+            }
+        }
+
+        for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+    }
+    else if (strategy == LFS_ENTROPY)
+    {
+        float sum[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        float smallest = 0;
+        unsigned type, bestType = 0;
+        unsigned count[256];
+
+        for (type = 0; type != 5; ++type)
+        {
+            attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+            /*do not return here: every slot must hold a pointer or 0 so that all of them can be freed below*/
+            if (!attempt[type]) error = 83; /*alloc fail*/
+        }
+
+        if (!error)
+        {
+            for (y = 0; y != h; ++y)
+            {
+                /*try the 5 filter types*/
+                for (type = 0; type != 5; ++type)
+                {
+                    filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+                    /*histogram of the byte values of this attempt, then its entropy estimate*/
+                    for (x = 0; x != 256; ++x) count[x] = 0;
+                    for (x = 0; x != linebytes; ++x) ++count[attempt[type][x]];
+                    ++count[type]; /*the filter type itself is part of the scanline*/
+                    sum[type] = 0;
+                    for (x = 0; x != 256; ++x)
+                    {
+                        float p = count[x] / (float)(linebytes + 1);
+                        sum[type] += count[x] == 0 ? 0 : flog2(1 / p) * p;
+                    }
+                    /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+                    if (type == 0 || sum[type] < smallest)
+                    {
+                        bestType = type;
+                        smallest = sum[type];
+                    }
+                }
+
+                prevline = &in[y * linebytes];
+
+                /*now fill the out values*/
+                out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+                for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+            }
+        }
+
+        for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+    }
+    else if (strategy == LFS_PREDEFINED)
+    {
+        for (y = 0; y != h; ++y)
+        {
+            size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+            size_t inindex = linebytes * y;
+            /*one caller-chosen filter type per row; assumes predefined_filters has at least h entries*/
+            unsigned char type = settings->predefined_filters[y];
+            out[outindex] = type; /*filter type byte*/
+            filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, type);
+            prevline = &in[inindex];
+        }
+    }
+    else if (strategy == LFS_BRUTE_FORCE)
+    {
+        /*brute force filter chooser.
+        deflate the scanline after every filter attempt to see which one deflates best.
+        This is very slow and gives only slightly smaller, sometimes even larger, result*/
+        size_t size[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        size_t smallest = 0;
+        unsigned type = 0, bestType = 0;
+        unsigned char* dummy;
+        LodePNGCompressSettings zlibsettings = settings->zlibsettings;
+        /*use fixed tree on the attempts so that the tree is not adapted to the filtertype on purpose,
+        to simulate the true case where the tree is the same for the whole image. Sometimes it gives
+        better result with dynamic tree anyway. Using the fixed tree sometimes gives worse, but in rare
+        cases better compression. It does make this a bit less slow, so it's worth doing this.*/
+        zlibsettings.btype = 1;
+        /*a custom encoder likely doesn't read the btype setting and is optimized for complete PNG
+        images only, so disable it*/
+        zlibsettings.custom_zlib = 0;
+        zlibsettings.custom_deflate = 0;
+        for (type = 0; type != 5; ++type)
+        {
+            attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+            /*do not return here: every slot must hold a pointer or 0 so that all of them can be freed below*/
+            if (!attempt[type]) error = 83; /*alloc fail*/
+        }
+        if (!error)
+        {
+            for (y = 0; y != h; ++y) /*try the 5 filter types*/
+            {
+                for (type = 0; type != 5; ++type)
+                {
+                    size_t testsize = linebytes; /*size_t so that long rows are not truncated*/
+                    /*if(testsize > 8) testsize /= 8;*/ /*it already works good enough by testing a part of the row*/
+
+                    filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+                    size[type] = 0;
+                    dummy = 0;
+                    /*only the compressed size matters; the compressed bytes are thrown away immediately*/
+                    zlib_compress(&dummy, &size[type], attempt[type], testsize, &zlibsettings);
+                    lodepng_free(dummy);
+                    /*check if this is smallest size (or if type == 0 it's the first case so always store the values)*/
+                    if (type == 0 || size[type] < smallest)
+                    {
+                        bestType = type;
+                        smallest = size[type];
+                    }
+                }
+                prevline = &in[y * linebytes];
+                out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+                for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+            }
+        }
+        for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+    }
+    else return 88; /* unknown filter strategy */
+
+    return error;
+}
+
+/*
+The opposite of the removePaddingBits function: copies h rows of ilinebits bits each from `in` to `out`,
+appending (olinebits - ilinebits) zero bits after every row so that each output row is olinebits long.
+olinebits must be >= ilinebits
+*/
+static void addPaddingBits(unsigned char* out, const unsigned char* in,
+    size_t olinebits, size_t ilinebits, unsigned h)
+{
+    size_t padding = olinebits - ilinebits;
+    size_t inbit = 0, outbit = 0; /*bit pointers*/
+    unsigned row;
+
+    for (row = 0; row != h; ++row)
+    {
+        size_t n;
+
+        /*the payload bits of this row*/
+        for (n = 0; n < ilinebits; ++n)
+        {
+            setBitOfReversedStream(&outbit, out, readBitFromReversedStream(&inbit, in));
+        }
+
+        /*the padding bits are written as 0 rather than skipped, to avoid
+        "Use of uninitialised value of size ###" warning from valgrind*/
+        for (n = 0; n != padding; ++n) setBitOfReversedStream(&outbit, out, 0);
+    }
+}
+
+/*
+Re-orders the pixels of a non-interlaced image into the seven passes (reduced images) of PNG's Adam7
+interlacing.
+in: non-interlaced image with size w*h. It has no padding bits at all, not between scanlines and not
+anywhere else, so it has the following size in bits: w * h * bpp.
+out: the same pixels, grouped pass by pass. There are no padding bits between the scanlines of a pass, but
+pass i starts at byte offset passstart[i] (from Adam7_getpassvalues), so that each reduced image starts at
+a byte. out is possibly bigger than in due to these padding bits between reduced images.
+bpp: bits per pixel
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_interlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned i;
+
+    /*only passw, passh and passstart are used below*/
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    if (bpp >= 8)
+    {
+        /*whole bytes per pixel: copy pixel by pixel with byte offsets*/
+        for (i = 0; i != 7; ++i)
+        {
+            unsigned x, y, b;
+            size_t bytewidth = bpp / 8;
+            for (y = 0; y < passh[i]; ++y)
+                for (x = 0; x < passw[i]; ++x)
+                {
+                    /*pixel (x, y) of pass i is taken from source row ADAM7_IY[i] + y * ADAM7_DY[i] and
+                    source column ADAM7_IX[i] + x * ADAM7_DX[i]*/
+                    size_t pixelinstart = ((ADAM7_IY[i] + y * ADAM7_DY[i]) * w + ADAM7_IX[i] + x * ADAM7_DX[i]) * bytewidth;
+                    size_t pixeloutstart = passstart[i] + (y * passw[i] + x) * bytewidth;
+                    for (b = 0; b < bytewidth; ++b)
+                    {
+                        out[pixeloutstart + b] = in[pixelinstart + b];
+                    }
+                }
+        }
+    }
+    else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/
+    {
+        for (i = 0; i != 7; ++i)
+        {
+            unsigned x, y, b;
+            unsigned ilinebits = bpp * passw[i]; /*bits per row of the pass being written*/
+            unsigned olinebits = bpp * w; /*bits per row of the full source image*/
+            size_t obp, ibp; /*bit pointers (for out and in buffer)*/
+            for (y = 0; y < passh[i]; ++y)
+                for (x = 0; x < passw[i]; ++x)
+                {
+                    /*same source/destination mapping as above, expressed in bits instead of bytes*/
+                    ibp = (ADAM7_IY[i] + y * ADAM7_DY[i]) * olinebits + (ADAM7_IX[i] + x * ADAM7_DX[i]) * bpp;
+                    obp = (8 * passstart[i]) + (y * ilinebits + x * bpp);
+                    for (b = 0; b < bpp; ++b)
+                    {
+                        unsigned char bit = readBitFromReversedStream(&ibp, in);
+                        setBitOfReversedStream(&obp, out, bit);
+                    }
+                }
+        }
+    }
+}
+
+/*
+Builds the uncompressed IDAT chunk data for an image. *out is allocated here with lodepng_malloc and
+*outsize receives its size in bytes; in must contain the full image of w x h pixels in the color mode
+info_png->color.
+When an error is returned, *out is not freed here and may still point to an allocated buffer.
+return value is error
+*/
+static unsigned preProcessScanlines(unsigned char** out, size_t* outsize, const unsigned char* in,
+    unsigned w, unsigned h,
+    const LodePNGInfo* info_png, const LodePNGEncoderSettings* settings)
+{
+    /*
+    This function converts the pure 2D image with the PNG's colortype, into filtered-padded-interlaced data. Steps:
+    *) if no Adam7: 1) add padding bits (= possible extra bits per scanline if bpp < 8) 2) filter
+    *) if adam7: 1) Adam7_interlace 2) 7x add padding bits 3) 7x filter
+    */
+    unsigned bpp = lodepng_get_bpp(&info_png->color);
+    unsigned error = 0;
+
+    if (info_png->interlace_method == 0)
+    {
+        *outsize = h + (h * ((w * bpp + 7) / 8)); /*image size plus an extra byte per scanline + possible padding bits*/
+        *out = (unsigned char*)lodepng_malloc(*outsize);
+        /*a null result only counts as a failure when a non-zero size was requested*/
+        if (!(*out) && (*outsize)) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            /*non multiple of 8 bits per scanline, padding bits needed per scanline*/
+            if (bpp < 8 && w * bpp != ((w * bpp + 7) / 8) * 8)
+            {
+                /*temporary copy of the image in which every scanline is padded to a whole number of bytes*/
+                unsigned char* padded = (unsigned char*)lodepng_malloc(h * ((w * bpp + 7) / 8));
+                if (!padded) error = 83; /*alloc fail*/
+                if (!error)
+                {
+                    addPaddingBits(padded, in, ((w * bpp + 7) / 8) * 8, w * bpp, h);
+                    error = filter(*out, padded, w, h, &info_png->color, settings);
+                }
+                lodepng_free(padded);
+            }
+            else
+            {
+                /*we can immediately filter into the out buffer, no other steps needed*/
+                error = filter(*out, in, w, h, &info_png->color, settings);
+            }
+        }
+    }
+    else /*interlace_method is 1 (Adam7)*/
+    {
+        unsigned passw[7], passh[7];
+        size_t filter_passstart[8], padded_passstart[8], passstart[8];
+        unsigned char* adam7; /*the image re-ordered into the seven passes, before padding and filtering*/
+
+        Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+        *outsize = filter_passstart[7]; /*image size plus an extra byte per scanline + possible padding bits*/
+        *out = (unsigned char*)lodepng_malloc(*outsize);
+        /*NOTE(review): unlike the non-interlaced branch, a null result is an error here even if *outsize is 0*/
+        if (!(*out)) error = 83; /*alloc fail*/
+
+        adam7 = (unsigned char*)lodepng_malloc(passstart[7]);
+        if (!adam7 && passstart[7]) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            unsigned i;
+
+            Adam7_interlace(adam7, in, w, h, bpp);
+            /*pad (if needed) and filter each of the seven passes separately into its slot of *out*/
+            for (i = 0; i != 7; ++i)
+            {
+                if (bpp < 8)
+                {
+                    unsigned char* padded = (unsigned char*)lodepng_malloc(padded_passstart[i + 1] - padded_passstart[i]);
+                    /*ERROR_BREAK is a macro defined elsewhere; presumably it sets error and leaves the loop*/
+                    if (!padded) ERROR_BREAK(83); /*alloc fail*/
+                    addPaddingBits(padded, &adam7[passstart[i]],
+                        ((passw[i] * bpp + 7) / 8) * 8, passw[i] * bpp, passh[i]);
+                    error = filter(&(*out)[filter_passstart[i]], padded,
+                        passw[i], passh[i], &info_png->color, settings);
+                    lodepng_free(padded);
+                }
+                else
+                {
+                    /*no padding step for bpp >= 8; NOTE(review): this indexes adam7 with padded_passstart
+                    while the branch above uses passstart - presumably the two are equal when bpp >= 8; confirm
+                    in Adam7_getpassvalues*/
+                    error = filter(&(*out)[filter_passstart[i]], &adam7[padded_passstart[i]],
+                        passw[i], passh[i], &info_png->color, settings);
+                }
+
+                if (error) break;
+            }
+        }
+
+        lodepng_free(adam7);
+    }
+
+    return error;
+}
+
+/*
+palette must have 4 * palettesize bytes allocated, and given in format RGBARGBARGBARGBA...
+returns 0 if the palette is opaque,
+returns 1 if the palette has a single color with alpha 0 ==> color key
+returns 2 if the palette is semi-translucent.
+A second entry with alpha 0, or an opaque entry with the same RGB as the alpha 0 entry, also gives 2,
+because a single color key cannot express those palettes.
+*/
+static unsigned getPaletteTranslucency(const unsigned char* palette, size_t palettesize)
+{
+    size_t i;
+    size_t keyindex = 0; /*position of the entry that became the key, only meaningful once key is set*/
+    unsigned key = 0;
+    unsigned r = 0, g = 0, b = 0; /*the value of the color with alpha 0, so long as color keying is possible*/
+    for (i = 0; i != palettesize; ++i)
+    {
+        /*the rescan must step over the key entry itself: its alpha of 0 would otherwise be
+        reported as semi-translucent, so that 1 could never be returned*/
+        if (key && i == keyindex) continue;
+
+        if (!key && palette[4 * i + 3] == 0)
+        {
+            r = palette[4 * i + 0]; g = palette[4 * i + 1]; b = palette[4 * i + 2];
+            key = 1;
+            keyindex = i;
+            i = (size_t)(-1); /*restart from beginning, to detect earlier opaque colors with key's value*/
+        }
+        else if (palette[4 * i + 3] != 255) return 2;
+        /*when key, no opaque RGB may have key's RGB*/
+        else if (key && r == palette[i * 4 + 0] && g == palette[i * 4 + 1] && b == palette[i * 4 + 2]) return 2;
+    }
+    return key;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+Appends every chunk stored back to back in data (datasize bytes in total) to the output vector.
+Returns 0 on success, or the first error reported by lodepng_chunk_append.
+*/
+static unsigned addUnknownChunks(ucvector* out, unsigned char* data, size_t datasize)
+{
+    unsigned char* chunk;
+    for (chunk = data; (size_t)(chunk - data) < datasize; chunk = lodepng_chunk_next(chunk))
+    {
+        CERROR_TRY_RETURN(lodepng_chunk_append(&out->data, &out->size, chunk));
+        /*the append worked on data and size directly, so bring allocsize back in line with them*/
+        out->allocsize = out->size;
+    }
+    return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Encodes raw pixel data into a complete PNG stream in memory.
+out: receives the output buffer (it is set to 0 first); once the input checks have passed it receives
+whatever was written so far, also when an error is returned, so the caller should hand it to
+lodepng_free in every case
+outsize: receives the size in bytes of that output
+image: the raw pixels, w * h of them, in the color mode described by state->info_raw
+state: encoder settings, the wanted PNG color mode and the ancillary chunk data; state->error receives the result
+return value is error, the same value as state->error
+*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGState* state)
+{
+    LodePNGInfo info;
+    ucvector outv;
+    unsigned char* data = 0; /*uncompressed version of the IDAT chunk data*/
+    size_t datasize = 0;
+
+    /*provide some proper output values if error will happen*/
+    *out = 0;
+    *outsize = 0;
+    state->error = 0;
+
+    /*check input values validity*/
+    if ((state->info_png.color.colortype == LCT_PALETTE || state->encoder.force_palette)
+        && (state->info_png.color.palettesize == 0 || state->info_png.color.palettesize > 256))
+    {
+        CERROR_RETURN_ERROR(state->error, 68); /*invalid palette size, it is only allowed to be 1-256*/
+    }
+    if (state->encoder.zlibsettings.btype > 2)
+    {
+        CERROR_RETURN_ERROR(state->error, 61); /*error: unexisting btype*/
+    }
+    if (state->info_png.interlace_method > 1)
+    {
+        CERROR_RETURN_ERROR(state->error, 71); /*error: unexisting interlace mode*/
+    }
+    state->error = checkColorValidity(state->info_png.color.colortype, state->info_png.color.bitdepth);
+    if (state->error) return state->error; /*error: unexisting color type given*/
+    state->error = checkColorValidity(state->info_raw.colortype, state->info_raw.bitdepth);
+    if (state->error) return state->error; /*error: unexisting color type given*/
+
+    /* color convert and compute scanline filter types */
+    lodepng_info_init(&info);
+    lodepng_info_copy(&info, &state->info_png);
+    if (state->encoder.auto_convert)
+    {
+        state->error = lodepng_auto_choose_color(&info.color, image, w, h, &state->info_raw);
+    }
+    if (!state->error)
+    {
+        if (!lodepng_color_mode_equal(&state->info_raw, &info.color))
+        {
+            unsigned char* converted;
+            /*every factor is widened before multiplying, so that w * h cannot wrap around in unsigned*/
+            size_t size = ((size_t)w * (size_t)h * (size_t)lodepng_get_bpp(&info.color) + 7) / 8;
+
+            converted = (unsigned char*)lodepng_malloc(size);
+            if (!converted && size) state->error = 83; /*alloc fail*/
+            if (!state->error)
+            {
+                state->error = lodepng_convert(converted, image, &info.color, &state->info_raw, w, h);
+            }
+            /*keep the result: a failed allocation or filter must stop the encoding below*/
+            if (!state->error) state->error = preProcessScanlines(&data, &datasize, converted, w, h, &info, &state->encoder);
+            lodepng_free(converted);
+        }
+        else state->error = preProcessScanlines(&data, &datasize, image, w, h, &info, &state->encoder);
+    }
+
+    /* output all PNG chunks */
+    ucvector_init(&outv);
+    while (!state->error) /*while only executed once, to break on error*/
+    {
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        size_t i;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*write signature and chunks*/
+        writeSignature(&outv);
+        /*IHDR*/
+        addChunk_IHDR(&outv, w, h, info.color.colortype, info.color.bitdepth, info.interlace_method);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*unknown chunks between IHDR and PLTE*/
+        if (info.unknown_chunks_data[0])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[0], info.unknown_chunks_size[0]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*PLTE*/
+        if (info.color.colortype == LCT_PALETTE)
+        {
+            addChunk_PLTE(&outv, &info.color);
+        }
+        if (state->encoder.force_palette && (info.color.colortype == LCT_RGB || info.color.colortype == LCT_RGBA))
+        {
+            addChunk_PLTE(&outv, &info.color);
+        }
+        /*tRNS*/
+        if (info.color.colortype == LCT_PALETTE && getPaletteTranslucency(info.color.palette, info.color.palettesize) != 0)
+        {
+            addChunk_tRNS(&outv, &info.color);
+        }
+        if ((info.color.colortype == LCT_GREY || info.color.colortype == LCT_RGB) && info.color.key_defined)
+        {
+            addChunk_tRNS(&outv, &info.color);
+        }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*bKGD (must come between PLTE and the IDAT chunks)*/
+        if (info.background_defined) addChunk_bKGD(&outv, &info);
+        /*pHYs (must come before the IDAT chunks)*/
+        if (info.phys_defined) addChunk_pHYs(&outv, &info);
+
+        /*unknown chunks between PLTE and IDAT*/
+        if (info.unknown_chunks_data[1])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[1], info.unknown_chunks_size[1]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*IDAT (multiple IDAT chunks must be consecutive)*/
+        state->error = addChunk_IDAT(&outv, data, datasize, &state->encoder.zlibsettings);
+        if (state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*tIME*/
+        if (info.time_defined) addChunk_tIME(&outv, &info.time);
+        /*tEXt and/or zTXt*/
+        for (i = 0; i != info.text_num; ++i)
+        {
+            if (strlen(info.text_keys[i]) > 79)
+            {
+                state->error = 66; /*text chunk too large*/
+                break;
+            }
+            if (strlen(info.text_keys[i]) < 1)
+            {
+                state->error = 67; /*text chunk too small*/
+                break;
+            }
+            if (state->encoder.text_compression)
+            {
+                addChunk_zTXt(&outv, info.text_keys[i], info.text_strings[i], &state->encoder.zlibsettings);
+            }
+            else
+            {
+                addChunk_tEXt(&outv, info.text_keys[i], info.text_strings[i]);
+            }
+        }
+        /*LodePNG version id in text chunk*/
+        if (state->encoder.add_id)
+        {
+            unsigned alread_added_id_text = 0;
+            for (i = 0; i != info.text_num; ++i)
+            {
+                if (!strcmp(info.text_keys[i], "LodePNG"))
+                {
+                    alread_added_id_text = 1;
+                    break;
+                }
+            }
+            if (alread_added_id_text == 0)
+            {
+                addChunk_tEXt(&outv, "LodePNG", LODEPNG_VERSION_STRING); /*it's shorter as tEXt than as zTXt chunk*/
+            }
+        }
+        /*iTXt*/
+        for (i = 0; i != info.itext_num; ++i)
+        {
+            if (strlen(info.itext_keys[i]) > 79)
+            {
+                state->error = 66; /*text chunk too large*/
+                break;
+            }
+            if (strlen(info.itext_keys[i]) < 1)
+            {
+                state->error = 67; /*text chunk too small*/
+                break;
+            }
+            addChunk_iTXt(&outv, state->encoder.text_compression,
+                info.itext_keys[i], info.itext_langtags[i], info.itext_transkeys[i], info.itext_strings[i],
+                &state->encoder.zlibsettings);
+        }
+
+        /*unknown chunks between IDAT and IEND*/
+        if (info.unknown_chunks_data[2])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[2], info.unknown_chunks_size[2]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        addChunk_IEND(&outv);
+
+        break; /*this isn't really a while loop; no error happened so break out now!*/
+    }
+
+    lodepng_info_cleanup(&info);
+    lodepng_free(data);
+    /*instead of cleaning the vector up, give it to the output*/
+    *out = outv.data;
+    *outsize = outv.size;
+
+    return state->error;
+}
+
+/*
+Encodes an image to PNG in memory, using a temporary state in which both the raw input and the
+requested PNG color mode are set to colortype and bitdepth.
+Returns the error code left in that state by lodepng_encode.
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize, const unsigned char* image,
+    unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+    LodePNGState state;
+    unsigned result;
+
+    lodepng_state_init(&state);
+
+    /*the same color mode on the input side and on the PNG side*/
+    state.info_png.color.colortype = colortype;
+    state.info_png.color.bitdepth = bitdepth;
+    state.info_raw.colortype = colortype;
+    state.info_raw.bitdepth = bitdepth;
+
+    lodepng_encode(out, outsize, image, w, h, &state);
+
+    /*read the error out before the state gets cleaned up*/
+    result = state.error;
+    lodepng_state_cleanup(&state);
+    return result;
+}
+
+/*Shorthand for lodepng_encode_memory with color type LCT_RGBA and bit depth 8.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{
+    unsigned error = lodepng_encode_memory(out, outsize, image, w, h, LCT_RGBA, 8);
+    return error;
+}
+
+/*Shorthand for lodepng_encode_memory with color type LCT_RGB and bit depth 8.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{
+    unsigned error = lodepng_encode_memory(out, outsize, image, w, h, LCT_RGB, 8);
+    return error;
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Encodes the image to PNG in memory and, if that succeeded, writes the result to filename.
+The temporary PNG buffer is released in either case. Returns the error of the first step that failed, 0 if none did.
+*/
+unsigned lodepng_encode_file(const char* filename, const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth)
+{
+    unsigned char* png;
+    size_t pngsize;
+    unsigned error;
+
+    error = lodepng_encode_memory(&png, &pngsize, image, w, h, colortype, bitdepth);
+    if (error == 0)
+    {
+        error = lodepng_save_file(png, pngsize, filename);
+    }
+
+    lodepng_free(png);
+    return error;
+}
+
+/*Shorthand for lodepng_encode_file with color type LCT_RGBA and bit depth 8.*/
+unsigned lodepng_encode32_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{
+    unsigned error = lodepng_encode_file(filename, image, w, h, LCT_RGBA, 8);
+    return error;
+}
+
+/*Shorthand for lodepng_encode_file with color type LCT_RGB and bit depth 8.*/
+unsigned lodepng_encode24_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{
+    unsigned error = lodepng_encode_file(filename, image, w, h, LCT_RGB, 8);
+    return error;
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/*
+Fills the encoder settings with their defaults: automatic color conversion on, palette not forced,
+LFS_MINSUM filter strategy with filter_palette_zero enabled, no predefined filters and, when
+ancillary chunks are compiled in, compressed text chunks without the LodePNG id chunk.
+*/
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings)
+{
+    /*color handling*/
+    settings->auto_convert = 1;
+    settings->force_palette = 0;
+
+    /*scanline filtering*/
+    settings->filter_strategy = LFS_MINSUM;
+    settings->filter_palette_zero = 1;
+    settings->predefined_filters = 0;
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*text chunks*/
+    settings->text_compression = 1;
+    settings->add_id = 0;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+    /*the nested zlib settings get their own defaults*/
+    lodepng_compress_settings_init(&settings->zlibsettings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*
+Looks up the English description that belongs to a numerical error code. The table below doubles
+as the documentation of all the error codes; a code that is not listed yields "unknown error code".
+*/
+const char* lodepng_error_text(unsigned code)
+{
+    static const struct { unsigned code; const char* text; } messages[] =
+    {
+        { 0, "no error, everything went ok" },
+        { 1, "nothing done yet" }, /*the Encoder/Decoder has done nothing yet, error checking makes no sense yet*/
+        { 10, "end of input memory reached without huffman end code" }, /*while huffman decoding*/
+        { 11, "error in code tree made it jump outside of huffman tree" }, /*while huffman decoding*/
+        { 13, "problem while processing dynamic deflate block" },
+        { 14, "problem while processing dynamic deflate block" },
+        { 15, "problem while processing dynamic deflate block" },
+        { 16, "unexisting code while processing dynamic deflate block" },
+        { 17, "end of out buffer memory reached while inflating" },
+        { 18, "invalid distance code while inflating" },
+        { 19, "end of out buffer memory reached while inflating" },
+        { 20, "invalid deflate block BTYPE encountered while decoding" },
+        { 21, "NLEN is not ones complement of LEN in a deflate block" },
+        /*end of out buffer memory reached while inflating:
+        This can happen if the inflated deflate data is longer than the amount of bytes required to fill up
+        all the pixels of the image, given the color depth and image dimensions. Something that doesn't
+        happen in a normal, well encoded, PNG image.*/
+        { 22, "end of out buffer memory reached while inflating" },
+        { 23, "end of in buffer memory reached while inflating" },
+        { 24, "invalid FCHECK in zlib header" },
+        { 25, "invalid compression method in zlib header" },
+        { 26, "FDICT encountered in zlib header while it's not used for PNG" },
+        { 27, "PNG file is smaller than a PNG header" },
+        /*Checks the magic file header, the first 8 bytes of the PNG file*/
+        { 28, "incorrect PNG signature, it's no PNG or corrupted" },
+        { 29, "first chunk is not the header chunk" },
+        { 30, "chunk length too large, chunk broken off at end of file" },
+        { 31, "illegal PNG color type or bpp" },
+        { 32, "illegal PNG compression method" },
+        { 33, "illegal PNG filter method" },
+        { 34, "illegal PNG interlace method" },
+        { 35, "chunk length of a chunk is too large or the chunk too small" },
+        { 36, "illegal PNG filter type encountered" },
+        { 37, "illegal bit depth for this color type given" },
+        { 38, "the palette is too big" }, /*more than 256 colors*/
+        { 39, "more palette alpha values given in tRNS chunk than there are colors in the palette" },
+        { 40, "tRNS chunk has wrong size for greyscale image" },
+        { 41, "tRNS chunk has wrong size for RGB image" },
+        { 42, "tRNS chunk appeared while it was not allowed for this color type" },
+        { 43, "bKGD chunk has wrong size for palette image" },
+        { 44, "bKGD chunk has wrong size for greyscale image" },
+        { 45, "bKGD chunk has wrong size for RGB image" },
+        { 48, "empty input buffer given to decoder. Maybe caused by non-existing file?" },
+        { 49, "jumped past memory while generating dynamic huffman tree" },
+        { 50, "jumped past memory while generating dynamic huffman tree" },
+        { 51, "jumped past memory while inflating huffman block" },
+        { 52, "jumped past memory while inflating" },
+        { 53, "size of zlib data too small" },
+        { 54, "repeat symbol in tree while there was no value symbol yet" },
+        /*jumped past tree while generating huffman tree, this could be when the
+        tree will have more leaves than symbols after generating it out of the
+        given lengths. They call this an oversubscribed dynamic bit lengths tree in zlib.*/
+        { 55, "jumped past tree while generating huffman tree" },
+        { 56, "given output image colortype or bitdepth not supported for color conversion" },
+        { 57, "invalid CRC encountered (checking CRC can be disabled)" },
+        { 58, "invalid ADLER32 encountered (checking ADLER32 can be disabled)" },
+        { 59, "requested color conversion not supported" },
+        { 60, "invalid window size given in the settings of the encoder (must be 0-32768)" },
+        { 61, "invalid BTYPE given in the settings of the encoder (only 0, 1 and 2 are allowed)" },
+        /*LodePNG leaves the choice of RGB to greyscale conversion formula to the user.*/
+        { 62, "conversion from color to greyscale not supported" },
+        { 63, "length of a chunk too long, max allowed for PNG is 2147483647 bytes per chunk" }, /*(2^31-1)*/
+        /*this would result in the inability of a deflated block to ever contain an end code. It must be at least 1.*/
+        { 64, "the length of the END symbol 256 in the Huffman tree is 0" },
+        { 66, "the length of a text chunk keyword given to the encoder is longer than the maximum of 79 bytes" },
+        { 67, "the length of a text chunk keyword given to the encoder is smaller than the minimum of 1 byte" },
+        { 68, "tried to encode a PLTE chunk with a palette that has less than 1 or more than 256 colors" },
+        { 69, "unknown chunk type with 'critical' flag encountered by the decoder" },
+        { 71, "unexisting interlace mode given to encoder (must be 0 or 1)" },
+        { 72, "while decoding, unexisting compression method encountering in zTXt or iTXt chunk (it must be 0)" },
+        { 73, "invalid tIME chunk size" },
+        { 74, "invalid pHYs chunk size" },
+        /*length could be wrong, or data chopped off*/
+        { 75, "no null termination char found while decoding text chunk" },
+        { 76, "iTXt chunk too short to contain required bytes" },
+        { 77, "integer overflow in buffer size" },
+        { 78, "failed to open file for reading" }, /*file doesn't exist or couldn't be opened for reading*/
+        { 79, "failed to open file for writing" },
+        { 80, "tried creating a tree of 0 symbols" },
+        { 81, "lazy matching at pos 0 is impossible" },
+        { 82, "color conversion to palette requested while a color isn't in palette" },
+        { 83, "memory allocation failed" },
+        { 84, "given image too small to contain all pixels to be encoded" },
+        { 86, "impossible offset in lz77 encoding (internal bug)" },
+        { 87, "must provide custom zlib function pointer if LODEPNG_COMPILE_ZLIB is not defined" },
+        { 88, "invalid filter strategy given for LodePNGEncoderSettings.filter_strategy" },
+        { 89, "text chunk keyword too short or long: must have size 1-79" },
+        /*the windowsize in the LodePNGCompressSettings. Requiring POT(==> & instead of %) makes encoding 12% faster.*/
+        { 90, "windowsize must be a power of two" },
+        { 91, "invalid decompressed idat size" },
+        { 92, "too many pixels, not supported" },
+        { 93, "zero width or height is invalid" },
+        { 94, "header chunk must have a size of 13 bytes" }
+    };
+    size_t i;
+
+    /*every code occurs once in the table, so the first match is the only one*/
+    for (i = 0; i != sizeof(messages) / sizeof(messages[0]); ++i)
+    {
+        if (messages[i].code == code) return messages[i].text;
+    }
+    return "unknown error code";
+}
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // C++ Wrapper                                                          // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Reads the whole file into buffer, which is resized to the file size.
+    Returns 78 when lodepng_filesize reports a negative size, 0 for an empty file,
+    otherwise the result of lodepng_buffer_file.
+    */
+    unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename)
+    {
+        const char* path = filename.c_str();
+        long size = lodepng_filesize(path);
+        if (size < 0) return 78;
+
+        buffer.resize((size_t)size);
+        /*nothing to read, and &buffer[0] must not be taken on an empty vector*/
+        if (size == 0) return 0;
+
+        return lodepng_buffer_file(&buffer[0], (size_t)size, path);
+    }
+
+    /*Writes the buffer to the file, replacing its contents rather than appending. An empty buffer is passed on as a null pointer with size 0.*/
+    unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename)
+    {
+        const unsigned char* bytes = buffer.empty() ? 0 : &buffer[0];
+        return lodepng_save_file(bytes, buffer.size(), filename.c_str());
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+    /*
+    Runs zlib_decompress on the input and appends whatever it produced to the end of out.
+    The temporary C buffer is released here. Returns the error code of zlib_decompress.
+    */
+    unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGDecompressSettings& settings)
+    {
+        unsigned char* inflated = 0;
+        size_t inflatedsize = 0;
+        const unsigned error = zlib_decompress(&inflated, &inflatedsize, in, insize, &settings);
+
+        /*no buffer came back: nothing to append and nothing to free*/
+        if (!inflated) return error;
+
+        out.insert(out.end(), inflated, inflated + inflatedsize);
+        lodepng_free(inflated);
+        return error;
+    }
+
+    /*Vector flavor of decompress: forwards to the pointer overload, handing over a null pointer for an empty input.*/
+    unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGDecompressSettings& settings)
+    {
+        const unsigned char* data = in.empty() ? 0 : &in[0];
+        return decompress(out, data, in.size(), settings);
+    }
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /*
+    Runs zlib_compress on the input and appends whatever it produced to the end of out.
+    The temporary C buffer is released here. Returns the error code of zlib_compress.
+    */
+    unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGCompressSettings& settings)
+    {
+        unsigned char* deflated = 0;
+        size_t deflatedsize = 0;
+        const unsigned error = zlib_compress(&deflated, &deflatedsize, in, insize, &settings);
+
+        /*no buffer came back: nothing to append and nothing to free*/
+        if (!deflated) return error;
+
+        out.insert(out.end(), deflated, deflated + deflatedsize);
+        lodepng_free(deflated);
+        return error;
+    }
+
+    /*Vector flavor of compress: forwards to the pointer overload, handing over a null pointer for an empty input.*/
+    unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGCompressSettings& settings)
+    {
+        const unsigned char* data = in.empty() ? 0 : &in[0];
+        return compress(out, data, in.size(), settings);
+    }
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+
+
+#ifdef LODEPNG_COMPILE_PNG
+
+    /*Default constructor: hands the object to lodepng_state_init.*/
+    State::State()
+    {
+        LodePNGState* self = this;
+        lodepng_state_init(self);
+    }
+
+    /*Copy constructor: initializes the object first, then lets lodepng_state_copy copy other into it.*/
+    State::State(const State& other)
+    {
+        LodePNGState* self = this;
+        lodepng_state_init(self);
+        lodepng_state_copy(self, &other);
+    }
+
+    /*Destructor: hands the object to lodepng_state_cleanup.*/
+    State::~State()
+    {
+        LodePNGState* self = this;
+        lodepng_state_cleanup(self);
+    }
+
+    /*
+    Copy assignment: replaces the contents of this state with a copy of other via lodepng_state_copy.
+    Assigning a state to itself is a no-op, so that lodepng_state_copy is never called with the same
+    object as both source and destination.
+    */
+    State& State::operator=(const State& other)
+    {
+        if (this != &other)
+        {
+            lodepng_state_copy(this, &other);
+        }
+        return *this;
+    }
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+    /*
+    Decodes a PNG from memory into raw pixels of the requested color type and bit depth.
+    out: on success the decoded pixels are appended to its end; it is left untouched on error
+    w, h: receive the image dimensions
+    in, insize: the PNG data
+    Returns the error code of lodepng_decode_memory.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const unsigned char* in,
+        size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+    {
+        unsigned char* buffer = 0;
+        unsigned error = lodepng_decode_memory(&buffer, &w, &h, in, insize, colortype, bitdepth);
+        if (buffer && !error)
+        {
+            /*a temporary state, only used to describe the raw color mode for the size computation*/
+            State state;
+            state.info_raw.colortype = colortype;
+            state.info_raw.bitdepth = bitdepth;
+            size_t buffersize = lodepng_get_raw_size(w, h, &state.info_raw);
+            out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+        }
+        /*released on every path, so a buffer handed back together with an error is not leaked*/
+        lodepng_free(buffer);
+        return error;
+    }
+
+    /*
+    Vector flavor of decode: forwards to the pointer overload, handing over a null pointer for an empty input.
+    The input size is passed on as size_t, without narrowing it to unsigned first.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::vector<unsigned char>& in, LodePNGColorType colortype, unsigned bitdepth)
+    {
+        return decode(out, w, h, in.empty() ? 0 : &in[0], in.size(), colortype, bitdepth);
+    }
+
+    /*
+    Decodes a PNG from memory with the settings held in state. On success the pixels, whose size
+    follows from state.info_raw, are appended to out. The temporary C buffer is always released.
+    Returns the error code of lodepng_decode.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const unsigned char* in, size_t insize)
+    {
+        unsigned char* pixels = NULL;
+        const unsigned error = lodepng_decode(&pixels, &w, &h, &state, in, insize);
+
+        if (!error && pixels)
+        {
+            unsigned char* last = pixels + lodepng_get_raw_size(w, h, &state.info_raw);
+            out.insert(out.end(), pixels, last);
+        }
+
+        lodepng_free(pixels);
+        return error;
+    }
+
+    /*Vector flavor of the State based decode: forwards to the pointer overload, handing over a null pointer for an empty input.*/
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const std::vector<unsigned char>& in)
+    {
+        const unsigned char* data = in.empty() ? 0 : &in[0];
+        return decode(out, w, h, state, data, in.size());
+    }
+
+#ifdef LODEPNG_COMPILE_DISK
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const std::string& filename,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        std::vector<unsigned char> buffer;
+        unsigned error = load_file(buffer, filename);
+        if (error) return error;
+        return decode(out, w, h, buffer, colortype, bitdepth);
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /*
+    Encodes raw pixels to PNG with lodepng_encode_memory and appends the result to out.
+    Whatever buffer comes back is appended and then released, independent of the error code.
+    Returns the error code of lodepng_encode_memory.
+    */
+    unsigned encode(std::vector<unsigned char>& out, const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        unsigned char* png;
+        size_t pngsize;
+        const unsigned error = lodepng_encode_memory(&png, &pngsize, in, w, h, colortype, bitdepth);
+
+        if (!png) return error;
+
+        out.insert(out.end(), png, png + pngsize);
+        lodepng_free(png);
+        return error;
+    }
+
+    /*
+    Vector flavor of encode. Returns 84 when in holds fewer bytes than an image of w x h pixels in
+    the given color type and bit depth needs, otherwise forwards to the pointer overload.
+    */
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        const size_t required = lodepng_get_raw_size_lct(w, h, colortype, bitdepth);
+        if (required > in.size()) return 84;
+
+        const unsigned char* pixels = in.empty() ? 0 : &in[0];
+        return encode(out, pixels, w, h, colortype, bitdepth);
+    }
+
+    /*
+    Encodes raw pixels to PNG with the settings held in state and appends the result to out.
+    Whatever buffer comes back is appended and then released, independent of the error code.
+    Returns the error code of lodepng_encode.
+    */
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        State& state)
+    {
+        unsigned char* png;
+        size_t pngsize;
+        const unsigned error = lodepng_encode(&png, &pngsize, in, w, h, &state);
+
+        if (!png) return error;
+
+        out.insert(out.end(), png, png + pngsize);
+        lodepng_free(png);
+        return error;
+    }
+
+    /*
+    Vector flavor of the State based encode. Returns 84 when in holds fewer bytes than an image of
+    w x h pixels in the color mode of state.info_raw needs, otherwise forwards to the pointer overload.
+    */
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        State& state)
+    {
+        const size_t required = lodepng_get_raw_size(w, h, &state.info_raw);
+        if (required > in.size()) return 84;
+
+        const unsigned char* pixels = in.empty() ? 0 : &in[0];
+        return encode(out, pixels, w, h, state);
+    }
+
+#ifdef LODEPNG_COMPILE_DISK
+    /*Encodes raw pixels to PNG in a temporary vector and saves it to filename. The file is only written when encoding succeeded.*/
+    unsigned encode(const std::string& filename,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        std::vector<unsigned char> png;
+        const unsigned error = encode(png, in, w, h, colortype, bitdepth);
+        if (error) return error;
+        return save_file(png, filename);
+    }
+
+    /*
+    Vector flavor of the file encode. Returns 84 when in holds fewer bytes than an image of w x h
+    pixels in the given color type and bit depth needs, otherwise forwards to the pointer overload.
+    */
+    unsigned encode(const std::string& filename,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        const size_t required = lodepng_get_raw_size_lct(w, h, colortype, bitdepth);
+        if (required > in.size()) return 84;
+
+        const unsigned char* pixels = in.empty() ? 0 : &in[0];
+        return encode(filename, pixels, w, h, colortype, bitdepth);
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_PNG */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
\ No newline at end of file
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/lodepng/lodepng.h b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/lodepng/lodepng.h
new file mode 100644
index 000000000..595312ca8
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/lodepng/lodepng.h
@@ -0,0 +1,1762 @@
+/*
+LodePNG version 20170917
+
+Copyright (c) 2005-2017 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+
+#ifndef LODEPNG_H
+#define LODEPNG_H
+
+#include <string.h> /*for size_t*/
+
+extern const char* LODEPNG_VERSION_STRING;
+
+/*
+The following #defines are used to create code sections. They can be disabled
+to disable code sections, which can give faster compile time and smaller binary.
+The "NO_COMPILE" defines are designed to be used to pass as defines to the
+compiler command to disable them without modifying this header, e.g.
+-DLODEPNG_NO_COMPILE_ZLIB for gcc.
+In addition to those below, you can also define LODEPNG_NO_COMPILE_CRC to
+allow implementing a custom lodepng_crc32.
+*/
+/*deflate & zlib. If disabled, you must specify alternative zlib functions in
+the custom_zlib field of the compress and decompress settings*/
+#ifndef LODEPNG_NO_COMPILE_ZLIB
+#define LODEPNG_COMPILE_ZLIB
+#endif
+/*png encoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_PNG
+#define LODEPNG_COMPILE_PNG
+#endif
+/*deflate&zlib decoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_DECODER
+#define LODEPNG_COMPILE_DECODER
+#endif
+/*deflate&zlib encoder and png encoder*/
+#ifndef LODEPNG_NO_COMPILE_ENCODER
+#define LODEPNG_COMPILE_ENCODER
+#endif
+/*the optional built in harddisk file loading and saving functions*/
+#ifndef LODEPNG_NO_COMPILE_DISK
+#define LODEPNG_COMPILE_DISK
+#endif
+/*support for chunks other than IHDR, IDAT, PLTE, tRNS, IEND: ancillary and unknown chunks*/
+#ifndef LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
+#define LODEPNG_COMPILE_ANCILLARY_CHUNKS
+#endif
+/*ability to convert error numerical codes to English text string*/
+#ifndef LODEPNG_NO_COMPILE_ERROR_TEXT
+#define LODEPNG_COMPILE_ERROR_TEXT
+#endif
+/*Compile the default allocators (C's free, malloc and realloc). If you disable this,
+you can define the functions lodepng_free, lodepng_malloc and lodepng_realloc in your
+source files with custom allocators.*/
+#ifndef LODEPNG_NO_COMPILE_ALLOCATORS
+#define LODEPNG_COMPILE_ALLOCATORS
+#endif
+/*compile the C++ version (you can disable the C++ wrapper here even when compiling for C++)*/
+#ifdef __cplusplus
+#ifndef LODEPNG_NO_COMPILE_CPP
+#define LODEPNG_COMPILE_CPP
+#endif
+#endif
+
+#ifdef LODEPNG_COMPILE_CPP
+#include <vector>
+#include <string>
+#endif /*LODEPNG_COMPILE_CPP*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*The PNG color types (also used for raw). The comment on each value lists the bit depths it is used with.*/
+typedef enum LodePNGColorType
+{
+    LCT_GREY = 0, /*greyscale: 1,2,4,8,16 bit*/
+    LCT_RGB = 2, /*RGB: 8,16 bit*/
+    LCT_PALETTE = 3, /*palette: 1,2,4,8 bit*/
+    LCT_GREY_ALPHA = 4, /*greyscale with alpha: 8,16 bit*/
+    LCT_RGBA = 6 /*RGB with alpha: 8,16 bit*/
+} LodePNGColorType;
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Converts PNG data in memory to raw pixel data.
+out: Output parameter. Pointer to buffer that will contain the raw pixel data.
+After decoding, its size is w * h * (bytes per pixel) bytes larger than
+initially. Bytes per pixel depends on colortype and bitdepth.
+Must be freed after usage with free(*out).
+Note: for 16-bit per channel colors, uses big endian format like PNG does.
+w: Output parameter. Pointer to width of pixel data.
+h: Output parameter. Pointer to height of pixel data.
+in: Memory buffer with the PNG file.
+insize: size of the in buffer.
+colortype: the desired color type for the raw output image. See explanation on PNG color types.
+bitdepth: the desired bit depth for the raw output image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_memory, but always decodes to 32-bit RGBA raw image*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize);
+
+/*Same as lodepng_decode_memory, but always decodes to 24-bit RGB raw image*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load PNG from disk, from file with given name.
+Same as the other decode functions, but instead takes a filename as input.
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_file, but always decodes to 32-bit RGBA raw image.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename);
+
+/*Same as lodepng_decode_file, but always decodes to 24-bit RGB raw image.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Converts raw pixel data into a PNG image in memory. The colortype and bitdepth
+of the output PNG image cannot be chosen, they are automatically determined
+by the colortype, bitdepth and content of the input pixel data.
+Note: for 16-bit per channel colors, needs big endian format like PNG does.
+out: Output parameter. Pointer to buffer that will contain the PNG image data.
+Must be freed after usage with free(*out).
+outsize: Output parameter. Pointer to the size in bytes of the out buffer.
+image: The raw pixel data to encode. The size of this buffer should be
+w * h * (bytes per pixel), bytes per pixel depends on colortype and bitdepth.
+w: width of the raw pixel data in pixels.
+h: height of the raw pixel data in pixels.
+colortype: the color type of the raw input image. See explanation on PNG color types.
+bitdepth: the bit depth of the raw input image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_memory, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_memory, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts raw pixel data into a PNG file on disk.
+Same as the other encode functions, but instead takes a filename as output.
+NOTE: This overwrites existing files without warning!
+*/
+unsigned lodepng_encode_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_file, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_file, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_DECODER
+    /*Same as lodepng_decode_memory, but decodes to an std::vector. The colortype
+    is the format to output the pixels to. Default is RGBA 8-bit per channel.*/
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const unsigned char* in, size_t insize,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::vector<unsigned char>& in,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Converts PNG file from disk to raw pixel data in memory.
+    Same as the other decode functions, but instead takes a filename as input.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::string& filename,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /*Same as lodepng_encode_memory, but encodes to an std::vector. colortype
+    is that of the raw input data. The output PNG color type will be auto chosen.*/
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Converts raw pixel data (RGBA 8-bit per channel by default) into a PNG file on disk.
+    Same as the other encode functions, but instead takes a filename as output.
+    NOTE: This overwrites existing files without warning!
+    */
+    unsigned encode(const std::string& filename,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned encode(const std::string& filename,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+  /*Returns an English description of the numerical error code.*/
+const char* lodepng_error_text(unsigned code);
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Settings for zlib decompression*/
+typedef struct LodePNGDecompressSettings LodePNGDecompressSettings;
+struct LodePNGDecompressSettings
+{
+    unsigned ignore_adler32; /*if 1, continue and don't give an error message if the Adler32 checksum is corrupted*/
+
+    /*use custom zlib decoder instead of built in one (default: null)*/
+    unsigned(*custom_zlib)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGDecompressSettings*);
+    /*use custom inflate function (deflate decoder) instead of built in one (default: null)
+    if custom_zlib is used, custom_inflate is ignored since only the built in
+    zlib function will call custom_inflate*/
+    unsigned(*custom_inflate)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGDecompressSettings*);
+
+    const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGDecompressSettings lodepng_default_decompress_settings;
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Settings for zlib compression. Tweaking these settings tweaks the balance
+between speed and compression ratio.
+*/
+typedef struct LodePNGCompressSettings LodePNGCompressSettings;
+struct LodePNGCompressSettings /*deflate = compress*/
+{
+    /*LZ77 related settings*/
+    unsigned btype; /*the block type for LZ (0, 1, 2 or 3, see zlib standard). Should be 2 for proper compression.*/
+    unsigned use_lz77; /*whether or not to use LZ77. Should be 1 for proper compression.*/
+    unsigned windowsize; /*must be a power of two <= 32768. higher compresses more but is slower. Default value: 2048.*/
+    unsigned minmatch; /*minimum lz77 length. 3 is normally best, 6 can be better for some PNGs. Default: 0*/
+    unsigned nicematch; /*stop searching if >= this length found. Set to 258 for best compression. Default: 128*/
+    unsigned lazymatching; /*use lazy matching: better compression but a bit slower. Default: true*/
+
+    /*use custom zlib encoder instead of built in one (default: null)*/
+    unsigned(*custom_zlib)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGCompressSettings*);
+    /*use custom deflate encoder instead of built in one (default: null)
+    if custom_zlib is used, custom_deflate is ignored since only the built in
+    zlib function will call custom_deflate*/
+    unsigned(*custom_deflate)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGCompressSettings*);
+
+    const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGCompressSettings lodepng_default_compress_settings;
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*
+Color mode of an image. Contains all information required to decode the pixel
+bits to RGBA colors. This information is the same as used in the PNG file
+format, and is used both for PNG and raw image data in LodePNG.
+*/
+typedef struct LodePNGColorMode
+{
+    /*header (IHDR)*/
+    LodePNGColorType colortype; /*color type, see PNG standard or documentation further in this header file*/
+    unsigned bitdepth;  /*bits per sample, see PNG standard or documentation further in this header file*/
+
+    /*
+    palette (PLTE and tRNS)
+
+    Dynamically allocated with the colors of the palette, including alpha.
+    When encoding a PNG, to store your colors in the palette of the LodePNGColorMode, first use
+    lodepng_palette_clear, then for each color use lodepng_palette_add.
+    If you encode an image without alpha with palette, don't forget to put value 255 in each A byte of the palette.
+
+    When decoding, by default you can ignore this palette, since LodePNG already
+    fills the palette colors in the pixels of the raw RGBA output.
+
+    The palette is only supported for color type 3.
+    */
+    unsigned char* palette; /*palette in RGBARGBA... order. When allocated, must be either 0, or have size 1024*/
+    size_t palettesize; /*palette size in number of colors (amount of bytes is 4 * palettesize)*/
+
+    /*
+    transparent color key (tRNS)
+
+    This color uses the same bit depth as the bitdepth value in this struct, which can be 1-bit to 16-bit.
+    For greyscale PNGs, r, g and b will all 3 be set to the same.
+
+    When decoding, by default you can ignore this information, since LodePNG sets
+    pixels with this key to transparent already in the raw RGBA output.
+
+    The color key is only supported for color types 0 and 2.
+    */
+    unsigned key_defined; /*is a transparent color key given? 0 = false, 1 = true*/
+    unsigned key_r;       /*red/greyscale component of color key*/
+    unsigned key_g;       /*green component of color key*/
+    unsigned key_b;       /*blue component of color key*/
+} LodePNGColorMode;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_color_mode_init(LodePNGColorMode* info);
+void lodepng_color_mode_cleanup(LodePNGColorMode* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source);
+
+void lodepng_palette_clear(LodePNGColorMode* info);
+/*add 1 color to the palette*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a);
+
+/*get the total amount of bits per pixel, based on colortype and bitdepth in the struct*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info);
+/*get the amount of color channels used, based on colortype in the struct.
+If a palette is used, it counts as 1 channel.*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info);
+/*is it a greyscale type? (only colortype 0 or 4)*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info);
+/*has it got an alpha channel? (only colortype 4 or 6)*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info);
+/*has it got a palette? (only colortype 3)*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info);
+/*only returns true if there is a palette and there is a value in the palette with alpha < 255.
+Loops through the palette to check this.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info);
+/*
+Check if the given color info indicates the possibility of having non-opaque pixels in the PNG image.
+Returns true if the image can have translucent or invisible pixels (it may still be opaque if it doesn't use such pixels).
+Returns false if the image can only have opaque pixels.
+In detail, it returns true only if it's a color type with alpha, or has a palette with non-opaque values,
+or if "key_defined" is true.
+*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info);
+/*Returns the byte size of a raw image buffer with given width, height and color mode*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*The information of a time (tIME) chunk in PNG.*/
+typedef struct LodePNGTime
+{
+    unsigned year;    /*2 bytes used (0-65535)*/
+    unsigned month;   /*1-12*/
+    unsigned day;     /*1-31*/
+    unsigned hour;    /*0-23*/
+    unsigned minute;  /*0-59*/
+    unsigned second;  /*0-60 (to allow for leap seconds)*/
+} LodePNGTime;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Information about the PNG image, except pixels, width and height.*/
+typedef struct LodePNGInfo
+{
+    /*header (IHDR), palette (PLTE) and transparency (tRNS) chunks*/
+    unsigned compression_method;/*compression method of the original file. Always 0.*/
+    unsigned filter_method;     /*filter method of the original file*/
+    unsigned interlace_method;  /*interlace method of the original file*/
+    LodePNGColorMode color;     /*color type and bits, palette and transparency of the PNG file*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*
+    suggested background color chunk (bKGD)
+    This color uses the same color mode as the PNG (except alpha channel), which can be 1-bit to 16-bit.
+
+    For greyscale PNGs, r, g and b will all 3 be set to the same. When encoding
+    the encoder writes the red one. For palette PNGs: When decoding, the RGB value
+    will be stored, not a palette index. But when encoding, specify the index of
+    the palette in background_r, the other two are then ignored.
+
+    The decoder does not use this background color to edit the color of pixels.
+    */
+    unsigned background_defined; /*is a suggested background color given?*/
+    unsigned background_r;       /*red component of suggested background color*/
+    unsigned background_g;       /*green component of suggested background color*/
+    unsigned background_b;       /*blue component of suggested background color*/
+
+    /*
+    non-international text chunks (tEXt and zTXt)
+
+    The char** arrays each contain num strings. The actual messages are in
+    text_strings, while text_keys are keywords that give a short description what
+    the actual text represents, e.g. Title, Author, Description, or anything else.
+
+    A keyword is minimum 1 character and maximum 79 characters long. It's
+    discouraged to use a single line length longer than 79 characters for texts.
+
+    Don't allocate these text buffers yourself. Use the init/cleanup functions
+    correctly and use lodepng_add_text and lodepng_clear_text.
+    */
+    size_t text_num; /*the amount of texts in these char** buffers (there may be more texts in itext)*/
+    char** text_keys; /*the keyword of a text chunk (e.g. "Comment")*/
+    char** text_strings; /*the actual text*/
+
+    /*
+    international text chunks (iTXt)
+    Similar to the non-international text chunks, but with additional strings
+    "langtags" and "transkeys".
+    */
+    size_t itext_num; /*the amount of international texts in this PNG*/
+    char** itext_keys; /*the English keyword of the text chunk (e.g. "Comment")*/
+    char** itext_langtags; /*language tag for this text's language, ISO/IEC 646 string, e.g. ISO 639 language tag*/
+    char** itext_transkeys; /*keyword translated to the international language - UTF-8 string*/
+    char** itext_strings; /*the actual international text - UTF-8 string*/
+
+    /*time chunk (tIME)*/
+    unsigned time_defined; /*set to 1 to make the encoder generate a tIME chunk*/
+    LodePNGTime time;
+
+    /*phys chunk (pHYs)*/
+    unsigned phys_defined; /*if 0, there is no pHYs chunk and the values below are undefined; if 1, there is one*/
+    unsigned phys_x; /*pixels per unit in x direction*/
+    unsigned phys_y; /*pixels per unit in y direction*/
+    unsigned phys_unit; /*may be 0 (unknown unit) or 1 (metre)*/
+
+    /*
+    unknown chunks
+    There are 3 buffers, one for each position in the PNG where unknown chunks can appear.
+    Each buffer contains all unknown chunks for that position consecutively.
+    The 3 buffers are the unknown chunks between certain critical chunks:
+    0: IHDR-PLTE, 1: PLTE-IDAT, 2: IDAT-IEND
+    Do not allocate or traverse this data yourself. Use the chunk traversing functions declared
+    later, such as lodepng_chunk_next and lodepng_chunk_append, to read/write this struct.
+    */
+    unsigned char* unknown_chunks_data[3];
+    size_t unknown_chunks_size[3]; /*size in bytes of the unknown chunks, given for protection*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGInfo;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_info_init(LodePNGInfo* info);
+void lodepng_info_cleanup(LodePNGInfo* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+void lodepng_clear_text(LodePNGInfo* info); /*use this to clear the texts again after you filled them in*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str); /*push back both texts at once*/
+
+void lodepng_clear_itext(LodePNGInfo* info); /*use this to clear the itexts again after you filled them in*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+    const char* transkey, const char* str); /*push back the 4 texts of 1 chunk at once*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Converts raw buffer from one color type to another color type, based on
+LodePNGColorMode structs to describe the input and output color type.
+See the reference manual at the end of this header file to see which color conversions are supported.
+return value = LodePNG error code (0 if all went ok, an error if the conversion isn't supported)
+The out buffer must have size (w * h * bpp + 7) / 8, where bpp is the bits per pixel
+of the output color type (lodepng_get_bpp).
+For < 8 bpp images, there should not be padding bits at the end of scanlines.
+For 16-bit per channel colors, uses big endian format like PNG does.
+Return value is LodePNG error code
+*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+    const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+    unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Settings for the decoder. This contains settings for the PNG and the Zlib
+decoder, but not the Info settings from the Info structs.
+*/
+typedef struct LodePNGDecoderSettings
+{
+    LodePNGDecompressSettings zlibsettings; /*in here is the setting to ignore Adler32 checksums*/
+
+    unsigned ignore_crc; /*ignore CRC checksums*/
+
+    unsigned color_convert; /*whether to convert the PNG to the color type you want. Default: yes*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    unsigned read_text_chunks; /*if false but remember_unknown_chunks is true, text chunks are stored in the unknown chunks*/
+    /*store all bytes from unknown chunks in the LodePNGInfo (off by default, useful for a png editor)*/
+    unsigned remember_unknown_chunks;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGDecoderSettings;
+
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Strategy the encoder uses to pick the filter type of each scanline (see filter_strategy in LodePNGEncoderSettings).*/
+typedef enum LodePNGFilterStrategy
+{
+    /*every filter at zero*/
+    LFS_ZERO,
+    /*Use filter that gives minimum sum, as described in the official PNG filter heuristic.*/
+    LFS_MINSUM,
+    /*Use the filter type that gives smallest Shannon entropy for this scanline. Depending
+    on the image, this is better or worse than minsum.*/
+    LFS_ENTROPY,
+    /*
+    Brute-force-search PNG filters by compressing each filter for each scanline.
+    Experimental, very slow, and only rarely gives better compression than MINSUM.
+    */
+    LFS_BRUTE_FORCE,
+    /*use predefined_filters buffer: you specify the filter type for each scanline*/
+    LFS_PREDEFINED
+} LodePNGFilterStrategy;
+
+/*Gives characteristics about the colors of the image, which helps decide which color model to use for encoding.
+Used internally by default if "auto_convert" is enabled. Public because it's useful for custom algorithms.*/
+typedef struct LodePNGColorProfile
+{
+    unsigned colored; /*not greyscale*/
+    unsigned key; /*image is not opaque and color key is possible instead of full alpha*/
+    unsigned short key_r; /*key values, always as 16-bit, in 8-bit case the byte is duplicated, e.g. 65535 means 255*/
+    unsigned short key_g; /*green key value, stored as 16-bit like key_r*/
+    unsigned short key_b; /*blue key value, stored as 16-bit like key_r*/
+    unsigned alpha; /*image is not opaque and alpha channel or alpha palette required*/
+    unsigned numcolors; /*amount of colors, up to 257. Not valid if bits == 16.*/
+    unsigned char palette[1024]; /*Remembers up to the first 256 RGBA colors, in no particular order*/
+    unsigned bits; /*bits per channel (not for palette). 1,2 or 4 for greyscale only. 16 if 16-bit per channel required.*/
+} LodePNGColorProfile;
+
+void lodepng_color_profile_init(LodePNGColorProfile* profile);
+
+/*Get a LodePNGColorProfile of the image.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in);
+/*The function LodePNG uses internally to decide the PNG color with auto_convert.
+Chooses an optimal color model, e.g. grey if only grey pixels, palette if < 256 colors, ...*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in);
+
+/*Settings for the encoder.*/
+typedef struct LodePNGEncoderSettings
+{
+    LodePNGCompressSettings zlibsettings; /*settings for the zlib encoder, such as window size, ...*/
+
+    unsigned auto_convert; /*automatically choose output PNG color type. Default: true*/
+
+    /*If true, follows the official PNG heuristic: if the PNG uses a palette or lower than
+    8 bit depth, set all filters to zero. Otherwise use the filter_strategy. Note that to
+    completely follow the official PNG heuristic, filter_palette_zero must be true and
+    filter_strategy must be LFS_MINSUM*/
+    unsigned filter_palette_zero;
+    /*Which filter strategy to use when not using zeroes due to filter_palette_zero.
+    Set filter_palette_zero to 0 to ensure always using your chosen strategy. Default: LFS_MINSUM*/
+    LodePNGFilterStrategy filter_strategy;
+    /*used if filter_strategy is LFS_PREDEFINED. In that case, this must point to a buffer with
+    the same length as the amount of scanlines in the image, and each value must be <= 5. You
+    have to cleanup this buffer, LodePNG will never free it. Don't forget that filter_palette_zero
+    must be set to 0 to ensure this is also used on palette or low bitdepth images.*/
+    const unsigned char* predefined_filters;
+
+    /*force creating a PLTE chunk if colortype is 2 or 6 (= a suggested palette).
+    If colortype is 3, PLTE is _always_ created.*/
+    unsigned force_palette;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*add LodePNG identifier and version as a text chunk, for debugging*/
+    unsigned add_id;
+    /*encode text chunks as zTXt chunks instead of tEXt chunks, and use compression in iTXt chunks*/
+    unsigned text_compression;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGEncoderSettings;
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+/*The settings, state and information for extended encoding and decoding.*/
+typedef struct LodePNGState
+{
+#ifdef LODEPNG_COMPILE_DECODER
+    LodePNGDecoderSettings decoder; /*the decoding settings*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+    LodePNGEncoderSettings encoder; /*the encoding settings*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+    LodePNGColorMode info_raw; /*specifies the format in which you would like to get the raw pixel buffer*/
+    LodePNGInfo info_png; /*info of the PNG image obtained after decoding*/
+    unsigned error; /*NOTE(review): presumably the error code of the most recent encode/decode call - confirm in lodepng.cpp*/
+#ifdef LODEPNG_COMPILE_CPP
+    /* Virtual destructor for the lodepng::State subclass; only compiled when LODEPNG_COMPILE_CPP is defined. */
+    virtual ~LodePNGState() {}
+#endif
+} LodePNGState;
+
+/*init, cleanup and copy functions to use with this struct (see the conventions for C-style structs in chapter 3 of the documentation)*/
+void lodepng_state_init(LodePNGState* state); /*always call this when making a new LodePNGState*/
+void lodepng_state_cleanup(LodePNGState* state); /*call this before the struct disappears, to avoid memory leaks*/
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source); /*use this instead of "="; dest must already be inited*/
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Same as lodepng_decode_memory, but uses a LodePNGState to allow custom settings (state->decoder,
+state->info_raw) and getting much more information about the PNG image and color mode (state->info_png).
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize);
+
+/*
+Read the PNG header, but not the actual data. This returns only the information
+that is in the header chunk (IHDR) of the PNG, such as width, height and color type.
+The information is placed in the info_png field of the LodePNGState.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*This function allocates the out buffer with standard malloc and stores the size in *outsize. The state gives the desired PNG (info_png), the color type of the raw image (info_raw) and the encoder settings.*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGState* state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*
+The lodepng_chunk functions are normally not needed, except to traverse the
+unknown chunks stored in the LodePNGInfo struct, or add new ones to it.
+It also allows traversing the chunks of an encoded PNG file yourself.
+
+PNG standard chunk naming conventions:
+First byte: uppercase = critical, lowercase = ancillary
+Second byte: uppercase = public, lowercase = private
+Third byte: must be uppercase
+Fourth byte: uppercase = unsafe to copy, lowercase = safe to copy
+*/
+
+/*
+Gets the length of the data of the chunk. The total chunk length is 12 bytes more
+(4 bytes length + 4 bytes type name + 4 bytes CRC). There must be at least 4 bytes
+to read from. If the result value is too large, it may be corrupt data.
+*/
+unsigned lodepng_chunk_length(const unsigned char* chunk);
+
+/*puts the 4-byte type name of the chunk in type, as a null terminated string (hence the 5 chars)*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk);
+
+/*check if the type is the given type*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type);
+
+/*0: it's one of the critical chunk types, 1: it's an ancillary chunk (see PNG standard)*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk);
+
+/*0: public, 1: private (see PNG standard)*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk);
+
+/*0: the chunk is unsafe to copy, 1: the chunk is safe to copy (see PNG standard)*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk);
+
+/*get pointer to the data of the chunk, where the input points to the header of the chunk*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk);
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk);
+
+/*returns 0 if the crc is correct, 1 if it's incorrect (0 for OK as usual!)*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk);
+
+/*generates the correct CRC from the data and puts it in the last 4 bytes of the chunk*/
+void lodepng_chunk_generate_crc(unsigned char* chunk);
+
+/*iterate to the next chunk. don't use on IEND chunk, as there is no next chunk then. Memory buffer boundaries are not checked: make sure the returned pointer is within the allocated memory*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk);
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk);
+
+/*
+Appends chunk to the data in out. The given chunk should already have its chunk header.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk);
+
+/*
+Appends new chunk to out. The chunk to append is given by giving its length, type
+and data separately. The type is a 4-letter string.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+    const char* type, const unsigned char* data);
+
+
+/*Calculate CRC32 of buffer*/
+unsigned lodepng_crc32(const unsigned char* buf, size_t len);
+#endif /*LODEPNG_COMPILE_PNG*/
+
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*
+This zlib part can be used independently to zlib compress and decompress a
+buffer. It cannot be used to create gzip files however, and it only supports the
+part of zlib that is required for PNG, it does not support dictionaries.
+*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Inflate a buffer. Inflate is the decompression step of deflate. Out buffer must be freed after use.*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings);
+
+/*
+Decompresses Zlib data. Reallocates the out buffer and appends the data. The
+data must be according to the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Compresses data with Zlib. Reallocates the out buffer and appends the data.
+Zlib adds a small header and trailer around the deflate data.
+The data is output in the format of the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings);
+
+/*
+Find length-limited Huffman code for given frequencies. This function is in the
+public interface only for tests, it's used internally by lodepng_deflate.
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+    size_t numcodes, unsigned maxbitlen);
+
+/*Compress a buffer with deflate. See RFC 1951. Out buffer must be freed after use.*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings);
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load a file from disk into buffer. The function allocates the out buffer, and
+after usage you should free it.
+out: output parameter, contains pointer to loaded buffer.
+outsize: output parameter, size of the allocated out buffer
+filename: the path to the file to load
+return value: error code (0 means ok)
+*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename);
+
+/*
+Save a file from buffer to disk. Warning, if it exists, this function overwrites
+the file without warning!
+buffer: the buffer to write
+buffersize: size of the buffer to write
+filename: the path to the file to save to
+return value: error code (0 means ok)
+*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+
+#ifdef LODEPNG_COMPILE_CPP
+/* The LodePNG C++ wrapper uses std::vectors instead of manually allocated memory buffers. */
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_PNG
+    class State : public LodePNGState /* a LodePNGState with constructor and destructor, for use with the C++ decode/encode overloads */
+    {
+    public:
+        State();
+        State(const State& other);
+        virtual ~State();
+        State& operator=(const State& other);
+    };
+
+#ifdef LODEPNG_COMPILE_DECODER
+    /* Same as other lodepng::decode, but using a State for more settings and information. */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const unsigned char* in, size_t insize);
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const std::vector<unsigned char>& in);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /* Same as other lodepng::encode, but using a State for more settings and information. */
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        State& state);
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        State& state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Load a file from disk into an std::vector.
+    return value: error code (0 means ok)
+    */
+    unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename);
+
+    /*
+    Save the binary data in an std::vector to a file on disk. The file is overwritten
+    without warning.
+    */
+    unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_PNG */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+    /* Zlib-decompress an unsigned char buffer */
+    unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+
+    /* Zlib-decompress an std::vector */
+    unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /* Zlib-compress an unsigned char buffer */
+    unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+
+    /* Zlib-compress an std::vector */
+    unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+
+  /*
+  TODO:
+  [.] test if there are no memory leaks or security exploits - done a lot but needs to be checked often
+  [.] check compatibility with various compilers  - done but needs to be redone for every newer version
+  [X] converting color to 16-bit per channel types
+  [ ] read all public PNG chunk types (but never let the color profile and gamma ones touch RGB values)
+  [ ] make sure encoder generates no chunks with size > (2^31)-1
+  [ ] partial decoding (stream processing)
+  [X] let the "isFullyOpaque" function check color keys and transparent palettes too
+  [X] better name for the variables "codes", "codesD", "codelengthcodes", "clcl" and "lldl"
+  [ ] don't stop decoding on errors like 69, 57, 58 (make warnings)
+  [ ] make warnings like: oob palette, checksum fail, data after iend, wrong/unknown crit chunk, no null terminator in text, ...
+  [ ] let the C++ wrapper catch exceptions coming from the standard library and return LodePNG error codes
+  [ ] allow user to provide custom color conversion functions, e.g. for premultiplied alpha, padding bits or not, ...
+  [ ] allow user to give data (void*) to custom allocator
+  */
+
+#endif /*LODEPNG_H inclusion guard*/
+
+  /*
+  LodePNG Documentation
+  ---------------------
+
+  0. table of contents
+  --------------------
+
+  1. about
+  1.1. supported features
+  1.2. features not supported
+  2. C and C++ version
+  3. security
+  4. decoding
+  5. encoding
+  6. color conversions
+  6.1. PNG color types
+  6.2. color conversions
+  6.3. padding bits
+  6.4. A note about 16-bits per channel and endianness
+  7. error values
+  8. chunks and PNG editing
+  9. compiler support
+  10. examples
+  10.1. decoder C++ example
+  10.2. decoder C example
+  11. state settings reference
+  12. changes
+  13. contact information
+
+
+  1. about
+  --------
+
+  PNG is a file format to store raster images losslessly with good compression,
+  supporting different color types and alpha channel.
+
+  LodePNG is a PNG codec according to the Portable Network Graphics (PNG)
+  Specification (Second Edition) - W3C Recommendation 10 November 2003.
+
+  The specifications used are:
+
+  *) Portable Network Graphics (PNG) Specification (Second Edition):
+  http://www.w3.org/TR/2003/REC-PNG-20031110
+  *) RFC 1950 ZLIB Compressed Data Format version 3.3:
+  http://www.gzip.org/zlib/rfc-zlib.html
+  *) RFC 1951 DEFLATE Compressed Data Format Specification ver 1.3:
+  http://www.gzip.org/zlib/rfc-deflate.html
+
+  The most recent version of LodePNG can currently be found at
+  http://lodev.org/lodepng/
+
+  LodePNG works both in C (ISO C90) and C++, with a C++ wrapper that adds
+  extra functionality.
+
+  LodePNG consists of two files:
+  -lodepng.h: the header file for both C and C++
+  -lodepng.c(pp): give it the name lodepng.c or lodepng.cpp (or .cc) depending on your usage
+
+  If you want to start using LodePNG right away without reading this doc, get the
+  examples from the LodePNG website to see how to use it in code, or check the
+  smaller examples in chapter 10 here.
+
+  LodePNG is simple but only supports the basic requirements. To achieve
+  simplicity, the following design choices were made: There are no dependencies
+  on any external library. There are functions to decode and encode a PNG with
+  a single function call, and extended versions of these functions taking a
+  LodePNGState struct allowing to specify or get more information. By default
+  the colors of the raw image are always RGB or RGBA, no matter what color type
+  the PNG file uses. To read and write files, there are simple functions to
+  convert the files to/from buffers in memory.
+
+  This all makes LodePNG suitable for loading textures in games, demos and small
+  programs, ... It's less suitable for full fledged image editors, loading PNGs
+  over network (it requires all the image data to be available before decoding can
+  begin), life-critical systems, ...
+
+  1.1. supported features
+  -----------------------
+
+  The following features are supported:
+
+  *) decoding of PNGs with any color type, bit depth and interlace mode, to a 24- or 32-bit color raw image,
+  or the same color type as the PNG
+  *) encoding of PNGs, from any raw image to 24- or 32-bit color, or the same color type as the raw image
+  *) Adam7 interlace and deinterlace for any color type
+  *) loading the image from harddisk or decoding it from a buffer from other sources than harddisk
+  *) support for alpha channels, including RGBA color model, translucent palettes and color keying
+  *) zlib decompression (inflate)
+  *) zlib compression (deflate)
+  *) CRC32 and ADLER32 checksums
+  *) handling of unknown chunks, allowing making a PNG editor that stores custom and unknown chunks.
+  *) the following chunks are supported (generated/interpreted) by both encoder and decoder:
+  IHDR: header information
+  PLTE: color palette
+  IDAT: pixel data
+  IEND: the final chunk
+  tRNS: transparency for palettized images
+  tEXt: textual information
+  zTXt: compressed textual information
+  iTXt: international textual information
+  bKGD: suggested background color
+  pHYs: physical dimensions
+  tIME: modification time
+
+  1.2. features not supported
+  ---------------------------
+
+  The following features are _not_ supported:
+
+  *) some features needed to make a conformant PNG-Editor might be still missing.
+  *) partial loading/stream processing. All data must be available and is processed in one call.
+  *) The following public chunks are not supported but treated as unknown chunks by LodePNG
+  cHRM, gAMA, iCCP, sRGB, sBIT, hIST, sPLT
+  Some of these are not supported on purpose: LodePNG wants to provide the RGB values
+  stored in the pixels, not values modified by system dependent gamma or color models.
+
+
+  2. C and C++ version
+  --------------------
+
+  The C version uses buffers allocated with alloc that you need to free()
+  yourself. You need to use init and cleanup functions for each struct whenever
+  using a struct from the C version to avoid exploits and memory leaks.
+
+  The C++ version has extra functions with std::vectors in the interface and the
+  lodepng::State class which is a LodePNGState with constructor and destructor.
+
+  These files work without modification for both C and C++ compilers because all
+  the additional C++ code is in "#ifdef __cplusplus" blocks that make C-compilers
+  ignore it, and the C code is made to compile both with strict ISO C90 and C++.
+
+  To use the C++ version, you need to rename the source file to lodepng.cpp
+  (instead of lodepng.c), and compile it with a C++ compiler.
+
+  To use the C version, you need to rename the source file to lodepng.c (instead
+  of lodepng.cpp), and compile it with a C compiler.
+
+
+  3. Security
+  -----------
+
+  Even if carefully designed, it's always possible that LodePNG contains possible
+  exploits. If you discover one, please let me know, and it will be fixed.
+
+  When using LodePNG, care has to be taken with the C version of LodePNG, as well
+  as the C-style structs when working with C++. The following conventions are used
+  for all C-style structs:
+
+  -if a struct has a corresponding init function, always call the init function when making a new one
+  -if a struct has a corresponding cleanup function, call it before the struct disappears to avoid memory leaks
+  -if a struct has a corresponding copy function, use the copy function instead of "=".
+  The destination must also be inited already.
+
+
+  4. Decoding
+  -----------
+
+  Decoding converts a PNG compressed image to a raw pixel buffer.
+
+  Most documentation on using the decoder is at its declarations in the header
+  above. For C, simple decoding can be done with functions such as
+  lodepng_decode32, and more advanced decoding can be done with the struct
+  LodePNGState and lodepng_decode. For C++, all decoding can be done with the
+  various lodepng::decode functions, and lodepng::State can be used for advanced
+  features.
+
+  When using the LodePNGState, it uses the following fields for decoding:
+  *) LodePNGInfo info_png: it stores extra information about the PNG (the input) in here
+  *) LodePNGColorMode info_raw: here you can say what color mode of the raw image (the output) you want to get
+  *) LodePNGDecoderSettings decoder: you can specify a few extra settings for the decoder to use
+
+  LodePNGInfo info_png
+  --------------------
+
+  After decoding, this contains extra information of the PNG image, except the actual
+  pixels, width and height because these are already gotten directly from the decoder
+  functions.
+
+  It contains for example the original color type of the PNG image, text comments,
+  suggested background color, etc... More details about the LodePNGInfo struct are
+  at its declaration documentation.
+
+  LodePNGColorMode info_raw
+  -------------------------
+
+  When decoding, here you can specify which color type you want
+  the resulting raw image to be. If this is different from the colortype of the
+  PNG, then the decoder will automatically convert the result. This conversion
+  always works, except if you want it to convert a color PNG to greyscale or to
+  a palette with missing colors.
+
+  By default, 32-bit color is used for the result.
+
+  LodePNGDecoderSettings decoder
+  ------------------------------
+
+  The settings can be used to ignore the errors created by invalid CRC and Adler32
+  checksums, and to disable the decoding of tEXt chunks.
+
+  There's also a setting color_convert, true by default. If false, no conversion
+  is done, the resulting data will be as it was in the PNG (after decompression)
+  and you'll have to puzzle the colors of the pixels together yourself using the
+  color type information in the LodePNGInfo.
+
+
+  5. Encoding
+  -----------
+
+  Encoding converts a raw pixel buffer to a PNG compressed image.
+
+  Most documentation on using the encoder is at its declarations in the header
+  above. For C, simple encoding can be done with functions such as
+  lodepng_encode32, and more advanced encoding can be done with the struct
+  LodePNGState and lodepng_encode. For C++, all encoding can be done with the
+  various lodepng::encode functions, and lodepng::State can be used for advanced
+  features.
+
+  Like the decoder, the encoder can also give errors. However, it gives fewer errors
+  since the encoder input is trusted, while the decoder input (a PNG image that could
+  be forged by anyone) is not trusted.
+
+  When using the LodePNGState, it uses the following fields for encoding:
+  *) LodePNGInfo info_png: here you specify how you want the PNG (the output) to be.
+  *) LodePNGColorMode info_raw: here you say what color type the raw image (the input) has
+  *) LodePNGEncoderSettings encoder: you can specify a few settings for the encoder to use
+
+  LodePNGInfo info_png
+  --------------------
+
+  When encoding, you use this the opposite way as when decoding: for encoding,
+  you fill in the values you want the PNG to have before encoding. By default it's
+  not needed to specify a color type for the PNG since it's automatically chosen,
+  but it's possible to choose it yourself given the right settings.
+
+  The encoder will not always exactly match the LodePNGInfo struct you give;
+  it tries to match it as closely as possible. Some things are ignored by the encoder. The
+  encoder uses, for example, the following settings from it when applicable:
+  colortype and bitdepth, text chunks, time chunk, the color key, the palette, the
+  background color, the interlace method, unknown chunks, ...
+
+  When encoding to a PNG with colortype 3, the encoder will generate a PLTE chunk.
+  If the palette contains any colors for which the alpha channel is not 255 (so
+  there are translucent colors in the palette), it'll add a tRNS chunk.
+
+  LodePNGColorMode info_raw
+  -------------------------
+
+  You specify the color type of the raw image that you give to the input here,
+  including a possible transparent color key and palette you happen to be using in
+  your raw image data.
+
+  By default, 32-bit color is assumed, meaning your input has to be in RGBA
+  format with 4 bytes (unsigned chars) per pixel.
+
+  LodePNGEncoderSettings encoder
+  ------------------------------
+
+  The following settings are supported (some are in sub-structs):
+  *) auto_convert: when this option is enabled, the encoder will
+  automatically choose the smallest possible color mode (including color key) that
+  can encode the colors of all pixels without information loss.
+  *) btype: the block type for LZ77. 0 = uncompressed, 1 = fixed huffman tree,
+  2 = dynamic huffman tree (best compression). Should be 2 for proper
+  compression.
+  *) use_lz77: whether or not to use LZ77 for compressed block types. Should be
+  true for proper compression.
+  *) windowsize: the window size used by the LZ77 encoder (1 - 32768). Has value
+  2048 by default, but can be set to 32768 for better, but slow, compression.
+  *) force_palette: if colortype is 2 or 6, you can make the encoder write a PLTE
+  chunk if force_palette is true. This can be used as a suggested palette to convert
+  to by viewers that don't support more than 256 colors (if those still exist)
+  *) add_id: add text chunk "Encoder: LodePNG <version>" to the image.
+  *) text_compression: default 1. If 1, it'll store texts as zTXt instead of tEXt chunks.
+  zTXt chunks use zlib compression on the text. This gives a smaller result on
+  large texts but a larger result on small texts (such as a single program name).
+  It's all tEXt or all zTXt though, there's no separate setting per text yet.
+
+
+  6. color conversions
+  --------------------
+
+  An important thing to note about LodePNG, is that the color type of the PNG, and
+  the color type of the raw image, are completely independent. By default, when
+  you decode a PNG, you get the result as a raw image in the color type you want,
+  no matter whether the PNG was encoded with a palette, greyscale or RGBA color.
+  And if you encode an image, by default LodePNG will automatically choose the PNG
+  color type that gives good compression based on the values of colors and amount
+  of colors in the image. It can be configured to let you control it instead as
+  well, though.
+
+  To be able to do this, LodePNG does conversions from one color mode to another.
+  It can convert from almost any color type to any other color type, except the
+  following conversions: RGB to greyscale is not supported, and converting to a
+  palette when the palette doesn't have a required color is not supported. This is
+  not supported on purpose: this is information loss which requires a color
+  reduction algorithm that is beyond the scope of a PNG encoder (yes, RGB to grey
+  is easy, but there are multiple ways if you want to give some channels more
+  weight).
+
+  By default, when decoding, you get the raw image in 32-bit RGBA or 24-bit RGB
+  color, no matter what color type the PNG has. And by default when encoding,
+  LodePNG automatically picks the best color model for the output PNG, and expects
+  the input image to be 32-bit RGBA or 24-bit RGB. So, unless you want to control
+  the color format of the images yourself, you can skip this chapter.
+
+  6.1. PNG color types
+  --------------------
+
+  A PNG image can have many color types, ranging from 1-bit color to 64-bit color,
+  as well as palettized color modes. After the zlib decompression and unfiltering
+  in the PNG image is done, the raw pixel data will have that color type and thus
+  a certain amount of bits per pixel. If you want the output raw image after
+  decoding to have another color type, a conversion is done by LodePNG.
+
+  The PNG specification gives the following color types:
+
+  0: greyscale, bit depths 1, 2, 4, 8, 16
+  2: RGB, bit depths 8 and 16
+  3: palette, bit depths 1, 2, 4 and 8
+  4: greyscale with alpha, bit depths 8 and 16
+  6: RGBA, bit depths 8 and 16
+
+  Bit depth is the amount of bits per pixel per color channel. So the total amount
+  of bits per pixel is: amount of channels * bitdepth.
+
+  6.2. color conversions
+  ----------------------
+
+  As explained in the sections about the encoder and decoder, you can specify
+  color types and bit depths in info_png and info_raw to change the default
+  behaviour.
+
+  If, when decoding, you want the raw image to be something else than the default,
+  you need to set the color type and bit depth you want in the LodePNGColorMode,
+  or the parameters colortype and bitdepth of the simple decoding function.
+
+  If, when encoding, you use another color type than the default in the raw input
+  image, you need to specify its color type and bit depth in the LodePNGColorMode
+  of the raw image, or use the parameters colortype and bitdepth of the simple
+  encoding function.
+
+  If, when encoding, you don't want LodePNG to choose the output PNG color type
+  but control it yourself, you need to set auto_convert in the encoder settings
+  to false, and specify the color type you want in the LodePNGInfo of the
+  encoder (including palette: it can generate a palette if auto_convert is true,
+  otherwise not).
+
+  If the input and output color type differ (whether user chosen or auto chosen),
+  LodePNG will do a color conversion, which follows the rules below, and may
+  sometimes result in an error.
+
+  To avoid some confusion:
+  -the decoder converts from PNG to raw image
+  -the encoder converts from raw image to PNG
+  -the colortype and bitdepth in LodePNGColorMode info_raw, are those of the raw image
+  -the colortype and bitdepth in the color field of LodePNGInfo info_png, are those of the PNG
+  -when encoding, the color type in LodePNGInfo is ignored if auto_convert
+  is enabled, it is automatically generated instead
+  -when decoding, the color type in LodePNGInfo is set by the decoder to that of the original
+  PNG image, but it can be ignored since the raw image has the color type you requested instead
+  -if the color type of the LodePNGColorMode and PNG image aren't the same, a conversion
+  between the color types is done if the color types are supported. If it is not
+  supported, an error is returned. If the types are the same, no conversion is done.
+  -even though some conversions aren't supported, LodePNG supports loading PNGs from any
+  colortype and saving PNGs to any colortype, sometimes it just requires preparing
+  the raw image correctly before encoding.
+  -both encoder and decoder use the same color converter.
+
+  Non supported color conversions:
+  -color to greyscale: no error is thrown, but the result will look ugly because
+  only the red channel is taken
+  -anything to palette when that palette does not have that color in it: in this
+  case an error is thrown
+
+  Supported color conversions:
+  -anything to 8-bit RGB, 8-bit RGBA, 16-bit RGB, 16-bit RGBA
+  -any grey or grey+alpha, to grey or grey+alpha
+  -anything to a palette, as long as the palette has the requested colors in it
+  -removing alpha channel
+  -higher to smaller bitdepth, and vice versa
+
+  If you want no color conversion to be done (e.g. for speed or control):
+  -In the encoder, you can make it save a PNG with any color type by giving the
+  raw color mode and LodePNGInfo the same color mode, and setting auto_convert to
+  false.
+  -In the decoder, you can make it store the pixel data in the same color type
+  as the PNG has, by setting the color_convert setting to false. Settings in
+  info_raw are then ignored.
+
+  The function lodepng_convert does the color conversion. It is available in the
+  interface but normally isn't needed since the encoder and decoder already call
+  it.
+
+  6.3. padding bits
+  -----------------
+
+  In the PNG file format, if a less than 8-bit per pixel color type is used and the scanlines
+  have a bit amount that isn't a multiple of 8, then padding bits are used so that each
+  scanline starts at a fresh byte. But that is NOT true for the LodePNG raw input and output.
+  The raw input image you give to the encoder, and the raw output image you get from the decoder
+  will NOT have these padding bits, e.g. in the case of a 1-bit image with a width
+  of 7 pixels, the first pixel of the second scanline will be the 8th bit of the first byte,
+  not the first bit of a new byte.
+
+  6.4. A note about 16-bits per channel and endianness
+  ----------------------------------------------------
+
+  LodePNG uses unsigned char arrays for 16-bit per channel colors too, just like
+  for any other color format. The 16-bit values are stored in big endian (most
+  significant byte first) in these arrays. This is the opposite order of the
+  little endian used by x86 CPUs.
+
+  LodePNG always uses big endian because the PNG file format does so internally.
+  Conversions to formats other than the one PNG uses internally are not supported by
+  LodePNG on purpose, there are myriads of formats, including endianness of 16-bit
+  colors, the order in which you store R, G, B and A, and so on. Supporting and
+  converting to/from all that is outside the scope of LodePNG.
+
+  This may mean that, depending on your use case, you may want to convert the big
+  endian output of LodePNG to little endian with a for loop. This is certainly not
+  always needed, many applications and libraries support big endian 16-bit colors
+  anyway, but it means you cannot simply cast the unsigned char* buffer to an
+  unsigned short* buffer on x86 CPUs.
+
+
+  7. error values
+  ---------------
+
+  All functions in LodePNG that return an error code, return 0 if everything went
+  OK, or a non-zero code if there was an error.
+
+  The meaning of the LodePNG error values can be retrieved with the function
+  lodepng_error_text: given the numerical error code, it returns a description
+  of the error in English as a string.
+
+  Check the implementation of lodepng_error_text to see the meaning of each code.
+
+
+  8. chunks and PNG editing
+  -------------------------
+
+  If you want to add extra chunks to a PNG you encode, or use LodePNG for a PNG
+  editor that should follow the rules about handling of unknown chunks, or if your
+  program is able to read other types of chunks than the ones handled by LodePNG,
+  then that's possible with the chunk functions of LodePNG.
+
+  A PNG chunk has the following layout:
+
+  4 bytes length
+  4 bytes type name
+  length bytes data
+  4 bytes CRC
+
+  8.1. iterating through chunks
+  -----------------------------
+
+  If you have a buffer containing the PNG image data, then the first chunk (the
+  IHDR chunk) starts at byte number 8 of that buffer. The first 8 bytes are the
+  signature of the PNG and are not part of a chunk. But if you start at byte 8
+  then you have a chunk, and can check the following things of it.
+
+  NOTE: none of these functions check for memory buffer boundaries. To avoid
+  exploits, always make sure the buffer contains all the data of the chunks.
+  When using lodepng_chunk_next, make sure the returned value is within the
+  allocated memory.
+
+  unsigned lodepng_chunk_length(const unsigned char* chunk):
+
+  Get the length of the chunk's data. The total chunk length is this length + 12.
+
+  void lodepng_chunk_type(char type[5], const unsigned char* chunk):
+  unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type):
+
+  Get the type of the chunk or compare if it's a certain type
+
+  unsigned char lodepng_chunk_critical(const unsigned char* chunk):
+  unsigned char lodepng_chunk_private(const unsigned char* chunk):
+  unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk):
+
+  Check if the chunk is critical in the PNG standard (only IHDR, PLTE, IDAT and IEND are).
+  Check if the chunk is private (public chunks are part of the standard, private ones not).
+  Check if the chunk is safe to copy. If it's not, then, when modifying data in a critical
+  chunk, unsafe to copy chunks of the old image may NOT be saved in the new one if your
+  program doesn't handle that type of unknown chunk.
+
+  unsigned char* lodepng_chunk_data(unsigned char* chunk):
+  const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk):
+
+  Get a pointer to the start of the data of the chunk.
+
+  unsigned lodepng_chunk_check_crc(const unsigned char* chunk):
+  void lodepng_chunk_generate_crc(unsigned char* chunk):
+
+  Check if the crc is correct or generate a correct one.
+
+  unsigned char* lodepng_chunk_next(unsigned char* chunk):
+  const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk):
+
+  Iterate to the next chunk. This works if you have a buffer with consecutive chunks. Note that these
+  functions do no boundary checking of the allocated data whatsoever, so make sure there is enough
+  data available in the buffer to be able to go to the next chunk.
+
+  unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk):
+  unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+  const char* type, const unsigned char* data):
+
+  These functions are used to create new chunks that are appended to the data in *out that has
+  length *outlength. The append function appends an existing chunk to the new data. The create
+  function creates a new chunk with the given parameters and appends it. Type is the 4-letter
+  name of the chunk.
+
+  8.2. chunks in info_png
+  -----------------------
+
+  The LodePNGInfo struct contains fields with the unknown chunk in it. It has 3
+  buffers (each with size) to contain 3 types of unknown chunks:
+  the ones that come before the PLTE chunk, the ones that come between the PLTE
+  and the IDAT chunks, and the ones that come after the IDAT chunks.
+  It's necessary to make the distinction between these 3 cases because the PNG
+  standard forces to keep the ordering of unknown chunks compared to the critical
+  chunks, but does not force any other ordering rules.
+
+  info_png.unknown_chunks_data[0] is the chunks before PLTE
+  info_png.unknown_chunks_data[1] is the chunks after PLTE, before IDAT
+  info_png.unknown_chunks_data[2] is the chunks after IDAT
+
+  The chunks in these 3 buffers can be iterated through and read by using the same
+  way described in the previous subchapter.
+
+  When using the decoder to decode a PNG, you can make it store all unknown chunks
+  if you set the option settings.remember_unknown_chunks to 1. By default, this
+  option is off (0).
+
+  The encoder will always encode unknown chunks that are stored in the info_png.
+  If you need it to add a particular chunk that isn't known by LodePNG, you can
+  use lodepng_chunk_append or lodepng_chunk_create to append it to the chunk data in
+  info_png.unknown_chunks_data[x].
+
+  Chunks that are known by LodePNG should not be added in that way. E.g. to make
+  LodePNG add a bKGD chunk, set background_defined to true and add the correct
+  parameters there instead.
+
+
+  9. compiler support
+  -------------------
+
+  No libraries other than the current standard C library are needed to compile
+  LodePNG. For the C++ version, only the standard C++ library is needed on top.
+  Add the files lodepng.c(pp) and lodepng.h to your project, include
+  lodepng.h where needed, and your program can read/write PNG files.
+
+  It is compatible with C90 and up, and C++03 and up.
+
+  If performance is important, use optimization when compiling! For both the
+  encoder and decoder, this makes a large difference.
+
+  Make sure that LodePNG is compiled with the same compiler of the same version
+  and with the same settings as the rest of the program, or the interfaces with
+  std::vectors and std::strings in C++ can be incompatible.
+
+  CHAR_BIT must be 8 or higher, because LodePNG uses unsigned chars for octets.
+
+  *) gcc and g++
+
+  LodePNG is developed in gcc so this compiler is natively supported. It gives no
+  warnings with compiler options "-Wall -Wextra -pedantic -ansi", with gcc and g++
+  version 4.7.1 on Linux, 32-bit and 64-bit.
+
+  *) Clang
+
+  Fully supported and warning-free.
+
+  *) Mingw
+
+  The Mingw compiler (a port of gcc for Windows) should be fully supported by
+  LodePNG.
+
+  *) Visual Studio and Visual C++ Express Edition
+
+  LodePNG should be warning-free with warning level W4. Two warnings were disabled
+  with pragmas though: warning 4244 about implicit conversions, and warning 4996
+  where it wants to use a non-standard function fopen_s instead of the standard C
+  fopen.
+
+  Visual Studio may want "stdafx.h" files to be included in each source file and
+  give an error "unexpected end of file while looking for precompiled header".
+  This is not standard C++ and will not be added to the stock LodePNG. You can
+  disable it for lodepng.cpp only by right clicking it, Properties, C/C++,
+  Precompiled Headers, and set it to Not Using Precompiled Headers there.
+
+  NOTE: Modern versions of VS should be fully supported, but old versions, e.g.
+  VS6, are not guaranteed to work.
+
+  *) Compilers on Macintosh
+
+  LodePNG has been reported to work both with gcc and LLVM for Macintosh, both for
+  C and C++.
+
+  *) Other Compilers
+
+  If you encounter problems on any compilers, feel free to let me know and I may
+  try to fix it if the compiler is modern and standards compliant.
+
+
+  10. examples
+  ------------
+
+  This decoder example shows the most basic usage of LodePNG. More complex
+  examples can be found on the LodePNG website.
+
+  10.1. decoder C++ example
+  -------------------------
+
+  #include "lodepng.h"
+  #include <iostream>
+
+  int main(int argc, char *argv[])
+  {
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  //load and decode
+  std::vector<unsigned char> image;
+  unsigned width, height;
+  unsigned error = lodepng::decode(image, width, height, filename);
+
+  //if there's an error, display it
+  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
+
+  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
+  }
+
+  10.2. decoder C example
+  -----------------------
+
+  #include "lodepng.h"
+
+  int main(int argc, char *argv[])
+  {
+  unsigned error;
+  unsigned char* image;
+  unsigned width, height;
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  error = lodepng_decode32_file(&image, &width, &height, filename);
+
+  if(error) printf("decoder error %u: %s\n", error, lodepng_error_text(error));
+
+  / * use image here * /
+
+  free(image);
+  return 0;
+  }
+
+  11. state settings reference
+  ----------------------------
+
+  A quick reference of some settings to set on the LodePNGState
+
+  For decoding:
+
+  state.decoder.zlibsettings.ignore_adler32: ignore ADLER32 checksums
+  state.decoder.zlibsettings.custom_...: use custom inflate function
+  state.decoder.ignore_crc: ignore CRC checksums
+  state.decoder.color_convert: convert internal PNG color to chosen one
+  state.decoder.read_text_chunks: whether to read in text metadata chunks
+  state.decoder.remember_unknown_chunks: whether to read in unknown chunks
+  state.info_raw.colortype: desired color type for decoded image
+  state.info_raw.bitdepth: desired bit depth for decoded image
+  state.info_raw....: more color settings, see struct LodePNGColorMode
+  state.info_png....: no settings for decoder but output, see struct LodePNGInfo
+
+  For encoding:
+
+  state.encoder.zlibsettings.btype: disable compression by setting it to 0
+  state.encoder.zlibsettings.use_lz77: use LZ77 in compression
+  state.encoder.zlibsettings.windowsize: tweak LZ77 windowsize
+  state.encoder.zlibsettings.minmatch: tweak min LZ77 length to match
+  state.encoder.zlibsettings.nicematch: tweak LZ77 match where to stop searching
+  state.encoder.zlibsettings.lazymatching: try one more LZ77 matching
+  state.encoder.zlibsettings.custom_...: use custom deflate function
+  state.encoder.auto_convert: choose optimal PNG color type, if 0 uses info_png
+  state.encoder.filter_palette_zero: PNG filter strategy for palette
+  state.encoder.filter_strategy: PNG filter strategy to encode with
+  state.encoder.force_palette: add palette even if not encoding to one
+  state.encoder.add_id: add LodePNG identifier and version as a text chunk
+  state.encoder.text_compression: use compressed text chunks for metadata
+  state.info_raw.colortype: color type of raw input image you provide
+  state.info_raw.bitdepth: bit depth of raw input image you provide
+  state.info_raw: more color settings, see struct LodePNGColorMode
+  state.info_png.color.colortype: desired color type if auto_convert is false
+  state.info_png.color.bitdepth: desired bit depth if auto_convert is false
+  state.info_png.color....: more color settings, see struct LodePNGColorMode
+  state.info_png....: more PNG related settings, see struct LodePNGInfo
+
+
+  12. changes
+  -----------
+
+  The version number of LodePNG is the date of the change given in the format
+  yyyymmdd.
+
+  Some changes aren't backwards compatible. Those are indicated with a (!)
+  symbol.
+
+  *) 17 sep 2017: fix memory leak for some encoder input error cases
+  *) 27 nov 2016: grey+alpha auto color model detection bugfix
+  *) 18 apr 2016: Changed qsort to custom stable sort (for platforms w/o qsort).
+  *) 09 apr 2016: Fixed colorkey usage detection, and better file loading (within
+  the limits of pure C90).
+  *) 08 dec 2015: Made load_file function return error if file can't be opened.
+  *) 24 okt 2015: Bugfix with decoding to palette output.
+  *) 18 apr 2015: Boundary PM instead of just package-merge for faster encoding.
+  *) 23 aug 2014: Reduced needless memory usage of decoder.
+  *) 28 jun 2014: Removed fix_png setting, always support palette OOB for
+  simplicity. Made ColorProfile public.
+  *) 09 jun 2014: Faster encoder by fixing hash bug and more zeros optimization.
+  *) 22 dec 2013: Power of two windowsize required for optimization.
+  *) 15 apr 2013: Fixed bug with LAC_ALPHA and color key.
+  *) 25 mar 2013: Added an optional feature to ignore some PNG errors (fix_png).
+  *) 11 mar 2013 (!): Bugfix with custom free. Changed from "my" to "lodepng_"
+  prefix for the custom allocators and made it possible with a new #define to
+  use custom ones in your project without needing to change lodepng's code.
+  *) 28 jan 2013: Bugfix with color key.
+  *) 27 okt 2012: Tweaks in text chunk keyword length error handling.
+  *) 8 okt 2012 (!): Added new filter strategy (entropy) and new auto color mode.
+  (no palette). Better deflate tree encoding. New compression tweak settings.
+  Faster color conversions while decoding. Some internal cleanups.
+  *) 23 sep 2012: Reduced warnings in Visual Studio a little bit.
+  *) 1 sep 2012 (!): Removed #define's for giving custom (de)compression functions
+  and made it work with function pointers instead.
+  *) 23 jun 2012: Added more filter strategies. Made it easier to use custom alloc
+  and free functions and toggle #defines from compiler flags. Small fixes.
+  *) 6 may 2012 (!): Made plugging in custom zlib/deflate functions more flexible.
+  *) 22 apr 2012 (!): Made interface more consistent, renaming a lot. Removed
+  redundant C++ codec classes. Reduced amount of structs. Everything changed,
+  but it is cleaner now imho and functionality remains the same. Also fixed
+  several bugs and shrunk the implementation code. Made new samples.
+  *) 6 nov 2011 (!): By default, the encoder now automatically chooses the best
+  PNG color model and bit depth, based on the amount and type of colors of the
+  raw image. For this, autoLeaveOutAlphaChannel replaced by auto_choose_color.
+  *) 9 okt 2011: simpler hash chain implementation for the encoder.
+  *) 8 sep 2011: lz77 encoder lazy matching instead of greedy matching.
+  *) 23 aug 2011: tweaked the zlib compression parameters after benchmarking.
+  A bug with the PNG filtertype heuristic was fixed, so that it chooses much
+  better ones (it's quite significant). A setting to do an experimental, slow,
+  brute force search for PNG filter types is added.
+  *) 17 aug 2011 (!): changed some C zlib related function names.
+  *) 16 aug 2011: made the code less wide (max 120 characters per line).
+  *) 17 apr 2011: code cleanup. Bugfixes. Convert low to 16-bit per sample colors.
+  *) 21 feb 2011: fixed compiling for C90. Fixed compiling with sections disabled.
+  *) 11 dec 2010: encoding is made faster, based on suggestion by Peter Eastman
+  to optimize long sequences of zeros.
+  *) 13 nov 2010: added LodePNG_InfoColor_hasPaletteAlpha and
+  LodePNG_InfoColor_canHaveAlpha functions for convenience.
+  *) 7 nov 2010: added LodePNG_error_text function to get error code description.
+  *) 30 okt 2010: made decoding slightly faster
+  *) 26 okt 2010: (!) changed some C function and struct names (more consistent).
+  Reorganized the documentation and the declaration order in the header.
+  *) 08 aug 2010: only changed some comments and external samples.
+  *) 05 jul 2010: fixed bug thanks to warnings in the new gcc version.
+  *) 14 mar 2010: fixed bug where too much memory was allocated for char buffers.
+  *) 02 sep 2008: fixed bug where it could create empty tree that linux apps could
+  read by ignoring the problem but windows apps couldn't.
+  *) 06 jun 2008: added more error checks for out of memory cases.
+  *) 26 apr 2008: added a few more checks here and there to ensure more safety.
+  *) 06 mar 2008: crash with encoding of strings fixed
+  *) 02 feb 2008: support for international text chunks added (iTXt)
+  *) 23 jan 2008: small cleanups, and #defines to divide code in sections
+  *) 20 jan 2008: support for unknown chunks allowing using LodePNG for an editor.
+  *) 18 jan 2008: support for tIME and pHYs chunks added to encoder and decoder.
+  *) 17 jan 2008: ability to encode and decode compressed zTXt chunks added
+  Also various fixes, such as in the deflate and the padding bits code.
+  *) 13 jan 2008: Added ability to encode Adam7-interlaced images. Improved
+  filtering code of encoder.
+  *) 07 jan 2008: (!) changed LodePNG to use ISO C90 instead of C++. A
+  C++ wrapper around this provides an interface almost identical to before.
+  Having LodePNG be pure ISO C90 makes it more portable. The C and C++ code
+  are together in these files but it works both for C and C++ compilers.
+  *) 29 dec 2007: (!) changed most integer types to unsigned int + other tweaks
+  *) 30 aug 2007: bug fixed which makes this Borland C++ compatible
+  *) 09 aug 2007: some VS2005 warnings removed again
+  *) 21 jul 2007: deflate code placed in new namespace separate from zlib code
+  *) 08 jun 2007: fixed bug with 2- and 4-bit color, and small interlaced images
+  *) 04 jun 2007: improved support for Visual Studio 2005: crash with accessing
+  invalid std::vector element [0] fixed, and level 3 and 4 warnings removed
+  *) 02 jun 2007: made the encoder add a tag with version by default
+  *) 27 may 2007: zlib and png code separated (but still in the same file),
+  simple encoder/decoder functions added for more simple usage cases
+  *) 19 may 2007: minor fixes, some code cleaning, new error added (error 69),
+  moved some examples from here to lodepng_examples.cpp
+  *) 12 may 2007: palette decoding bug fixed
+  *) 24 apr 2007: changed the license from BSD to the zlib license
+  *) 11 mar 2007: very simple addition: ability to encode bKGD chunks.
+  *) 04 mar 2007: (!) tEXt chunk related fixes, and support for encoding
+  palettized PNG images. Plus little interface change with palette and texts.
+  *) 03 mar 2007: Made it encode dynamic Huffman shorter with repeat codes.
+  Fixed a bug where the end code of a block had length 0 in the Huffman tree.
+  *) 26 feb 2007: Huffman compression with dynamic trees (BTYPE 2) now implemented
+  and supported by the encoder, resulting in smaller PNGs at the output.
+  *) 27 jan 2007: Made the Adler-32 test faster so that a timewaste is gone.
+  *) 24 jan 2007: gave encoder an error interface. Added color conversion from any
+  greyscale type to 8-bit greyscale with or without alpha.
+  *) 21 jan 2007: (!) Totally changed the interface. It allows more color types
+  to convert to and is more uniform. See the manual for how it works now.
+  *) 07 jan 2007: Some cleanup & fixes, and a few changes over the last days:
+  encode/decode custom tEXt chunks, separate classes for zlib & deflate, and
+  at last made the decoder give errors for incorrect Adler32 or Crc.
+  *) 01 jan 2007: Fixed bug with encoding PNGs with less than 8 bits per channel.
+  *) 29 dec 2006: Added support for encoding images without alpha channel, and
+  cleaned out code as well as making certain parts faster.
+  *) 28 dec 2006: Added "Settings" to the encoder.
+  *) 26 dec 2006: The encoder now does LZ77 encoding and produces much smaller files now.
+  Removed some code duplication in the decoder. Fixed little bug in an example.
+  *) 09 dec 2006: (!) Placed output parameters of public functions as first parameter.
+  Fixed a bug of the decoder with 16-bit per color.
+  *) 15 okt 2006: Changed documentation structure
+  *) 09 okt 2006: Encoder class added. It encodes a valid PNG image from the
+  given image buffer, however for now it's not compressed.
+  *) 08 sep 2006: (!) Changed to interface with a Decoder class
+  *) 30 jul 2006: (!) LodePNG_InfoPng , width and height are now retrieved in different
+  way. Renamed decodePNG to decodePNGGeneric.
+  *) 29 jul 2006: (!) Changed the interface: image info is now returned as a
+  struct of type LodePNG::LodePNG_Info, instead of a vector, which was a bit clumsy.
+  *) 28 jul 2006: Cleaned the code and added new error checks.
+  Corrected terminology "deflate" into "inflate".
+  *) 23 jun 2006: Added SDL example in the documentation in the header, this
+  example allows easy debugging by displaying the PNG and its transparency.
+  *) 22 jun 2006: (!) Changed way to obtain error value. Added
+  loadFile function for convenience. Made decodePNG32 faster.
+  *) 21 jun 2006: (!) Changed type of info vector to unsigned.
+  Changed position of palette in info vector. Fixed an important bug that
+  happened on PNGs with an uncompressed block.
+  *) 16 jun 2006: Internally changed unsigned into unsigned where
+  needed, and performed some optimizations.
+  *) 07 jun 2006: (!) Renamed functions to decodePNG and placed them
+  in LodePNG namespace. Changed the order of the parameters. Rewrote the
+  documentation in the header. Renamed files to lodepng.cpp and lodepng.h
+  *) 22 apr 2006: Optimized and improved some code
+  *) 07 sep 2005: (!) Changed to std::vector interface
+  *) 12 aug 2005: Initial release (C++, decoder only)
+
+
+  13. contact information
+  -----------------------
+
+  Feel free to contact me with suggestions, problems, comments, ... concerning
+  LodePNG. If you encounter a PNG image that doesn't work properly with this
+  decoder, feel free to send it and I'll use it to find and fix the problem.
+
+  My email address is (puzzle the account and domain together with an @ symbol):
+  Domain: gmail dot com.
+  Account: lode dot vandevenne.
+
+
+  Copyright (c) 2005-2017 Lode Vandevenne
+  */
\ No newline at end of file
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/main.cpp b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/main.cpp
new file mode 100644
index 000000000..a3ebb66ec
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/main.cpp
@@ -0,0 +1,234 @@
+// Minimal C++ example for using CopyTensors in an EP-agnostic way.
+// Model taken from: https://github.com/yakhyo/fast-neural-style-transfer
+// (MIT license).
+// Goals:
+//   - Avoid serial CPU <-> GPU transfers at each inference.
+//     * If really needed, demonstrate how to use asynchronous streams to
+//       handle the transfers.
+
+#include <cstdlib>
+#include <filesystem>
+#include <string>
+constexpr int image_dim = 1080;
+
+#include <onnxruntime/core/graph/constants.h>
+#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
+#include <onnxruntime/core/session/onnxruntime_run_options_config_keys.h>
+#include <onnxruntime/core/session/onnxruntime_session_options_config_keys.h>
+#include <stdio.h>
+
+#include <format>
+
+#include "utils.h"
+
+// Owning handle for an OrtSyncStream. The deleter is supplied at construction;
+// main() passes one that calls OrtApi::ReleaseSyncStream.
+using StreamUniquePtr = std::unique_ptr<OrtSyncStream, std::function<void(OrtSyncStream*)>>;
+// String whose character type is ORTCHAR_T; used in this file for the paths
+// handed to ONNX Runtime (model file, execution provider libraries).
+using OrtFileString = std::basic_string<ORTCHAR_T>;
+
+/// Converts a filesystem path into a string of ORTCHAR_T characters.
+///
+/// The conversion is delegated to std::filesystem::path::string<CharT>() so
+/// that the path is re-encoded for the target character type. Copying the
+/// narrow string element by element would only be correct for ASCII paths
+/// when ORTCHAR_T is a wide character type.
+///
+/// @param path Path to convert.
+/// @return The path as an OrtFileString.
+static OrtFileString toOrtFileString(const std::filesystem::path& path) {
+  return path.string<ORTCHAR_T>();
+}
+
+// Host-side image buffers, 3 * image_dim * image_dim floats each.
+// cpuInputFloat is filled by loadInputImage() in main(); cpuOutputFloat backs
+// the CPU output tensor and is the buffer handed to saveOutputImage().
+float cpuInputFloat[3 * image_dim * image_dim];
+float cpuOutputFloat[3 * image_dim * image_dim];
+
+// Expands to a std::pair of the registration name NAME and the result of
+// DLL_NAME applied to "onnxruntime_providers_<NAME>".
+// NAME must be a string literal: it is joined to the prefix by adjacent
+// string-literal concatenation.
+// NOTE(review): DLL_NAME is not defined in this file; presumably utils.h maps
+// the base name to a platform-specific library file name — confirm.
+#define PROVIDER_LIB_PAIR(NAME) \
+  std::pair { NAME, DLL_NAME("onnxruntime_providers_" NAME) }
+
+/// Registers the execution provider libraries found next to the executable.
+///
+/// For every (registration name, library file name) pair the library is looked
+/// up in the directory of the running executable. A library that is not a
+/// regular file is logged and skipped. A library whose registration throws is
+/// logged together with the exception message and skipped, so one broken
+/// provider does not prevent the others from being registered.
+///
+/// @param env Environment on which the provider libraries are registered.
+static void register_execution_providers(Ort::Env& env) {
+  // clang-format off
+  std::array provider_libraries{
+      PROVIDER_LIB_PAIR("nv_tensorrt_rtx"),
+      PROVIDER_LIB_PAIR("cuda"),
+      PROVIDER_LIB_PAIR("openvino"),
+      PROVIDER_LIB_PAIR("qnn"),
+      PROVIDER_LIB_PAIR("cann"),
+  };
+  // clang-format on
+
+  // The executable's directory is the same for every library; resolve it once.
+  const std::filesystem::path executable_dir = get_executable_path().parent_path();
+
+  for (const auto& [registration_name, dll] : provider_libraries) {
+    const std::filesystem::path providers_library = executable_dir / dll;
+    if (!std::filesystem::is_regular_file(providers_library)) {
+      LOG("{} does not exist! Skipping execution provider", providers_library.string());
+      continue;
+    }
+    try {
+      env.RegisterExecutionProviderLibrary(registration_name, toOrtFileString(providers_library));
+    } catch (const std::exception& ex) {
+      // Report why the registration failed instead of dropping the exception.
+      LOG("Failed to register {}! Skipping execution provider: {}", providers_library.string(), ex.what());
+    }
+  }
+}
+
+/// Runs the candy style-transfer model with device-resident tensors.
+///
+/// Flow:
+///   1. Register the execution provider libraries and look up the
+///      NvTensorRTRTX EP device.
+///   2. Create sync streams for that device and pass the first one to the
+///      session's EPs as user compute stream.
+///   3. Load Input.png into a CPU tensor, allocate device tensors and upload
+///      the input with CopyTensors.
+///   4. Run the model (through IoBinding when device tensors are used).
+///   5. Copy the result back to the host and write output.png.
+///
+/// @return EXIT_SUCCESS on success; EXIT_FAILURE when no suitable EP device is
+///         found, the model does not have exactly one input, or an exception
+///         is caught.
+int main() {
+  try {
+    constexpr size_t kNumElements = size_t{3} * image_dim * image_dim;
+
+    OrtApi const& ortApi = Ort::GetApi();
+    Ort::Env ortEnvironment(ORT_LOGGING_LEVEL_WARNING, "HelloOrtNv");
+    Ort::SessionOptions sessionOptions;
+    sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
+    sessionOptions.DisableMemPattern();
+    sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+    CHECK_ORT(ortApi.AddFreeDimensionOverrideByName(sessionOptions, "batch_size", 1));
+
+    register_execution_providers(ortEnvironment);
+
+    sessionOptions.SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy_PREFER_GPU);
+
+    const OrtEpDevice* const* ep_devices = nullptr;
+    size_t num_ep_devices = 0;
+    CHECK_ORT(ortApi.GetEpDevices(ortEnvironment, &ep_devices, &num_ep_devices));
+
+    // Sync streams are only exercised with the TensorRT RTX EP in this sample.
+    const OrtEpDevice* trt_ep_device = nullptr;
+    for (size_t i = 0; i < num_ep_devices; i++) {
+      if (strcmp(ortApi.EpDevice_EpName(ep_devices[i]), onnxruntime::kNvTensorRTRTXExecutionProvider) == 0) {
+        trt_ep_device = ep_devices[i];
+        break;
+      }
+    }
+    if (trt_ep_device == nullptr) {
+      LOG("Failed to find EP device with support for sync streams");
+      return EXIT_FAILURE;
+    }
+
+    // Each stream is handed to its owner right after creation so that it is
+    // released even when a later call fails. The deleter refers to the API
+    // table by reference instead of copying the whole OrtApi struct.
+    const auto release_stream = [&ortApi](OrtSyncStream* s) { ortApi.ReleaseSyncStream(s); };
+    OrtSyncStream* stream = nullptr;
+    CHECK_ORT(ortApi.CreateSyncStreamForEpDevice(trt_ep_device, nullptr, &stream));
+    StreamUniquePtr stream_ptr(stream, release_stream);
+    // NOTE(review): upload_stream is created but not used by any call below.
+    OrtSyncStream* upload_stream = nullptr;
+    CHECK_ORT(ortApi.CreateSyncStreamForEpDevice(trt_ep_device, nullptr, &upload_stream));
+    StreamUniquePtr upload_stream_ptr(upload_stream, release_stream);
+
+    // The native stream handle is passed to the EPs as a decimal string.
+    const size_t stream_addr = reinterpret_cast<size_t>(ortApi.SyncStream_GetHandle(stream));
+    const std::string streamAddress = std::to_string(stream_addr);
+    const char* option_keys[] = {"user_compute_stream", "has_user_compute_stream"};
+    const char* option_values[] = {streamAddress.c_str(), "1"};
+    // NOTE(review): the stream was created for the TensorRT RTX device but the
+    // options are applied to every non-CPU EP device — confirm this is intended.
+    for (size_t i = 0; i < num_ep_devices; i++) {
+      if (strcmp(ortApi.EpDevice_EpName(ep_devices[i]), onnxruntime::kCpuExecutionProvider) != 0) {
+        CHECK_ORT(ortApi.SessionOptionsAppendExecutionProvider_V2(sessionOptions, ortEnvironment, &ep_devices[i], 1,
+                                                                  option_keys, option_values, 2));
+      }
+    }
+
+    const std::filesystem::path exe_dir = get_executable_parent_path();
+    Ort::Session session(ortEnvironment, toOrtFileString(exe_dir / "candy.onnx").c_str(), sessionOptions);
+
+    // Exactly one CPU input/output tensor pair is prepared below, so any other
+    // input count would index past the end of those vectors.
+    const size_t num_inputs = session.GetInputCount();
+    if (num_inputs != 1) {
+      LOG("This sample expects a model with exactly one input");
+      return EXIT_FAILURE;
+    }
+
+    // The API fills one entry per input, so the buffer must hold num_inputs entries.
+    std::vector<const OrtEpDevice*> session_ep_devices(num_inputs, nullptr);
+    CHECK_ORT(ortApi.SessionGetEpDeviceForInputs(session, session_ep_devices.data(), num_inputs));
+
+    // Owns every OrtMemoryInfo created below; released when main() unwinds.
+    std::vector<Ort::MemoryInfo> device_memory_infos;
+    std::vector<Ort::Value> cpu_input_tensors;
+    std::vector<Ort::Value> cpu_output_tensors;
+    std::vector<const OrtValue*> src_tensor_ptrs;
+    std::vector<OrtValue*> dst_tensor_ptrs;
+    std::vector<Ort::Value> input_tensors;
+    std::vector<Ort::Value> output_tensors;
+
+    Ort::AllocatorWithDefaultOptions cpu_allocator;
+    Ort::AllocatedStringPtr InputTensorName = session.GetInputNameAllocated(0, cpu_allocator);
+    Ort::AllocatedStringPtr OutputTensorName = session.GetOutputNameAllocated(0, cpu_allocator);
+
+    std::vector<int64_t> input_shape{1, 3, image_dim, image_dim};
+
+    // path::c_str() yields wchar_t on Windows, so convert to a narrow string
+    // explicitly instead of casting the pointer.
+    std::string input_image_path = (exe_dir / "Input.png").string();
+    loadInputImage(cpuInputFloat, input_image_path.data(), false);
+    std::vector<float> input_data(cpuInputFloat, cpuInputFloat + kNumElements);
+
+    cpu_input_tensors.push_back(Ort::Value::CreateTensor<float>(
+        cpu_allocator.GetInfo(), input_data.data(), input_data.size(), input_shape.data(), input_shape.size()));
+    cpu_output_tensors.push_back(Ort::Value::CreateTensor<float>(
+        cpu_allocator.GetInfo(), cpuOutputFloat, kNumElements, input_shape.data(), input_shape.size()));
+
+    for (size_t idx = 0; idx < num_inputs; ++idx) {
+      OrtAllocator* device_allocator = nullptr;
+
+      // NOTE(review): assumes the entry may be null when no EP device is
+      // associated with the input — confirm against the API documentation.
+      if (const OrtEpDevice* ep_device = session_ep_devices[idx]; ep_device != nullptr) {
+        const OrtHardwareDevice* hw_device = ortApi.EpDevice_Device(ep_device);
+        const auto vendor_id = ortApi.HardwareDevice_VendorId(hw_device);
+
+        OrtMemoryInfo* raw_memory_info = nullptr;
+        CHECK_ORT(ortApi.CreateMemoryInfo_V2("Input_Agnostic", OrtMemoryInfoDeviceType_GPU, /*vendor_id*/ vendor_id,
+                                             /*device_id*/ 0, OrtDeviceMemoryType_DEFAULT, /*default alignment*/ 0,
+                                             OrtArenaAllocator, &raw_memory_info));
+        device_memory_infos.emplace_back(raw_memory_info);  // takes ownership
+
+        const OrtMemoryInfo* mem_info = raw_memory_info;
+        const OrtDeviceMemoryType mem_type = ortApi.MemoryInfoGetDeviceMemType(mem_info);
+        OrtMemoryInfoDeviceType device_type = OrtMemoryInfoDeviceType_CPU;
+        ortApi.MemoryInfoGetDeviceType(mem_info, &device_type);
+
+        if (device_type == OrtMemoryInfoDeviceType_GPU && mem_type == OrtDeviceMemoryType_DEFAULT) {
+          CHECK_ORT(ortApi.GetSharedAllocator(ortEnvironment, mem_info, &device_allocator));
+        }
+      }
+
+      if (device_allocator != nullptr) {
+        // allocate new on-device memory
+        auto src_shape = cpu_input_tensors[idx].GetTensorTypeAndShapeInfo().GetShape();
+        Ort::Value device_input_value =
+            Ort::Value::CreateTensor<float>(device_allocator, src_shape.data(), src_shape.size());
+
+        auto dst_shape = cpu_output_tensors[idx].GetTensorTypeAndShapeInfo().GetShape();
+        Ort::Value device_output_value =
+            Ort::Value::CreateTensor<float>(device_allocator, dst_shape.data(), dst_shape.size());
+
+        src_tensor_ptrs.push_back(cpu_input_tensors[idx]);
+        dst_tensor_ptrs.push_back(device_input_value);
+        input_tensors.push_back(std::move(device_input_value));
+        output_tensors.push_back(std::move(device_output_value));
+      } else {
+        // No device allocator: run directly on the CPU tensors. They are moved,
+        // so cpu_input_tensors/cpu_output_tensors must not be used afterwards.
+        input_tensors.push_back(std::move(cpu_input_tensors[idx]));
+        output_tensors.push_back(std::move(cpu_output_tensors[idx]));
+      }
+    }
+
+    const bool use_device_tensors = !src_tensor_ptrs.empty();
+
+    Ort::IoBinding iobinding(session);
+    if (use_device_tensors) {
+      // Upload the input on the compute stream, then bind the device tensors.
+      CHECK_ORT(ortApi.CopyTensors(ortEnvironment, src_tensor_ptrs.data(), dst_tensor_ptrs.data(), stream,
+                                   src_tensor_ptrs.size()));
+      iobinding.BindInput(InputTensorName.get(), input_tensors[0]);
+      iobinding.BindOutput(OutputTensorName.get(), output_tensors[0]);
+      // iobinding.SynchronizeInputs();  // this doesn't actually require any
+      // bound inputs
+    }
+
+    Ort::RunOptions run_options;
+    run_options.AddConfigEntry(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "1");
+
+    // Each outer iteration does one run with default options followed by ten
+    // runs with EP synchronization disabled.
+    if (use_device_tensors) {
+      for (int i = 0; i < 10; i++) {
+        session.Run(Ort::RunOptions{}, iobinding);
+        for (int j = 0; j < 10; j++) {
+          session.Run(run_options, iobinding);
+        }
+      }
+    } else {
+      // Use the tensor names reported by the session rather than hard-coded ones.
+      const char* input_names[] = {InputTensorName.get()};
+      const char* output_names[] = {OutputTensorName.get()};
+      for (int i = 0; i < 10; i++) {
+        session.Run(Ort::RunOptions{}, input_names, input_tensors.data(), input_tensors.size(), output_names,
+                    output_tensors.data(), output_tensors.size());
+        for (int j = 0; j < 10; j++) {
+          session.Run(run_options, input_names, input_tensors.data(), input_tensors.size(), output_names,
+                      output_tensors.data(), output_tensors.size());
+        }
+      }
+    }
+
+    if (use_device_tensors) {
+      // Copy output from device to host. On the CPU path the output tensor
+      // already wraps cpuOutputFloat and cpu_output_tensors[0] has been moved
+      // from, so no copy is issued there.
+      // NOTE(review): the copy is issued on `stream`; if it completes
+      // asynchronously the buffer may need a synchronization before it is
+      // saved — confirm.
+      std::vector<const OrtValue*> output_src_tensor_ptrs = {output_tensors[0]};
+      std::vector<OrtValue*> output_dst_tensor_ptrs = {cpu_output_tensors[0]};
+      CHECK_ORT(
+          ortApi.CopyTensors(ortEnvironment, output_src_tensor_ptrs.data(), output_dst_tensor_ptrs.data(), stream, 1));
+    }
+
+    std::string output_image_path = (exe_dir / "output.png").string();
+    saveOutputImage(cpuOutputFloat, output_image_path.data(), false);
+  } catch (const std::exception& ex) {
+    LOG("Error: {}", ex.what());
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/utils.cpp b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/utils.cpp
new file mode 100644
index 000000000..93daba163
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/utils.cpp
@@ -0,0 +1,132 @@
+#include "utils.h"
+#include "half.hpp"
+#include "lodepng/lodepng.h"
+
+#ifdef _WIN32
+#include <windows.h>  // For GetModuleFileNameW
+#elif __APPLE__
+#include <limits.h>       // For PATH_MAX or similar
+#include <mach-o/dyld.h>  // For _NSGetExecutablePath
+#elif __linux__
+#include <limits.h>  // For PATH_MAX
+#include <unistd.h>  // For readlink
+#endif
+
+// Returns the directory that contains the running executable, i.e. the result
+// of get_executable_path() with its last path component removed.
+std::filesystem::path get_executable_parent_path() {
+  const std::filesystem::path executable = get_executable_path();
+  return executable.parent_path();
+}
+
+// Returns the path of the running executable.
+//
+// Windows: uses GetModuleFileNameW, growing the buffer until the path fits;
+//          on failure an error is printed to std::cerr and an empty path is
+//          returned.
+// macOS:   uses _NSGetExecutablePath and resolves symlinks with
+//          std::filesystem::canonical; an empty path is returned if the query
+//          still fails after the buffer was enlarged.
+// Linux:   resolves the /proc/self/exe symlink.
+// Other:   prints an error to std::cerr and returns an empty path.
+//
+// The std::filesystem calls on macOS and Linux throw
+// std::filesystem::filesystem_error on failure.
+std::filesystem::path get_executable_path() {
+#ifdef _WIN32
+  // Windows: Use GetModuleFileNameW for wide characters
+  std::vector<wchar_t> pathBuf(MAX_PATH);
+  DWORD length = GetModuleFileNameW(nullptr, pathBuf.data(),
+                                    static_cast<DWORD>(pathBuf.size()));
+
+  // A result equal to the buffer size means the path was truncated: retry
+  // with a larger buffer.
+  while (length == pathBuf.size()) {
+    pathBuf.resize(pathBuf.size() * 2);
+    length = GetModuleFileNameW(nullptr, pathBuf.data(),
+                                static_cast<DWORD>(pathBuf.size()));
+  }
+
+  if (length == 0) {
+    std::cerr << "Error: GetModuleFileNameW failed with error "
+              << GetLastError() << std::endl;
+    return {};
+  }
+  // Build the path from the reported length instead of relying on a terminator.
+  return std::filesystem::path(pathBuf.data(), pathBuf.data() + length);
+
+#elif __APPLE__
+  // macOS: Use _NSGetExecutablePath
+  std::vector<char> pathBuf(PATH_MAX);
+  uint32_t length = static_cast<uint32_t>(pathBuf.size());
+  if (_NSGetExecutablePath(pathBuf.data(), &length) != 0) {
+    // Buffer was too small; `length` now holds the required size.
+    pathBuf.resize(length + 1);  // +1 for null terminator
+    if (_NSGetExecutablePath(pathBuf.data(), &length) != 0) {
+      std::cerr << "Error: _NSGetExecutablePath failed" << std::endl;
+      return {};
+    }
+  }
+  return std::filesystem::canonical(
+      pathBuf.data());  // canonical to resolve symlinks
+
+#elif __linux__
+  // Linux: Use /proc/self/exe symlink
+  return std::filesystem::canonical(
+      std::filesystem::read_symlink("/proc/self/exe"));
+
+#else
+  // Unsupported platform: return an empty path rather than flowing off the end
+  // of a non-void function, which is undefined behavior.
+  std::cerr << "Error: get_executable_path is not implemented for this platform"
+            << std::endl;
+  return {};
+#endif
+}
+
+using half_float::half;
+constexpr int image_dim = 1080;
+
+// Decodes the PNG file `imageFileName` and writes it into `pData` as a planar
+// (channel-major) buffer with the planes ordered B, G, R. The alpha channel is
+// dropped and the 0..255 channel values are stored without scaling.
+//
+// pData          Destination for 3 * image_dim * image_dim elements, written
+//                as half_float::half when `fp16` is true and as float
+//                otherwise.
+// imageFileName  Path of the PNG file; it must be image_dim x image_dim pixels.
+// fp16           Selects the element type written to `pData`.
+//
+// If the file cannot be decoded or has the wrong size, a message is printed
+// and the process exits with EXIT_FAILURE.
+void loadInputImage(void* pData, char* imageFileName, bool fp16) {
+  unsigned char* image = nullptr;
+  uint32_t width = 0;
+  uint32_t height = 0;
+  const unsigned int error =
+      lodepng_decode32_file(&image, &width, &height, imageFileName);
+  if (error) {
+    printf("\nFailed to load the input image. Exiting\n");
+    exit(EXIT_FAILURE);  // a failed load must not be reported as success
+  }
+
+  if (width != static_cast<uint32_t>(image_dim) ||
+      height != static_cast<uint32_t>(image_dim)) {
+    printf("\nImage not of right size. Exiting\n");
+    free(image);
+    exit(EXIT_FAILURE);
+  }
+
+  half* hData = static_cast<half*>(pData);
+  float* fData = static_cast<float*>(pData);
+
+  // The decoded image is tightly packed RGBA, so pixel i of the row-major
+  // image starts at byte 4 * i.
+  const size_t planeSize = static_cast<size_t>(width) * height;
+  for (size_t pixel = 0; pixel < planeSize; ++pixel) {
+    const unsigned char* rgba = image + pixel * 4;
+    const unsigned char r = rgba[0];
+    const unsigned char g = rgba[1];
+    const unsigned char b = rgba[2];
+
+    if (fp16) {
+      hData[0 * planeSize + pixel] = static_cast<half>(b);
+      hData[1 * planeSize + pixel] = static_cast<half>(g);
+      hData[2 * planeSize + pixel] = static_cast<half>(r);
+    } else {
+      fData[0 * planeSize + pixel] = static_cast<float>(b);
+      fData[1 * planeSize + pixel] = static_cast<float>(g);
+      fData[2 * planeSize + pixel] = static_cast<float>(r);
+    }
+  }
+
+  free(image);
+}
+
+// Converts a float channel value to an 8-bit channel value: anything at or
+// below 0 (and NaN) becomes 0, anything at or above 255 becomes 255, and
+// values in between are truncated toward zero.
+unsigned char clampAndConvert(float val) {
+  // `!(val > 0)` is also true for NaN; converting NaN to an integer type
+  // directly would be undefined behavior.
+  if (!(val > 0))
+    return 0;
+  if (val >= 255)
+    return 255;
+  return static_cast<unsigned char>(val);
+}
+
+// Writes the planar B, G, R buffer in `pData` to the PNG file `imageFileName`
+// as an opaque RGBA image of image_dim x image_dim pixels.
+//
+// pData          Source of 3 * image_dim * image_dim elements, read as
+//                half_float::half when `fp16` is true and as float otherwise.
+// imageFileName  Path of the PNG file to write.
+// fp16           Selects the element type read from `pData`.
+//
+// Each channel value is clamped to 0..255 with clampAndConvert. If the file
+// cannot be written, a message with the lodepng error code is printed.
+void saveOutputImage(void* pData, char* imageFileName, bool fp16) {
+  const unsigned int width = image_dim;
+  const unsigned int height = image_dim;  // hardcoded in the model
+  const size_t planeSize = static_cast<size_t>(width) * height;
+
+  const half* hData = static_cast<const half*>(pData);
+  const float* fData = static_cast<const float*>(pData);
+
+  std::vector<unsigned char> image(planeSize * 4);
+  for (size_t pixel = 0; pixel < planeSize; ++pixel) {
+    float b = 0.0f;
+    float g = 0.0f;
+    float r = 0.0f;
+    if (fp16) {
+      b = static_cast<float>(hData[0 * planeSize + pixel]);
+      g = static_cast<float>(hData[1 * planeSize + pixel]);
+      r = static_cast<float>(hData[2 * planeSize + pixel]);
+    } else {
+      b = fData[0 * planeSize + pixel];
+      g = fData[1 * planeSize + pixel];
+      r = fData[2 * planeSize + pixel];
+    }
+
+    // Output pixels are interleaved RGBA with a fully opaque alpha channel.
+    unsigned char* rgba = image.data() + pixel * 4;
+    rgba[0] = clampAndConvert(r);
+    rgba[1] = clampAndConvert(g);
+    rgba[2] = clampAndConvert(b);
+    rgba[3] = 255;
+  }
+
+  const unsigned int error =
+      lodepng_encode32_file(imageFileName, image.data(), width, height);
+  if (error) {
+    // Report the failure instead of silently producing no output file.
+    printf("\nFailed to save the output image (lodepng error %u).\n", error);
+  }
+}
diff --git a/c_cxx/ort_tutorial/20_devicetensors-datatransfer/utils.h b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/utils.h
new file mode 100644
index 000000000..082031e1b
--- /dev/null
+++ b/c_cxx/ort_tutorial/20_devicetensors-datatransfer/utils.h
@@ -0,0 +1,29 @@
+#pragma once
+#include <filesystem>
+#include <format>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+// Decodes the PNG file `imageFileName` into `pData` as planar B, G, R data,
+// written as half-precision values when `fp16` is true and as float otherwise.
+void loadInputImage(void *pData, char *imageFileName, bool fp16);
+// Writes the planar B, G, R data in `pData` (half-precision when `fp16` is
+// true, float otherwise) to the PNG file `imageFileName`.
+void saveOutputImage(void *pData, char *imageFileName, bool fp16);
+
+// Builds a platform-specific shared-library file name from a string literal:
+// DLL_NAME("foo") is "foo.dll" on Windows and "libfoo.so" elsewhere.
+#define DLL_NAME(name) (DLL_PREFIX name DLL_SUFFIX)
+#if _WIN32
+#define DLL_PREFIX ""
+#define DLL_SUFFIX ".dll"
+#else
+#define DLL_PREFIX "lib"
+#define DLL_SUFFIX ".so"
+#endif
+// Prints a std::format-style message followed by a newline to std::cout.
+#define LOG(...) std::cout << std::format(__VA_ARGS__) << "\n"
+// Logs a std::format-style message and then throws it as std::runtime_error.
+// The expansion is a single braced statement so that the throw stays guarded
+// when the macro is used as the body of an unbraced `if`.
+#define THROW_ERROR(...)                                                       \
+  {                                                                            \
+    LOG(__VA_ARGS__);                                                          \
+    throw std::runtime_error(std::format(__VA_ARGS__));                        \
+  }
+// Evaluates `call` (an expression yielding an OrtStatus pointer) once. A
+// non-null status is treated as an error: its message is copied, the status is
+// released so it does not leak, and the message is logged and thrown as
+// std::runtime_error.
+#define CHECK_ORT(call)                                                        \
+  {                                                                            \
+    auto status = (call);                                                      \
+    if (status != nullptr) {                                                   \
+      const std::string ort_error_message =                                    \
+          Ort::GetApi().GetErrorMessage(status);                               \
+      Ort::GetApi().ReleaseStatus(status);                                     \
+      THROW_ERROR("{}", ort_error_message);                                    \
+    }                                                                          \
+  }
+
+// Returns the path of the currently running executable.
+std::filesystem::path get_executable_path();
+// Returns the directory that contains the currently running executable.
+std::filesystem::path get_executable_parent_path();
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/CMakeLists.txt b/c_cxx/ort_tutorial/30_syncstreams-cuda/CMakeLists.txt
new file mode 100644
index 000000000..e581a8cb8
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/CMakeLists.txt
@@ -0,0 +1,37 @@
+# Build script for the syncstreams_cuda sample executable.
+cmake_minimum_required(VERSION 3.20)
+project(winai-samples)
+# The CUDA toolkit is mandatory: the target links CUDA::cudart_static below.
+find_package(CUDAToolkit REQUIRED)
+
+
+# Load the shared onnxruntimesetup module from ../cmake. Presumably it provides
+# the onnxruntime_interface target, copy_file_to_bin_dir() and
+# RUNTIME_DIRECTORY used below - TODO confirm against that module.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
+include(onnxruntimesetup)
+
+# The sample is built from its own sources plus the vendored lodepng source.
+add_executable(syncstreams_cuda
+    main.cpp
+    lodepng/lodepng.cpp
+    utils.cpp
+    )
+
+# Build as strict C++20 without compiler-specific extensions.
+set_target_properties(syncstreams_cuda PROPERTIES
+    CXX_STANDARD 20
+    CXX_EXTENSIONS OFF
+    )
+target_link_libraries(syncstreams_cuda PRIVATE
+  onnxruntime_interface
+  CUDA::cudart_static
+)
+# NOTE(review): the vendored sources are listed under lodepng/ above; confirm
+# whether the "lode_png" include directory is intentional.
+target_include_directories(syncstreams_cuda PRIVATE
+    lode_png
+)
+
+# Model file used by the sample.
+set(ONNX "candy.onnx")
+
+# Make the model and the input image available to the built executable.
+copy_file_to_bin_dir(${ONNX})
+copy_file_to_bin_dir(Input.png)
+
+# Place every build artifact of the target in RUNTIME_DIRECTORY.
+set_target_properties(syncstreams_cuda
+    PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+    LIBRARY_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+    RUNTIME_OUTPUT_DIRECTORY "${RUNTIME_DIRECTORY}"
+)
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/Input.png b/c_cxx/ort_tutorial/30_syncstreams-cuda/Input.png
new file mode 100644
index 000000000..3d64ee0f2
Binary files /dev/null and b/c_cxx/ort_tutorial/30_syncstreams-cuda/Input.png differ
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/README.md b/c_cxx/ort_tutorial/30_syncstreams-cuda/README.md
new file mode 100644
index 000000000..939179bc1
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/README.md
@@ -0,0 +1,43 @@
+# SyncStream
+
+The latest revision of ORT provides a mechanism to synchronize multiple streams. Consider an example where upload work happens on one stream and inference happens on a different stream: we want the inference stream to wait until the upload stream has completed. This is now possible with the newly introduced ORT APIs.
+
+ORT introduces SyncStreams, and SyncNotifications created from SyncStreams, to synchronize between streams. Activating a notification object is similar to a Signal call, and the notification also provides WaitOnDevice and WaitOnHost APIs for waiting on other streams.  
+The following example highlights the synchronisation between streams:
+
+```cpp
+const OrtSyncStreamImpl* uploadStreamImpl;
+OrtSyncNotificationImpl* uploadNotification;
+OrtEpApi ortEpApi = *ortApi.GetEpApi();
+uploadStreamImpl = ortEpApi.SyncStream_GetImpl(upload_stream);
+uploadStreamImpl->CreateNotification(const_cast<OrtSyncStreamImpl*>(uploadStreamImpl), &uploadNotification);
+
+// This should now be a truly asynchronous copy because the source (cpuInputFloat) is pinned memory.
+std::vector<const OrtValue*> cpu_src_ptrs = { full_cpu_tensor };
+std::vector<OrtValue*> gpu_dst_ptrs = { full_gpu_tensor };
+ortApi.CopyTensors(ortEnvironment, cpu_src_ptrs.data(), gpu_dst_ptrs.data(), upload_stream, cpu_src_ptrs.size());
+
+uploadNotification->Activate(uploadNotification);
+uploadNotification->WaitOnDevice(uploadNotification, stream);
+
+// work on the inference stream
+void* cuda_compute_stream_handle = ortApi.SyncStream_GetHandle(stream);
+cudaMemcpy2DAsync(..., static_cast<cudaStream_t>(cuda_compute_stream_handle));
+
+input_tensors.push_back(std::move(inference_gpu_input_tensor));
+output_tensors.push_back(std::move(inference_gpu_output_tensor));
+Ort::IoBinding iobinding(session);
+iobinding.BindInput(InputTensorName.get(), input_tensors[0]);
+iobinding.BindOutput(OutputTensorName.get(), output_tensors[0]);
+
+std::vector<const char*> input_names = { "input" };
+std::vector<const char*> output_names = { "output" };
+session.Run(Ort::RunOptions{}, input_names.data(), input_tensors.data(), input_tensors.size(), output_names.data(), output_tensors.data(), output_tensors.size());
+
+```
+
+Note that SyncStream is currently only tested with CUDA streams (i.e. the CUDA EP and the TRT RTX EP). This sample application uses the TRT RTX EP.
+
+## Dependencies
+
+This sample vendors a copy of https://github.com/lvandeve/lodepng (Zlib license)
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/convert_to_fp16.py b/c_cxx/ort_tutorial/30_syncstreams-cuda/convert_to_fp16.py
new file mode 100644
index 000000000..ca2e28aa2
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/convert_to_fp16.py
@@ -0,0 +1,6 @@
+import onnx
+
+from onnxconverter_common import float16
+model = onnx.load("candy.onnx")
+model_fp16 = float16.convert_float_to_float16(model)
+onnx.save(model_fp16, "candy_fp16.onnx")
\ No newline at end of file
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/half.hpp b/c_cxx/ort_tutorial/30_syncstreams-cuda/half.hpp
new file mode 100644
index 000000000..d0a882dd6
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/half.hpp
@@ -0,0 +1,4601 @@
+// half - IEEE 754-based half-precision floating-point library.
+//
+// Copyright (c) 2012-2021 Christian Rau <rauy@users.sourceforge.net>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+// Version 2.2.0
+
+/// \file
+/// Main header file for half-precision functionality.
+
+#ifndef HALF_HALF_HPP
+#define HALF_HALF_HPP
+
+#define HALF_GCC_VERSION (__GNUC__*100+__GNUC_MINOR__)
+
+#if defined(__INTEL_COMPILER)
+	#define HALF_ICC_VERSION __INTEL_COMPILER
+#elif defined(__ICC)
+	#define HALF_ICC_VERSION __ICC
+#elif defined(__ICL)
+	#define HALF_ICC_VERSION __ICL
+#else
+	#define HALF_ICC_VERSION 0
+#endif
+
+// check C++11 language features
+#if defined(__clang__)										// clang
+	#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if __has_feature(cxx_thread_local) && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+#elif HALF_ICC_VERSION && defined(__INTEL_CXX11_MODE__)		// Intel C++
+	#if HALF_ICC_VERSION >= 1500 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if HALF_ICC_VERSION >= 1500 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if HALF_ICC_VERSION >= 1400 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if HALF_ICC_VERSION >= 1400 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if HALF_ICC_VERSION >= 1110 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if HALF_ICC_VERSION >= 1110 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+#elif defined(__GNUC__)										// gcc
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+		#if HALF_GCC_VERSION >= 408 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+			#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+		#endif
+		#if HALF_GCC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+			#define HALF_ENABLE_CPP11_USER_LITERALS 1
+		#endif
+		#if HALF_GCC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+			#define HALF_ENABLE_CPP11_CONSTEXPR 1
+		#endif
+		#if HALF_GCC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+			#define HALF_ENABLE_CPP11_NOEXCEPT 1
+		#endif
+		#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+			#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+		#endif
+		#if !defined(HALF_ENABLE_CPP11_LONG_LONG)
+			#define HALF_ENABLE_CPP11_LONG_LONG 1
+		#endif
+	#endif
+	#define HALF_TWOS_COMPLEMENT_INT 1
+#elif defined(_MSC_VER)										// Visual C++
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+	#define HALF_TWOS_COMPLEMENT_INT 1
+	#define HALF_POP_WARNINGS 1
+	#pragma warning(push)
+	#pragma warning(disable : 4099 4127 4146)	//struct vs class, constant in if, negative unsigned
+#endif
+
+// check C++11 library features
+#include <utility>
+#if defined(_LIBCPP_VERSION)								// libc++
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+		#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
+			#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CSTDINT
+			#define HALF_ENABLE_CPP11_CSTDINT 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CMATH
+			#define HALF_ENABLE_CPP11_CMATH 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_HASH
+			#define HALF_ENABLE_CPP11_HASH 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CFENV
+			#define HALF_ENABLE_CPP11_CFENV 1
+		#endif
+	#endif
+#elif defined(__GLIBCXX__)									// libstdc++
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+		#ifdef __clang__
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+				#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+				#define HALF_ENABLE_CPP11_CSTDINT 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH)
+				#define HALF_ENABLE_CPP11_CMATH 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH)
+				#define HALF_ENABLE_CPP11_HASH 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CFENV)
+				#define HALF_ENABLE_CPP11_CFENV 1
+			#endif
+		#else
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+				#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+				#define HALF_ENABLE_CPP11_CSTDINT 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH)
+				#define HALF_ENABLE_CPP11_CMATH 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH)
+				#define HALF_ENABLE_CPP11_HASH 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CFENV)
+				#define HALF_ENABLE_CPP11_CFENV 1
+			#endif
+		#endif
+	#endif
+#elif defined(_CPPLIB_VER)									// Dinkumware/Visual C++
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+		#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+	#endif
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+			#define HALF_ENABLE_CPP11_CSTDINT 1
+	#endif
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_HASH)
+		#define HALF_ENABLE_CPP11_HASH 1
+	#endif
+	#if _CPPLIB_VER >= 610 && !defined(HALF_ENABLE_CPP11_CMATH)
+		#define HALF_ENABLE_CPP11_CMATH 1
+	#endif
+	#if _CPPLIB_VER >= 610 && !defined(HALF_ENABLE_CPP11_CFENV)
+		#define HALF_ENABLE_CPP11_CFENV 1
+	#endif
+#endif
+#undef HALF_GCC_VERSION
+#undef HALF_ICC_VERSION
+
+// any error throwing C++ exceptions?
+#if defined(HALF_ERRHANDLING_THROW_INVALID) || defined(HALF_ERRHANDLING_THROW_DIVBYZERO) || defined(HALF_ERRHANDLING_THROW_OVERFLOW) || defined(HALF_ERRHANDLING_THROW_UNDERFLOW) || defined(HALF_ERRHANDLING_THROW_INEXACT)
+#define HALF_ERRHANDLING_THROWS 1
+#endif
+
+// any error handling enabled?
+#define HALF_ERRHANDLING	(HALF_ERRHANDLING_FLAGS||HALF_ERRHANDLING_ERRNO||HALF_ERRHANDLING_FENV||HALF_ERRHANDLING_THROWS)
+
+#if HALF_ERRHANDLING
+	#define HALF_UNUSED_NOERR(name) name
+#else
+	#define HALF_UNUSED_NOERR(name)
+#endif
+
+// support constexpr
+#if HALF_ENABLE_CPP11_CONSTEXPR
+	#define HALF_CONSTEXPR				constexpr
+	#define HALF_CONSTEXPR_CONST		constexpr
+	#if HALF_ERRHANDLING
+		#define HALF_CONSTEXPR_NOERR
+	#else
+		#define HALF_CONSTEXPR_NOERR	constexpr
+	#endif
+#else
+	#define HALF_CONSTEXPR
+	#define HALF_CONSTEXPR_CONST		const
+	#define HALF_CONSTEXPR_NOERR
+#endif
+
+// support noexcept
+#if HALF_ENABLE_CPP11_NOEXCEPT
+	#define HALF_NOEXCEPT	noexcept
+	#define HALF_NOTHROW	noexcept
+#else
+	#define HALF_NOEXCEPT
+	#define HALF_NOTHROW	throw()
+#endif
+
+// support thread storage
+#if HALF_ENABLE_CPP11_THREAD_LOCAL
+	#define HALF_THREAD_LOCAL	thread_local
+#else
+	#define HALF_THREAD_LOCAL	static
+#endif
+
+#include <utility>
+#include <algorithm>
+#include <istream>
+#include <ostream>
+#include <limits>
+#include <stdexcept>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <cstdlib>
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+	#include <type_traits>
+#endif
+#if HALF_ENABLE_CPP11_CSTDINT
+	#include <cstdint>
+#endif
+#if HALF_ERRHANDLING_ERRNO
+	#include <cerrno>
+#endif
+#if HALF_ENABLE_CPP11_CFENV
+	#include <cfenv>
+#endif
+#if HALF_ENABLE_CPP11_HASH
+	#include <functional>
+#endif
+
+
+#ifndef HALF_ENABLE_F16C_INTRINSICS
+	/// Enable F16C instruction set intrinsics.
+	/// Defining this to 1 enables the use of [F16C compiler intrinsics](https://en.wikipedia.org/wiki/F16C) for converting between 
+	/// half-precision and single-precision values which may result in improved performance. This will not perform additional checks 
+	/// for support of the F16C instruction set, so an appropriate target platform is required when enabling this feature.
+	///
+	/// Unless predefined it will be enabled automatically when the `__F16C__` symbol is defined, which some compilers do on supporting platforms.
+	#define HALF_ENABLE_F16C_INTRINSICS __F16C__
+#endif
+#if HALF_ENABLE_F16C_INTRINSICS
+	#include <immintrin.h>
+#endif
+
+#ifdef HALF_DOXYGEN_ONLY
+/// Type for internal floating-point computations.
+/// This can be predefined to a built-in floating-point type (`float`, `double` or `long double`) to override the internal 
+/// half-precision implementation to use this type for computing arithmetic operations and mathematical function (if available). 
+/// This can result in improved performance for arithmetic operators and mathematical functions but might cause results to 
+/// deviate from the specified half-precision rounding mode and inhibits proper detection of half-precision exceptions.
+#define HALF_ARITHMETIC_TYPE (undefined)
+
+/// Enable internal exception flags.
+/// Defining this to 1 causes operations on half-precision values to raise internal floating-point exception flags according to 
+/// the IEEE 754 standard. These can then be cleared and checked with clearexcept(), testexcept().
+#define HALF_ERRHANDLING_FLAGS	0
+
+/// Enable exception propagation to `errno`.
+/// Defining this to 1 causes operations on half-precision values to propagate floating-point exceptions to 
+/// [errno](https://en.cppreference.com/w/cpp/error/errno) from `<cerrno>`. Specifically this will propagate domain errors as 
+/// [EDOM](https://en.cppreference.com/w/cpp/error/errno_macros) and pole, overflow and underflow errors as 
+/// [ERANGE](https://en.cppreference.com/w/cpp/error/errno_macros). Inexact errors won't be propagated.
+#define HALF_ERRHANDLING_ERRNO	0
+
+/// Enable exception propagation to built-in floating-point platform.
+/// Defining this to 1 causes operations on half-precision values to propagate floating-point exceptions to the built-in 
+/// single- and double-precision implementation's exception flags using the 
+/// [C++11 floating-point environment control](https://en.cppreference.com/w/cpp/numeric/fenv) from `<cfenv>`. However, this 
+/// does not work in reverse and single- or double-precision exceptions will not raise the corresponding half-precision 
+/// exception flags, nor will explicitly clearing flags clear the corresponding built-in flags.
+#define HALF_ERRHANDLING_FENV	0
+
+/// Throw C++ exception on domain errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified message on domain errors.
+#define HALF_ERRHANDLING_THROW_INVALID		(undefined)
+
+/// Throw C++ exception on pole errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified message on pole errors.
+#define HALF_ERRHANDLING_THROW_DIVBYZERO	(undefined)
+
+/// Throw C++ exception on overflow errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::overflow_error](https://en.cppreference.com/w/cpp/error/overflow_error) with the specified message on overflows.
+#define HALF_ERRHANDLING_THROW_OVERFLOW		(undefined)
+
+/// Throw C++ exception on underflow errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::underflow_error](https://en.cppreference.com/w/cpp/error/underflow_error) with the specified message on underflows.
+#define HALF_ERRHANDLING_THROW_UNDERFLOW	(undefined)
+
+/// Throw C++ exception on rounding errors.
+/// Defining this to 1 causes operations on half-precision values to throw a 
+/// [std::range_error](https://en.cppreference.com/w/cpp/error/range_error) with the specified message on general rounding errors.
+#define HALF_ERRHANDLING_THROW_INEXACT		(undefined)
+#endif
+
+#ifndef HALF_ERRHANDLING_OVERFLOW_TO_INEXACT
+/// Raise INEXACT exception on overflow.
+/// Defining this to 1 (default) causes overflow errors to automatically raise inexact exceptions in addition.
+/// These will be raised after any possible handling of the overflow exception.
+#define HALF_ERRHANDLING_OVERFLOW_TO_INEXACT	1
+#endif
+
+#ifndef HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+/// Raise INEXACT exception on underflow.
+/// Defining this to 1 (default) causes underflow errors to automatically raise inexact exceptions in addition.
+/// These will be raised after any possible handling of the underflow exception.
+///
+/// **Note:** This will actually cause underflow (and the accompanying inexact) exceptions to be raised *only* when the result 
+/// is inexact, while if disabled bare underflow errors will be raised for *any* (possibly exact) subnormal result.
+#define HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT	1
+#endif
+
+/// Default rounding mode.
+/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and more precise types 
+/// (unless using half_cast() and specifying the rounding mode directly) as well as in arithmetic operations and mathematical 
+/// functions. It can be redefined (before including half.hpp) to one of the standard rounding modes using their respective 
+/// constants or the equivalent values of 
+/// [std::float_round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/float_round_style):
+///
+/// `std::float_round_style`         | value | rounding
+/// ---------------------------------|-------|-------------------------
+/// `std::round_indeterminate`       | -1    | fastest
+/// `std::round_toward_zero`         | 0     | toward zero
+/// `std::round_to_nearest`          | 1     | to nearest (default)
+/// `std::round_toward_infinity`     | 2     | toward positive infinity
+/// `std::round_toward_neg_infinity` | 3     | toward negative infinity
+///
+/// By default this is set to `1` (`std::round_to_nearest`), which rounds results to the nearest representable value. It can even 
+/// be set to [std::numeric_limits<float>::round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/round_style) to synchronize 
+/// the rounding mode with that of the built-in single-precision implementation (which is likely `std::round_to_nearest`, though).
+#ifndef HALF_ROUND_STYLE
+	#define HALF_ROUND_STYLE	1		// = std::round_to_nearest
+#endif
+
+/// Value signaling overflow.
+/// In correspondence with `HUGE_VAL[F|L]` from `<cmath>` this symbol expands to a positive value signaling the overflow of an 
+/// operation, in particular it just evaluates to positive infinity.
+///
+/// **See also:** Documentation for [HUGE_VAL](https://en.cppreference.com/w/cpp/numeric/math/HUGE_VAL)
+#define HUGE_VALH	std::numeric_limits<half_float::half>::infinity()
+
+/// Fast half-precision fma function.
+/// This symbol is defined if the fma() function generally executes as fast as, or faster than, a separate 
+/// half-precision multiplication followed by an addition, which is always the case.
+///
+/// **See also:** Documentation for [FP_FAST_FMA](https://en.cppreference.com/w/cpp/numeric/math/fma)
+#define FP_FAST_FMAH	1
+
+///	Half rounding mode.
+/// In correspondence with `FLT_ROUNDS` from `<cfloat>` this symbol expands to the rounding mode used for 
+/// half-precision operations. It is an alias for [HALF_ROUND_STYLE](\ref HALF_ROUND_STYLE).
+///
+/// **See also:** Documentation for [FLT_ROUNDS](https://en.cppreference.com/w/cpp/types/climits/FLT_ROUNDS)
+#define HLF_ROUNDS	HALF_ROUND_STYLE
+
+#ifndef FP_ILOGB0
+	#define FP_ILOGB0		INT_MIN
+#endif
+#ifndef FP_ILOGBNAN
+	#define FP_ILOGBNAN		INT_MAX
+#endif
+#ifndef FP_SUBNORMAL
+	#define FP_SUBNORMAL	0
+#endif
+#ifndef FP_ZERO
+	#define FP_ZERO			1
+#endif
+#ifndef FP_NAN
+	#define FP_NAN			2
+#endif
+#ifndef FP_INFINITE
+	#define FP_INFINITE		3
+#endif
+#ifndef FP_NORMAL
+	#define FP_NORMAL		4
+#endif
+
+#if !HALF_ENABLE_CPP11_CFENV && !defined(FE_ALL_EXCEPT)
+	#define FE_INVALID		0x10
+	#define FE_DIVBYZERO	0x08
+	#define FE_OVERFLOW		0x04
+	#define FE_UNDERFLOW	0x02
+	#define FE_INEXACT		0x01
+	#define FE_ALL_EXCEPT	(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT)
+#endif
+
+
+/// Main namespace for half-precision functionality.
+/// This namespace contains all the functionality provided by the library.
+namespace half_float
+{
+	class half;
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+	/// Library-defined half-precision literals.
+	/// Import this namespace to enable half-precision floating-point literals:
+	/// ~~~~{.cpp}
+	/// using namespace half_float::literal;
+	/// half_float::half = 4.2_h;
+	/// ~~~~
+	namespace literal
+	{
+		half operator "" _h(long double);
+	}
+#endif
+
+	/// \internal
+	/// \brief Implementation details.
+	namespace detail
+	{
+	#if HALF_ENABLE_CPP11_TYPE_TRAITS
+		/// Conditional type.
+		template<bool B,typename T,typename F> struct conditional : std::conditional<B,T,F> {};
+
+		/// Helper for tag dispatching.
+		template<bool B> struct bool_type : std::integral_constant<bool,B> {};
+		using std::true_type;
+		using std::false_type;
+
+		/// Type traits for floating-point types.
+		template<typename T> struct is_float : std::is_floating_point<T> {};
+	#else
+		/// Conditional type.
+		template<bool,typename T,typename> struct conditional { typedef T type; };
+		template<typename T,typename F> struct conditional<false,T,F> { typedef F type; };
+
+		/// Helper for tag dispatching.
+		template<bool> struct bool_type {};
+		typedef bool_type<true> true_type;
+		typedef bool_type<false> false_type;
+
+		/// Type traits for floating-point types.
+		template<typename> struct is_float : false_type {};
+		template<typename T> struct is_float<const T> : is_float<T> {};
+		template<typename T> struct is_float<volatile T> : is_float<T> {};
+		template<typename T> struct is_float<const volatile T> : is_float<T> {};
+		template<> struct is_float<float> : true_type {};
+		template<> struct is_float<double> : true_type {};
+		template<> struct is_float<long double> : true_type {};
+	#endif
+
+		/// Type traits for floating-point bits.
+		template<typename T> struct bits { typedef unsigned char type; };
+		template<typename T> struct bits<const T> : bits<T> {};
+		template<typename T> struct bits<volatile T> : bits<T> {};
+		template<typename T> struct bits<const volatile T> : bits<T> {};
+
+	#if HALF_ENABLE_CPP11_CSTDINT
+		/// Unsigned integer of (at least) 16 bits width.
+		typedef std::uint_least16_t uint16;
+
+		/// Fastest unsigned integer of (at least) 32 bits width.
+		typedef std::uint_fast32_t uint32;
+
+		/// Fastest signed integer of (at least) 32 bits width.
+		typedef std::int_fast32_t int32;
+
+		/// Unsigned integer of (at least) 32 bits width.
+		template<> struct bits<float> { typedef std::uint_least32_t type; };
+
+		/// Unsigned integer of (at least) 64 bits width.
+		template<> struct bits<double> { typedef std::uint_least64_t type; };
+	#else
+		/// Unsigned integer of (at least) 16 bits width.
+		typedef unsigned short uint16;
+
+		/// Fastest unsigned integer of (at least) 32 bits width.
+		typedef unsigned long uint32;
+
+		/// Fastest signed integer of (at least) 32 bits width.
+		typedef long int32;
+
+		/// Unsigned integer of (at least) 32 bits width.
+		template<> struct bits<float> : conditional<std::numeric_limits<unsigned int>::digits>=32,unsigned int,unsigned long> {};
+
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			/// Unsigned integer of (at least) 64 bits width.
+			template<> struct bits<double> : conditional<std::numeric_limits<unsigned long>::digits>=64,unsigned long,unsigned long long> {};
+		#else
+			/// Unsigned integer of (at least) 64 bits width.
+			template<> struct bits<double> { typedef unsigned long type; };
+		#endif
+	#endif
+
+	#ifdef HALF_ARITHMETIC_TYPE
+		/// Type to use for arithmetic computations and mathematic functions internally (only declared when HALF_ARITHMETIC_TYPE is defined).
+		typedef HALF_ARITHMETIC_TYPE internal_t;
+	#endif
+
+		/// Tag type for binary construction (empty; it only exists to select an overload).
+		struct binary_t {};
+
+		/// Tag for binary construction: a ready-made constant of type binary_t.
+		HALF_CONSTEXPR_CONST binary_t binary = binary_t();
+
+		/// \name Implementation defined classification and arithmetic
+		/// \{
+
+		/// Check for infinity via std::isinf, the MSVC `_finite`/`_isnan` pair, or a comparison against +/-infinity, depending on configuration.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if \a arg is positive or negative infinity
+		/// \retval false else
+		template<typename T> bool builtin_isinf(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::isinf(arg);
+		#elif defined(_MSC_VER)
+			return !::_finite(static_cast<double>(arg)) && !::_isnan(static_cast<double>(arg));	// neither finite nor NaN
+		#else
+			return arg == std::numeric_limits<T>::infinity() || arg == -std::numeric_limits<T>::infinity();
+		#endif
+		}
+
+		/// Check for NaN via std::isnan, MSVC `_isnan`, or a self-comparison, depending on configuration.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if not a number
+		/// \retval false else
+		template<typename T> bool builtin_isnan(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::isnan(arg);
+		#elif defined(_MSC_VER)
+			return ::_isnan(static_cast<double>(arg)) != 0;
+		#else
+			return arg != arg;	// a NaN compares unequal to everything, including itself
+		#endif
+		}
+
+		/// Check sign via std::signbit, or via comparisons against zero when C++11 <cmath> is not enabled.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if signbit set
+		/// \retval false else
+		template<typename T> bool builtin_signbit(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::signbit(arg);
+		#else
+			return arg < T() || (arg == T() && T(1)/arg < T());	// 1/arg tells a negative zero from a positive one; comparisons are all false for NaN
+		#endif
+		}
+
+		/// Platform-independent sign mask: spreads the most significant bit of \a arg over the whole result.
+		/// \param arg integer value in two's complement
+		/// \retval -1 if \a arg negative (all bits set, as the return type is unsigned)
+		/// \retval 0 if \a arg positive
+		inline uint32 sign_mask(uint32 arg)
+		{
+			static const int N = std::numeric_limits<uint32>::digits - 1;	// index of the most significant bit of uint32
+		#if HALF_TWOS_COMPLEMENT_INT
+			return static_cast<int32>(arg) >> N;	// relies on the signed right shift being arithmetic
+		#else
+			return -((arg>>N)&1);	// negate the isolated top bit in unsigned arithmetic
+		#endif
+		}
+
+		/// Platform-independent arithmetic right shift (a native signed shift, or an emulation when HALF_TWOS_COMPLEMENT_INT is off).
+		/// \param arg integer value in two's complement
+		/// \param i shift amount (at most 31)
+		/// \return \a arg right shifted for \a i bits with possible sign extension
+		inline uint32 arithmetic_shift(uint32 arg, int i)
+		{
+		#if HALF_TWOS_COMPLEMENT_INT
+			return static_cast<int32>(arg) >> i;	// relies on the signed right shift being arithmetic
+		#else
+			return static_cast<int32>(arg)/(static_cast<int32>(1)<<i) - ((arg>>(std::numeric_limits<uint32>::digits-1))&1);	// divide by 2^i, then subtract the top bit of arg
+		#endif
+		}
+
+		/// \}
+		/// \name Error handling
+		/// \{
+
+		/// Internal exception flags, kept in a function-local variable declared with HALF_THREAD_LOCAL and initialized to 0.
+		/// \return reference to global exception flags
+		inline int& errflags() { HALF_THREAD_LOCAL int flags = 0; return flags; }
+
+		/// Raise floating-point exception through every mechanism enabled by the HALF_ERRHANDLING_* macros; does nothing when HALF_ERRHANDLING is off.
+		/// \param flags exceptions to raise (bitwise or of FE_* values)
+		/// \param cond condition to raise exceptions for; nothing is raised when it is `false`
+		inline void raise(int HALF_UNUSED_NOERR(flags), bool HALF_UNUSED_NOERR(cond) = true)
+		{
+		#if HALF_ERRHANDLING
+			if(!cond)
+				return;
+		#if HALF_ERRHANDLING_FLAGS
+			errflags() |= flags;	// accumulate in the internal flags returned by errflags()
+		#endif
+		#if HALF_ERRHANDLING_ERRNO
+			if(flags & FE_INVALID)
+				errno = EDOM;	// FE_INVALID takes precedence over the range errors below
+			else if(flags & (FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW))
+				errno = ERANGE;
+		#endif
+		#if HALF_ERRHANDLING_FENV && HALF_ENABLE_CPP11_CFENV
+			std::feraiseexcept(flags);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_INVALID
+			if(flags & FE_INVALID)
+				throw std::domain_error(HALF_ERRHANDLING_THROW_INVALID);	// the macro value is used as the exception message
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_DIVBYZERO
+			if(flags & FE_DIVBYZERO)
+				throw std::domain_error(HALF_ERRHANDLING_THROW_DIVBYZERO);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_OVERFLOW
+			if(flags & FE_OVERFLOW)
+				throw std::overflow_error(HALF_ERRHANDLING_THROW_OVERFLOW);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_UNDERFLOW
+			if(flags & FE_UNDERFLOW)
+				throw std::underflow_error(HALF_ERRHANDLING_THROW_UNDERFLOW);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_INEXACT
+			if(flags & FE_INEXACT)
+				throw std::range_error(HALF_ERRHANDLING_THROW_INEXACT);
+		#endif
+		#if HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+			if((flags & FE_UNDERFLOW) && !(flags & FE_INEXACT))
+				raise(FE_INEXACT);	// recursive call: FE_INEXACT goes through all mechanisms above
+		#endif
+		#if HALF_ERRHANDLING_OVERFLOW_TO_INEXACT
+			if((flags & FE_OVERFLOW) && !(flags & FE_INEXACT))
+				raise(FE_INEXACT);	// recursive call: FE_INEXACT goes through all mechanisms above
+		#endif
+		#endif
+		}
+
+		/// Check and signal for any NaN; a value is a NaN when its bits without the sign (mask 0x7FFF) exceed 0x7C00.
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \retval true if either \a x or \a y is NaN
+		/// \retval false else
+		/// \exception FE_INVALID if \a x or \a y is NaN
+		inline HALF_CONSTEXPR_NOERR bool compsignal(unsigned int x, unsigned int y)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_INVALID, (x&0x7FFF)>0x7C00 || (y&0x7FFF)>0x7C00);	// same condition as the returned value
+		#endif
+			return (x&0x7FFF) > 0x7C00 || (y&0x7FFF) > 0x7C00;
+		}
+
+		/// Signal and silence signaling NaN; bit 0x200 is what distinguishes a quiet NaN from a signaling one.
+		/// \param nan half-precision NaN value
+		/// \return quiet NaN (\a nan with bit 0x200 set)
+		/// \exception FE_INVALID if \a nan is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int nan)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_INVALID, !(nan&0x200));	// raised only when the quiet bit is clear
+		#endif
+			return nan | 0x200;
+		}
+
+		/// Signal and silence signaling NaNs (two operands); the result is derived from \a x if it is a NaN, otherwise from \a y.
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \return quiet NaN
+		/// \exception FE_INVALID if \a x or \a y is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_INVALID, ((x&0x7FFF)>0x7C00 && !(x&0x200)) || ((y&0x7FFF)>0x7C00 && !(y&0x200)));	// NaN with the quiet bit 0x200 clear
+		#endif
+			return ((x&0x7FFF)>0x7C00) ? (x|0x200) : (y|0x200);	// y is not re-checked for being a NaN here
+		}
+
+		/// Signal and silence signaling NaNs (three operands); the result is derived from the first NaN among \a x and \a y, otherwise from \a z.
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \param z third half-precision value to check
+		/// \return quiet NaN
+		/// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y, unsigned int z)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_INVALID, ((x&0x7FFF)>0x7C00 && !(x&0x200)) || ((y&0x7FFF)>0x7C00 && !(y&0x200)) || ((z&0x7FFF)>0x7C00 && !(z&0x200)));	// NaN with the quiet bit 0x200 clear
+		#endif
+			return ((x&0x7FFF)>0x7C00) ? (x|0x200) : ((y&0x7FFF)>0x7C00) ? (y|0x200) : (z|0x200);	// z is not re-checked for being a NaN here
+		}
+
+		/// Select value or signaling NaN; without HALF_ERRHANDLING \a y is never inspected and \a x is always returned.
+		/// \param x preferred half-precision value
+		/// \param y ignored half-precision value except for signaling NaN
+		/// \return quieted \a y (via signal()) if it is a signaling NaN, \a x otherwise
+		/// \exception FE_INVALID if \a y is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int select(unsigned int x, unsigned int HALF_UNUSED_NOERR(y))
+		{
+		#if HALF_ERRHANDLING
+			return (((y&0x7FFF)>0x7C00) && !(y&0x200)) ? signal(y) : x;	// NaN with the quiet bit 0x200 clear
+		#else
+			return x;
+		#endif
+		}
+
+		/// Raise domain error and return NaN.
+		/// \return quiet NaN (bit pattern 0x7FFF)
+		/// \exception FE_INVALID
+		inline HALF_CONSTEXPR_NOERR unsigned int invalid()
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_INVALID);
+		#endif
+			return 0x7FFF;
+		}
+
+		/// Raise pole error and return infinity (0x7C00 combined with the given sign bit).
+		/// \param sign half-precision value with sign bit only
+		/// \return half-precision infinity with sign of \a sign
+		/// \exception FE_DIVBYZERO
+		inline HALF_CONSTEXPR_NOERR unsigned int pole(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_DIVBYZERO);
+		#endif
+			return sign | 0x7C00;
+		}
+
+		/// Check value for underflow; the check is compiled out when HALF_ERRHANDLING is off or HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT is on.
+		/// \param arg non-zero half-precision value to check
+		/// \return \a arg
+		/// \exception FE_UNDERFLOW if arg is subnormal
+		inline HALF_CONSTEXPR_NOERR unsigned int check_underflow(unsigned int arg)
+		{
+		#if HALF_ERRHANDLING && !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+			raise(FE_UNDERFLOW, !(arg&0x7C00));	// all exponent bits clear means subnormal (arg is documented as non-zero)
+		#endif
+			return arg;
+		}
+
+		/// \}
+		/// \name Conversion and rounding
+		/// \{
+
+		/// Half-precision overflow: yields infinity or the largest finite value (0x7BFF), whichever the rounding mode dictates for the given sign.
+		/// \tparam R rounding mode to use
+		/// \param sign half-precision value with sign bit only
+		/// \return rounded overflowing half-precision value
+		/// \exception FE_OVERFLOW
+		template<std::float_round_style R> HALF_CONSTEXPR_NOERR unsigned int overflow(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_OVERFLOW);
+		#endif
+			return	(R==std::round_toward_infinity) ? (sign+0x7C00-(sign>>15)) :	// 0x7C00 for positive, 0xFBFF for negative
+					(R==std::round_toward_neg_infinity) ? (sign+0x7BFF+(sign>>15)) :	// 0x7BFF for positive, 0xFC00 for negative
+					(R==std::round_toward_zero) ? (sign|0x7BFF) :	// largest finite magnitude
+					(sign|0x7C00);	// infinity for the remaining rounding modes
+		}
+
+		/// Half-precision underflow: yields a signed zero or the smallest subnormal (mantissa 1), whichever the rounding mode dictates for the given sign.
+		/// \tparam R rounding mode to use
+		/// \param sign half-precision value with sign bit only
+		/// \return rounded underflowing half-precision value
+		/// \exception FE_UNDERFLOW
+		template<std::float_round_style R> HALF_CONSTEXPR_NOERR unsigned int underflow(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			raise(FE_UNDERFLOW);
+		#endif
+			return	(R==std::round_toward_infinity) ? (sign+1-(sign>>15)) :	// 0x0001 for positive, 0x8000 for negative
+					(R==std::round_toward_neg_infinity) ? (sign+(sign>>15)) :	// 0x0000 for positive, 0x8001 for negative
+					sign;	// signed zero for the remaining rounding modes
+		}
+
+		/// Round half-precision number by adding 0 or 1 to its bit pattern, depending on rounding mode, sign, guard bit and sticky bit.
+		/// \tparam R rounding mode to use
+		/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results
+		/// \param value finite half-precision number to round
+		/// \param g guard bit (most significant discarded bit)
+		/// \param s sticky bit (or of all but the most significant discarded bits)
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded or \a I is `true`
+		template<std::float_round_style R,bool I> HALF_CONSTEXPR_NOERR unsigned int rounded(unsigned int value, int g, int s)
+		{
+		#if HALF_ERRHANDLING
+			value +=	(R==std::round_to_nearest) ? (g&(s|value)) :	// round up above the halfway point, or on a tie when the last kept bit is odd
+						(R==std::round_toward_infinity) ? (~(value>>15)&(g|s)) :	// positive and any discarded bit set
+						(R==std::round_toward_neg_infinity) ? ((value>>15)&(g|s)) : 0;	// negative and any discarded bit set
+			if((value&0x7C00) == 0x7C00)	// exponent field all ones after rounding
+				raise(FE_OVERFLOW);
+			else if(value & 0x7C00)	// non-zero exponent field
+				raise(FE_INEXACT, I || (g|s)!=0);
+			else	// zero exponent field
+				raise(FE_UNDERFLOW, !(HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT) || I || (g|s)!=0);
+			return value;
+		#else
+			return	(R==std::round_to_nearest) ? (value+(g&(s|value))) :	// round up above the halfway point, or on a tie when the last kept bit is odd
+					(R==std::round_toward_infinity) ? (value+(~(value>>15)&(g|s))) :	// positive and any discarded bit set
+					(R==std::round_toward_neg_infinity) ? (value+((value>>15)&(g|s))) :	// negative and any discarded bit set
+					value;	// remaining rounding modes truncate
+		#endif
+		}
+
+		/// Round half-precision number to nearest integer value, working directly on the bit pattern.
+		/// \tparam R rounding mode to use
+		/// \tparam E `true` for round to even, `false` for round away from zero
+		/// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it
+		/// \param value half-precision value to round
+		/// \return half-precision bits for nearest integral value
+		/// \exception FE_INVALID for signaling NaN
+		/// \exception FE_INEXACT if value had to be rounded and \a I is `true`
+		template<std::float_round_style R,bool E,bool I> unsigned int integral(unsigned int value)
+		{
+			unsigned int abs = value & 0x7FFF;	// bit pattern without the sign
+			if(abs < 0x3C00)	// magnitude below 1.0 (0x3C00): result is 0 or 1 with the original sign
+			{
+				raise(FE_INEXACT, I);	// NOTE(review): this branch is also taken for +/-0, so INEXACT is raised for zero when I is true - confirm intended
+				return ((R==std::round_to_nearest) ? (0x3C00&-static_cast<unsigned>(abs>=(0x3800+E))) :	// 1.0 from 0.5 (0x3800) upwards, or only above 0.5 when E
+						(R==std::round_toward_infinity) ? (0x3C00&-(~(value>>15)&(abs!=0))) :	// 1.0 for positive non-zero input
+						(R==std::round_toward_neg_infinity) ? (0x3C00&-static_cast<unsigned>(value>0x8000)) :	// 1.0 (sign added below) for negative non-zero input
+						0) | (value&0x8000);
+			}
+			if(abs >= 0x6400)	// no fractional mantissa bits left from here on, or infinity/NaN
+				return (abs>0x7C00) ? signal(value) : value;
+			unsigned int exp = 25 - (abs>>10), mask = (1<<exp) - 1;	// exp: number of fractional mantissa bits, mask: those bits
+			raise(FE_INEXACT, I && (value&mask));
+			return ((	(R==std::round_to_nearest) ? ((1<<(exp-1))-(~(value>>exp)&E)) :	// add one half, minus one on E when the integer part is even
+						(R==std::round_toward_infinity) ? (mask&((value>>15)-1)) :	// add mask for positive values only
+						(R==std::round_toward_neg_infinity) ? (mask&-(value>>15)) :	// add mask for negative values only
+						0) + value) & ~mask;	// finally clear the fractional bits
+		}
+
+		/// Convert fixed point to half-precision floating-point, rounding through rounded<R,I>().
+		/// \tparam R rounding mode to use
+		/// \tparam F number of fractional bits in [11,31]
+		/// \tparam S `true` for signed, `false` for unsigned
+		/// \tparam N `true` for additional normalization step, `false` if already normalized to 1.F
+		/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results
+		/// \param m mantissa in Q1.F fixed point format
+		/// \param exp biased exponent - 1
+		/// \param sign half-precision value with sign bit only (overwritten from the sign of \a m when \a S is `true`)
+		/// \param s sticky bit (or of all but the most significant already discarded bits)
+		/// \return value converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded or \a I is `true`
+		template<std::float_round_style R,unsigned int F,bool S,bool N,bool I> unsigned int fixed2half(uint32 m, int exp = 14, unsigned int sign = 0, int s = 0)
+		{
+			if(S)
+			{
+				uint32 msign = sign_mask(m);	// all ones if m is negative, 0 otherwise
+				m = (m^msign) - msign;	// two's complement absolute value
+				sign = msign & 0x8000;
+			}
+			if(N)
+				for(; m<(static_cast<uint32>(1)<<F) && exp; m<<=1,--exp) ;	// shift left until bit F is set or exp reaches 0
+			else if(exp < 0)
+				return rounded<R,I>(sign+(m>>(F-10-exp)), (m>>(F-11-exp))&1, s|((m&((static_cast<uint32>(1)<<(F-11-exp))-1))!=0));	// negative exp: shift right by -exp more bits, no exponent field
+			return rounded<R,I>(sign+(exp<<10)+(m>>(F-10)), (m>>(F-11))&1, s|((m&((static_cast<uint32>(1)<<(F-11))-1))!=0));	// keep the top 11 bits, next bit is guard, rest goes into sticky
+		}
+
+		/// Convert IEEE single-precision to half-precision (F16C intrinsic if enabled, else branch-based bit manipulation; the table-based variant is compiled out by `#if 1`).
+		/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+		/// \tparam R rounding mode to use
+		/// \param value single-precision value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int float2half_impl(float value, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			return _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(value),
+				(R==std::round_to_nearest) ? _MM_FROUND_TO_NEAREST_INT :
+				(R==std::round_toward_zero) ? _MM_FROUND_TO_ZERO :
+				(R==std::round_toward_infinity) ? _MM_FROUND_TO_POS_INF :
+				(R==std::round_toward_neg_infinity) ? _MM_FROUND_TO_NEG_INF :
+				_MM_FROUND_CUR_DIRECTION));
+		#else
+			bits<float>::type fbits;
+			std::memcpy(&fbits, &value, sizeof(float));	// copy out the bit pattern of the float
+		#if 1
+			unsigned int sign = (fbits>>16) & 0x8000;	// move the float sign bit (bit 31) to the half sign bit (bit 15)
+			fbits &= 0x7FFFFFFF;
+			if(fbits >= 0x7F800000)	// infinity or NaN; a NaN is quieted and keeps its top 10 mantissa bits
+				return sign | 0x7C00 | ((fbits>0x7F800000) ? (0x200|((fbits>>13)&0x3FF)) : 0);
+			if(fbits >= 0x47800000)	// magnitude of at least 2^16
+				return overflow<R>(sign);
+			if(fbits >= 0x38800000)	// magnitude of at least 2^-14: rebias the exponent (127 -> 15) and keep 10 mantissa bits
+				return rounded<R,false>(sign|(((fbits>>23)-112)<<10)|((fbits>>13)&0x3FF), (fbits>>12)&1, (fbits&0xFFF)!=0);
+			if(fbits >= 0x33000000)	// magnitude of at least 2^-25: no exponent field, mantissa shifted right instead
+			{
+				int i = 125 - (fbits>>23);	// position of the guard bit
+				fbits = (fbits&0x7FFFFF) | 0x800000;	// mantissa with the implicit leading one made explicit
+				return rounded<R,false>(sign|(fbits>>(i+1)), (fbits>>i)&1, (fbits&((static_cast<uint32>(1)<<i)-1))!=0);
+			}
+			if(fbits != 0)	// non-zero but below 2^-25
+				return underflow<R>(sign);
+			return sign;	// signed zero
+		#else
+			static const uint16 base_table[512] = {
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 
+				0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 
+				0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7C00, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 
+				0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 
+				0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFC00 };
+			static const unsigned char shift_table[256] = {
+				24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
+				13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };
+			int sexp = fbits >> 23, exp = sexp & 0xFF, i = shift_table[exp];	// sexp: sign and exponent (index into base_table), i: mantissa shift
+			fbits &= 0x7FFFFF;
+			uint32 m = (fbits|((exp!=0)<<23)) & -static_cast<uint32>(exp!=0xFF);	// mantissa with implicit bit, zeroed for exponent 0xFF
+			return rounded<R,false>(base_table[sexp]+(fbits>>i), (m>>(i-1))&1, (((static_cast<uint32>(1)<<(i-1))-1)&m)!=0);
+		#endif
+		#endif
+		}
+
+		/// Convert IEEE double-precision to half-precision by classifying on the upper 32 bits of the bit pattern.
+		/// \tparam R rounding mode to use
+		/// \param value double-precision value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int float2half_impl(double value, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			if(R == std::round_indeterminate)	// the intrinsic path is only taken for indeterminate rounding
+				return _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_cvtpd_ps(_mm_set_sd(value)), _MM_FROUND_CUR_DIRECTION));
+		#endif
+			bits<double>::type dbits;
+			std::memcpy(&dbits, &value, sizeof(double));	// copy out the bit pattern of the double
+			uint32 hi = dbits >> 32, lo = dbits & 0xFFFFFFFF;	// upper and lower 32 bits
+			unsigned int sign = (hi>>16) & 0x8000;	// move the sign bit (bit 31 of hi) to the half sign bit (bit 15)
+			hi &= 0x7FFFFFFF;
+			if(hi >= 0x7FF00000)	// infinity or NaN; a NaN is quieted and keeps its top 10 mantissa bits
+				return sign | 0x7C00 | ((dbits&0xFFFFFFFFFFFFF) ? (0x200|((hi>>10)&0x3FF)) : 0);
+			if(hi >= 0x40F00000)	// magnitude of at least 2^16
+				return overflow<R>(sign);
+			if(hi >= 0x3F100000)	// magnitude of at least 2^-14: rebias the exponent (1023 -> 15) and keep 10 mantissa bits
+				return rounded<R,false>(sign|(((hi>>20)-1008)<<10)|((hi>>10)&0x3FF), (hi>>9)&1, ((hi&0x1FF)|lo)!=0);
+			if(hi >= 0x3E600000)	// magnitude of at least 2^-25: no exponent field, mantissa shifted right instead
+			{
+				int i = 1018 - (hi>>20);	// position of the guard bit within hi
+				hi = (hi&0xFFFFF) | 0x100000;	// upper mantissa bits with the implicit leading one made explicit
+				return rounded<R,false>(sign|(hi>>(i+1)), (hi>>i)&1, ((hi&((static_cast<uint32>(1)<<i)-1))|lo)!=0);
+			}
+			if((hi|lo) != 0)	// non-zero but below 2^-25
+				return underflow<R>(sign);
+			return sign;	// signed zero
+		}
+
+		/// Convert non-IEEE floating-point to half-precision using only <cmath> operations (frexp, ldexp, modf) instead of bit access.
+		/// \tparam R rounding mode to use
+		/// \tparam T source type (builtin floating-point type)
+		/// \param value floating-point value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int float2half_impl(T value, ...)
+		{
+			unsigned int hbits = static_cast<unsigned>(builtin_signbit(value)) << 15;	// start with the sign bit only
+			if(value == T())
+				return hbits;	// signed zero
+			if(builtin_isnan(value))
+				return hbits | 0x7FFF;	// NaN with all mantissa bits set
+			if(builtin_isinf(value))
+				return hbits | 0x7C00;	// infinity
+			int exp;
+			std::frexp(value, &exp);	// only the exponent is used; the returned fraction is discarded
+			if(exp > 16)
+				return overflow<R>(hbits);
+			if(exp < -13)
+				value = std::ldexp(value, 25);	// too small for a normal half: fixed scale, exponent field stays 0
+			else
+			{
+				value = std::ldexp(value, 12-exp);	// scale so that the integer part has 12 bits
+				hbits |= ((exp+13)<<10);	// the leading bit of the mantissa added below carries into this field
+			}
+			T ival, frac = std::modf(value, &ival);	// split into integer and fractional part
+			int m = std::abs(static_cast<int>(ival));
+			return rounded<R,false>(hbits+(m>>1), m&1, frac!=T());	// lowest integer bit is the guard bit, a non-zero fraction the sticky bit
+		}
+
+		/// Convert floating-point to half-precision, dispatching on a tag to one of the float2half_impl overloads.
+		/// \tparam R rounding mode to use
+		/// \tparam T source type (builtin floating-point type)
+		/// \param value floating-point value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int float2half(T value)
+		{
+			return float2half_impl<R>(value, bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)>());	// true tag only if T is IEEE and bits<T>::type has the same size as T
+		}
+
+		/// Convert integer to half-precision floating-point.
+		/// \tparam R rounding mode to use
+		/// \tparam T type to convert (builtin integer type)
+		/// \param value integral value to convert; magnitudes above 0xFFFF are reported as overflow
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int int2half(T value)
+		{
+			unsigned int bits = static_cast<unsigned>(value<0) << 15;	// sign bit
+			if(!value)
+				return bits;
+			if(bits)
+				value = -value;	// NOTE(review): negating the most negative value of a signed T can overflow - confirm callers' value range
+			if(value > 0xFFFF)
+				return overflow<R>(bits);
+			unsigned int m = static_cast<unsigned int>(value), exp = 24;
+			for(; m<0x400; m<<=1,--exp) ;	// shift small values up until bit 10 is the leading bit
+			for(; m>0x7FF; m>>=1,++exp) ;	// shift large values down until bit 10 is the leading bit, dropping low bits
+			bits |= (exp<<10) + m;	// the leading bit of m (0x400) carries into the exponent field
+			return (exp>24) ? rounded<R,false>(bits, (value>>(exp-25))&1, (((1<<(exp-25))-1)&value)!=0) : bits;	// round only if bits were dropped (exp>24)
+		}
+
+		/// Convert half-precision to IEEE single-precision.
+		/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+		/// \param value half-precision value to convert
+		/// \return single-precision value
+		inline float half2float_impl(unsigned int value, float, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(value)));
+		#else
+		#if 0
+			bits<float>::type fbits = static_cast<bits<float>::type>(value&0x8000) << 16;
+			int abs = value & 0x7FFF;
+			if(abs)
+			{
+				fbits |= 0x38000000 << static_cast<unsigned>(abs>=0x7C00);
+				for(; abs<0x400; abs<<=1,fbits-=0x800000) ;
+				fbits += static_cast<bits<float>::type>(abs) << 13;
+			}
+		#else
+			static const bits<float>::type mantissa_table[2048] = {
+				0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, 
+				0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 
+				0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 
+				0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 
+				0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, 
+				0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, 
+				0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 
+				0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 
+				0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, 
+				0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, 
+				0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 
+				0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 
+				0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 
+				0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, 
+				0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, 
+				0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 
+				0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 
+				0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, 
+				0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, 
+				0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 
+				0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 
+				0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 
+				0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, 
+				0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, 
+				0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 
+				0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 
+				0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, 
+				0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, 
+				0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 
+				0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 
+				0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 
+				0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, 
+				0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, 
+				0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, 
+				0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 
+				0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, 
+				0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, 
+				0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 
+				0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 
+				0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 
+				0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, 
+				0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, 
+				0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 
+				0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 
+				0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, 
+				0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, 
+				0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 
+				0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 
+				0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 
+				0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, 
+				0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, 
+				0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 
+				0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 
+				0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, 
+				0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, 
+				0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 
+				0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 
+				0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 
+				0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, 
+				0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, 
+				0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 
+				0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 
+				0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, 
+				0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, 
+				0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 
+				0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 
+				0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 
+				0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, 
+				0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, 
+				0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 
+				0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 
+				0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, 
+				0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, 
+				0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 
+				0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 
+				0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 
+				0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, 
+				0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, 
+				0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 
+				0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 
+				0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, 
+				0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, 
+				0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 
+				0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 
+				0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 
+				0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, 
+				0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, 
+				0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 
+				0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 
+				0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, 
+				0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, 
+				0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 
+				0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 
+				0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 
+				0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, 
+				0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, 
+				0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 
+				0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 
+				0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, 
+				0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, 
+				0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 
+				0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 
+				0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 
+				0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, 
+				0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, 
+				0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 
+				0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 
+				0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, 
+				0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, 
+				0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 
+				0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 
+				0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 
+				0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, 
+				0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, 
+				0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 
+				0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 
+				0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, 
+				0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, 
+				0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 
+				0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 
+				0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 
+				0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, 
+				0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, 
+				0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 
+				0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 
+				0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, 
+				0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, 
+				0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };
+			static const bits<float>::type exponent_table[64] = {
+				0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, 
+				0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 
+				0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 
+				0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };
+			static const unsigned short offset_table[64] = {
+				0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 
+				0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };
+			bits<float>::type fbits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];
+		#endif
+			float out;
+			std::memcpy(&out, &fbits, sizeof(float));
+			return out;
+		#endif
+		}
+
+		/// Widen a half-precision bit pattern to an IEEE double-precision value.
+		/// \param value bit pattern of the half-precision number to convert
+		/// \return the same number as a double-precision value
+		inline double half2float_impl(unsigned int value, double, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			return _mm_cvtsd_f64(_mm_cvtps_pd(_mm_cvtph_ps(_mm_cvtsi32_si128(value))));
+		#else
+			uint32 upper = static_cast<uint32>(value&0x8000) << 16;	// sign bit moves from bit 15 to bit 31 of the high word
+			unsigned int magnitude = value & 0x7FFF;
+			if(magnitude != 0)
+			{
+				upper |= 0x3F000000 << static_cast<unsigned>(magnitude>=0x7C00);	// exponent rebias (1023-15), doubled for Inf/NaN so the exponent field ends up all ones
+				for(; magnitude<0x400; magnitude<<=1,upper-=0x100000) ;	// normalize subnormals: one exponent step per shift
+				upper += static_cast<uint32>(magnitude) << 10;	// exponent and mantissa land in the high word's field positions
+			}
+			const bits<double>::type pattern = static_cast<bits<double>::type>(upper) << 32;	// low 32 mantissa bits stay zero
+			double result;
+			std::memcpy(&result, &pattern, sizeof(double));
+			return result;
+		#endif
+		}
+
+		/// Convert half-precision to non-IEEE floating-point, using only std::ldexp and std::numeric_limits.
+		/// \tparam T type to convert to (builtin floating-point type)
+		/// \param value half-precision value to convert (bit pattern; bit 15 is the sign)
+		/// \return floating-point value
+		template<typename T> T half2float_impl(unsigned int value, T, ...)
+		{
+			T out;
+			unsigned int abs = value & 0x7FFF; // magnitude bits without the sign
+			if(abs > 0x7C00) // NaN: exponent all ones with a non-zero mantissa
+				out = (std::numeric_limits<T>::has_signaling_NaN && !(abs&0x200)) ? std::numeric_limits<T>::signaling_NaN() : // mantissa bit 0x200 clear selects a signaling NaN when T has one
+					std::numeric_limits<T>::has_quiet_NaN ? std::numeric_limits<T>::quiet_NaN() : T(); // T() is the fallback when T has no quiet NaN
+			else if(abs == 0x7C00) // infinity
+				out = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max(); // saturates to max() when T has no infinity
+			else if(abs > 0x3FF) // normal number: 11-bit significand with the implicit bit restored
+				out = std::ldexp(static_cast<T>((abs&0x3FF)|0x400), (abs>>10)-25);
+			else // zero or subnormal: mantissa scaled by the fixed exponent 2^-24
+				out = std::ldexp(static_cast<T>(abs), -24);
+			return (value&0x8000) ? -out : out; // apply the sign last
+		}
+
+		/// Convert half-precision to floating-point by dispatching to a matching half2float_impl overload.
+		/// \tparam T type to convert to (builtin floating-point type)
+		/// \param value half-precision value to convert
+		/// \return floating-point value
+		template<typename T> T half2float(unsigned int value)
+		{
+			return half2float_impl(value, T(), bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)>()); // tag is set when T is IEC 559 and bits<T>::type has T's size; presumably this selects the true_type overloads, otherwise the generic one - confirm against bool_type
+		}
+
+		/// Convert half-precision floating-point to integer.
+		/// \tparam R rounding mode to use
+		/// \tparam E `true` for round to even, `false` for round away from zero
+		/// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it
+		/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
+		/// \param value half-precision value to convert
+		/// \return rounded integer value
+		/// \exception FE_INVALID if value is not representable in type \a T
+		/// \exception FE_INEXACT if value had to be rounded and \a I is `true`
+		template<std::float_round_style R,bool E,bool I,typename T> T half2int(unsigned int value)
+		{
+			unsigned int abs = value & 0x7FFF; // magnitude bits without the sign
+			if(abs >= 0x7C00) // infinity or NaN: no integer equivalent
+			{
+				raise(FE_INVALID);
+				return (value&0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max(); // saturate according to the sign bit
+			}
+			if(abs < 0x3800) // magnitude below 0.5: result is 0, or +/-1 for the directed modes
+			{
+				raise(FE_INEXACT, I); // NOTE(review): this branch is also taken for +/-0 where the result is exact - confirm how raise() treats its second argument
+				return	(R==std::round_toward_infinity) ? T(~(value>>15)&(abs!=0)) : // 1 for a positive non-zero input, else 0
+						(R==std::round_toward_neg_infinity) ? -T(value>0x8000) : // -1 for a negative non-zero input, else 0
+						T();
+			}
+			int exp = 25 - (abs>>10); // number of significand bits below the binary point (<= 0 means shift left instead)
+			unsigned int m = (value&0x3FF) | 0x400; // 11-bit significand with the implicit leading bit
+			int32 i = static_cast<int32>((exp<=0) ? (m<<-exp) : ((m+( // add a rounding increment, then drop the fractional bits
+				(R==std::round_to_nearest) ? ((1<<(exp-1))-(~(m>>exp)&E)) : // half an ulp, reduced by one when E is set and the integer part is even (ties go to even)
+				(R==std::round_toward_infinity) ? (((1<<exp)-1)&((value>>15)-1)) : // all fractional ones, masked in only for positive inputs
+				(R==std::round_toward_neg_infinity) ? (((1<<exp)-1)&-(value>>15)) : 0))>>exp)); // all fractional ones, masked in only for negative inputs; otherwise truncate
+			if((!std::numeric_limits<T>::is_signed && (value&0x8000)) || (std::numeric_limits<T>::digits<16 && // negative input for an unsigned T, or ...
+				((value&0x8000) ? (-i<std::numeric_limits<T>::min()) : (i>std::numeric_limits<T>::max())))) // ... result outside the range of a T with fewer than 16 digits
+				raise(FE_INVALID);
+			else if(I && exp > 0 && (m&((1<<exp)-1))) // non-zero fractional bits were discarded
+				raise(FE_INEXACT);
+			return static_cast<T>((value&0x8000) ? -i : i); // apply the sign last
+		}
+
+		/// \}
+		/// \name Mathematics
+		/// \{
+
+		/// Upper part of 64-bit multiplication, built from 16-bit partial products.
+		/// \tparam R rounding mode to use
+		/// \param x first factor
+		/// \param y second factor
+		/// \return upper 32 bit of \a x * \a y
+		template<std::float_round_style R> uint32 mulhi(uint32 x, uint32 y)
+		{
+			uint32 xy = (x>>16) * (y&0xFFFF), yx = (x&0xFFFF) * (y>>16), c = (xy&0xFFFF) + (yx&0xFFFF) + (((x&0xFFFF)*(y&0xFFFF))>>16); // xy, yx: cross products; c: their low halves plus the high half of the low*low product
+			return (x>>16)*(y>>16) + (xy>>16) + (yx>>16) + (c>>16) + // high*high product, upper halves of the cross products and the carry out of c
+				((R==std::round_to_nearest) ? ((c>>15)&1) : (R==std::round_toward_infinity) ? ((c&0xFFFF)!=0) : 0); // rounding increment taken from c only; the lowest 16 bits of the low*low product are not inspected
+		}
+
+		/// Rounded upper half of a 32 x 32 bit product.
+		/// \param x first factor
+		/// \param y second factor
+		/// \return upper 32 bit of \a x * \a y rounded to nearest
+		inline uint32 multiply64(uint32 x, uint32 y)
+		{
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			return static_cast<uint32>((static_cast<unsigned long long>(y)*x+0x80000000ULL)>>32);	// adding half of the discarded low word rounds to nearest
+		#else
+			return mulhi<std::round_to_nearest>(x, y);
+		#endif
+		}
+
+		/// Divide a 64-bit dividend, given by its upper word, by a 32-bit divisor.
+		/// \param x upper 32 bit of dividend
+		/// \param y divisor
+		/// \param s receives the sticky bit for rounding (non-zero when a remainder was left)
+		/// \return (\a x << 32) / \a y
+		inline uint32 divide64(uint32 x, uint32 y, int &s)
+		{
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			const unsigned long long dividend = static_cast<unsigned long long>(x) << 32;
+			return s = (dividend%y!=0), static_cast<uint32>(dividend/y);
+		#else
+			y >>= 1;
+			uint32 residue = x, quot = 0;
+			for(unsigned int step=0; step<32; ++step)	// one quotient bit per iteration, most significant first
+			{
+				quot <<= 1;
+				if(residue >= y)
+				{
+					residue -= y;
+					quot |= 1;
+				}
+				residue <<= 1;
+			}
+			return s = residue > 1, quot;
+		#endif
+		}
+
+		/// Half precision positive modulus, computed by binary long division on the significands.
+		/// \tparam Q `true` to compute full quotient, `false` else
+		/// \tparam R `true` to compute signed remainder, `false` for positive remainder
+		/// \param x first operand as positive finite half-precision value
+		/// \param y second operand as positive finite half-precision value
+		/// \param quo address to store quotient at, `nullptr` if \a Q `false`
+		/// \return modulus of \a x / \a y
+		template<bool Q,bool R> unsigned int mod(unsigned int x, unsigned int y, int *quo = NULL)
+		{
+			unsigned int q = 0; // quotient bits, only accumulated when Q is set
+			if(x > y) // reduction is only needed when x exceeds y
+			{
+				int absx = x, absy = y, expx = 0, expy = 0;
+				for(; absx<0x400; absx<<=1,--expx) ; // normalize a subnormal x, counting the shifts as negative exponent
+				for(; absy<0x400; absy<<=1,--expy) ; // same for y
+				expx += absx >> 10; // add the biased exponent field
+				expy += absy >> 10;
+				int mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400; // 11-bit significands with the leading bit set
+				for(int d=expx-expy; d; --d) // one division step per unit of exponent difference
+				{
+					if(!Q && mx == my) // exact multiple and no quotient wanted: remainder is zero
+						return 0;
+					if(mx >= my)
+					{
+						mx -= my;
+						q += Q; // record a one bit in the quotient (no-op when Q is false)
+					}
+					mx <<= 1;
+					q <<= static_cast<int>(Q); // make room for the next quotient bit (no-op when Q is false)
+				}
+				if(!Q && mx == my)
+					return 0;
+				if(mx >= my) // final step at equal exponents
+				{
+					mx -= my;
+					++q;
+				}
+				if(Q)
+				{
+					q &= (1<<(std::numeric_limits<int>::digits-1)) - 1; // keep only the low digits-1 bits of the quotient
+					if(!mx) // zero remainder: store the quotient and return +0
+						return *quo = q, 0;
+				}
+				for(; mx<0x400; mx<<=1,--expy) ; // renormalize the remainder
+				x = (expy>0) ? ((expy<<10)|(mx&0x3FF)) : (mx>>(1-expy)); // repack as a normal number, or shift down into a subnormal
+			}
+			if(R) // signed remainder: compare the positive remainder with y/2
+			{
+				unsigned int a, b;
+				if(y < 0x800) // exponent field of y is 0 or 1: compare 2*x against y instead of halving y
+				{
+					a = (x<0x400) ? (x<<1) : (x+0x400); // double x: shift a subnormal, bump the exponent of a normal
+					b = y;
+				}
+				else
+				{
+					a = x;
+					b = y - 0x400; // y/2 by decrementing the exponent field
+				}
+				if(a > b || (a == b && (q&1))) // above the halfway point, or exactly halfway with an odd quotient
+				{
+					int exp = (y>>10) + (y<=0x3FF), d = exp - (x>>10) - (x<=0x3FF); // effective exponents (subnormals count as 1) and their difference
+					int m = (((y&0x3FF)|((y>0x3FF)<<10))<<1) - (((x&0x3FF)|((x>0x3FF)<<10))<<(1-d)); // y - x on aligned significands, with one extra low bit
+					for(; m<0x800 && exp>1; m<<=1,--exp) ; // normalize the difference without going below exponent 1
+					x = 0x8000 + ((exp-1)<<10) + (m>>1); // repack with the sign bit set: the remainder becomes negative
+					q += Q; // quotient moves up by one accordingly
+				}
+			}
+			if(Q)
+				*quo = q;
+			return x;
+		}
+
+		/// Bit-by-bit fixed point square root.
+		/// \tparam F number of fractional bits
+		/// \param r radicand in Q1.F fixed point format, overwritten with the remainder left after the extraction
+		/// \param exp exponent, made even and halved on return
+		/// \return square root as Q1.F/2
+		template<unsigned int F> uint32 sqrt(uint32 &r, int &exp)
+		{
+			const int odd = exp & 1;
+			r <<= odd;	// fold an odd exponent into the radicand
+			exp = (exp-odd) / 2;
+			uint32 root = 0;
+			for(uint32 probe=static_cast<uint32>(1)<<F; probe; probe>>=2)
+			{
+				if(r >= root+probe)
+				{
+					r -= root + probe;
+					root = (root>>1) + probe;
+				}
+				else
+					root >>= 1;
+			}
+			return root;
+		}
+
+		/// Fixed point base-2 exponential.
+		/// BKM algorithm in E-mode: table entry i is log2(1 + 2^-i) as Q0.31.
+		/// \param m exponent in [0,1) as Q0.31
+		/// \param n number of iterations (at most 32, the table size)
+		/// \return 2 ^ \a m as Q1.31
+		inline uint32 exp2(uint32 m, unsigned int n = 32)
+		{
+			static const uint32 logs[] = {
+				0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, 0x016FE50B,
+				0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, 0x0002E2A3, 0x00017153,
+				0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, 0x000005C5, 0x000002E3, 0x00000171,
+				0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, 0x0000000C, 0x00000006, 0x00000003, 0x00000001 };
+			if(m == 0)
+				return 0x80000000;	// 2^0 = 1.0 in Q1.31
+			uint32 power = 0x80000000, consumed = 0;
+			for(unsigned int step=1; step<n; ++step)
+			{
+				const uint32 attempt = consumed + logs[step];
+				if(attempt <= m)	// this logarithm still fits into the exponent
+				{
+					consumed = attempt;
+					power += power >> step;	// multiply by (1 + 2^-step)
+				}
+			}
+			return power;
+		}
+
+		/// Fixed point base-2 logarithm.
+		/// BKM algorithm in L-mode: table entry i is log2(1 + 2^-i) as Q0.31.
+		/// \param m mantissa in [1,2) as Q1.30
+		/// \param n number of iterations (at most 32, the table size)
+		/// \return log2(\a m) as Q0.31
+		inline uint32 log2(uint32 m, unsigned int n = 32)
+		{
+			static const uint32 logs[] = {
+				0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, 0x016FE50B,
+				0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, 0x0002E2A3, 0x00017153,
+				0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, 0x000005C5, 0x000002E3, 0x00000171,
+				0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, 0x0000000C, 0x00000006, 0x00000003, 0x00000001 };
+			if(m == 0x40000000)
+				return 0;	// log2(1.0) = 0
+			uint32 product = 0x40000000, logsum = 0;
+			for(unsigned int step=1; step<n; ++step)
+			{
+				const uint32 scaled = product + (product>>step);	// product * (1 + 2^-step)
+				if(scaled <= m)	// the factor still fits below the mantissa
+				{
+					product = scaled;
+					logsum += logs[step];
+				}
+			}
+			return logsum;
+		}
+
+		/// Fixed point sine and cosine.
+		/// This uses the CORDIC algorithm in rotation mode.
+		/// \param mz angle in [-pi/2,pi/2] as Q1.30
+		/// \param n number of iterations (at most 31, the size of the angle table)
+		/// \return sine (first) and cosine (second) of \a mz as Q1.30
+		inline std::pair<uint32,uint32> sincos(uint32 mz, unsigned int n = 31)
+		{
+			static const uint32 angles[] = { // rotation angle per step; the first entry is pi/4 in Q1.30, the entries look like atan(2^-i)
+				0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, 0x007FFF55,
+				0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000,
+				0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100, 0x00000080,
+				0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001 };
+			uint32 mx = 0x26DD3B6A, my = 0; // start vector (0x26DD3B6A / 2^30 ~ 0.60725, 0); presumably the CORDIC gain compensation
+			for(unsigned int i=0; i<n; ++i)
+			{
+				uint32 sign = sign_mask(mz); // presumably all ones when the residual angle is negative, else zero - confirm against sign_mask
+				uint32 tx = mx - (arithmetic_shift(my, i)^sign) + sign; // (v^sign)-sign negates v when sign is all ones; here it is subtracted from mx
+				uint32 ty = my + (arithmetic_shift(mx, i)^sign) - sign;
+				mx = tx; my = ty; mz -= (angles[i]^sign) - sign; // step the residual angle towards zero
+			}
+			return std::make_pair(my, mx);
+		}
+
+		/// Arc tangent in fixed point.
+		/// Implemented with CORDIC in vectoring mode: the y coordinate is driven towards zero while the
+		/// applied table angles are accumulated.
+		/// \param my y coordinate as Q0.30
+		/// \param mx x coordinate as Q0.30
+		/// \param n number of iterations (at most 31)
+		/// \return arc tangent of \a my / \a mx as Q1.30
+		inline uint32 atan2(uint32 my, uint32 mx, unsigned int n = 31)
+		{
+			// atan(2^-k) as Q1.30 for k = 0..30
+			static const uint32 angles[] = {
+				0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, 0x007FFF55,
+				0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000,
+				0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100, 0x00000080,
+				0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001 };
+			uint32 angle = 0;
+			for(unsigned int step=0; step<n; ++step)
+			{
+				// mask derived from the sign of the current y; (v^mask)-mask conditionally negates v
+				uint32 negMask = sign_mask(my);
+				uint32 nextX = mx + (arithmetic_shift(my, step)^negMask) - negMask;
+				uint32 nextY = my - (arithmetic_shift(mx, step)^negMask) + negMask;
+				angle += (angles[step]^negMask) - negMask;
+				mx = nextX;
+				my = nextY;
+			}
+			return angle;
+		}
+
+		/// Reduce argument for trigonometric functions.
+		/// Inputs whose bit pattern is below 0x3A48 (0.78515625, just under pi/4) are returned unreduced with \a k set
+		/// to 0. Larger inputs are multiplied by a fixed-point constant (0xA2F9836E4E442, the leading hex digits of 2/pi),
+		/// split into an integer count stored in \a k and a signed remainder, and the remainder is scaled by 0xC90FDAA2
+		/// (the leading hex digits of pi). Two equivalent code paths exist, depending on 64-bit integer support.
+		/// \param abs half-precision floating-point value
+		/// \param k value to take quarter period
+		/// \return \a abs reduced to [-pi/4,pi/4] as Q0.30
+		inline uint32 angle_arg(unsigned int abs, int &k)
+		{
+			// mantissa bits, with the implicit leading one restored for normalized inputs
+			uint32 m = (abs&0x3FF) | ((abs>0x3FF)<<10);
+			// unbiased exponent; subnormal inputs are treated as having exponent field 1
+			int exp = (abs>>10) + (abs<=0x3FF) - 15;
+			// small arguments need no reduction
+			if(abs < 0x3A48)
+				return k = 0, m << (exp+20);
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			// 64-bit path: y is the product, yi its value rounded to a multiple of (mask+1), f the signed difference
+			unsigned long long y = m * 0xA2F9836E4E442, mask = (1ULL<<(62-exp)) - 1, yi = (y+(mask>>1)) & ~mask, f = y - yi;
+			// all-ones if the remainder is negative (top bit of f set), zero otherwise
+			uint32 sign = -static_cast<uint32>(f>>63);
+			k = static_cast<int>(yi>>(62-exp));
+			// scale |f| and then restore its sign via (v^sign)-sign
+			return (multiply64(static_cast<uint32>((sign ? -f : f)>>(31-exp)), 0xC90FDAA2)^sign) - sign;
+		#else
+			// 32-bit path: the same product is kept as a high word yh and a low word yl
+			uint32 yh = m*0xA2F98 + mulhi<std::round_toward_zero>(m, 0x36E4E442), yl = (m*0x36E4E442) & 0xFFFFFFFF;
+			// the remainder is negative exactly when rounding moved yi above yh
+			uint32 mask = (static_cast<uint32>(1)<<(30-exp)) - 1, yi = (yh+(mask>>1)) & ~mask, sign = -static_cast<uint32>(yi>yh);
+			k = static_cast<int>(yi>>(30-exp));
+			// absolute value of the two-word remainder
+			uint32 fh = (yh^sign) + (yi^~sign) - ~sign, fl = (yl^sign) - sign;
+			return (multiply64((exp>-1) ? (((fh<<(1+exp))&0xFFFFFFFF)|((fl&0xFFFFFFFF)>>(31-exp))) : fh, 0xC90FDAA2)^sign) - sign;
+		#endif
+		}
+
+		/// Get arguments for atan2 function.
+		/// Produces the pair (\a abs, sqrt(1 - \a abs^2)) in a common fixed-point scale, so that both values can be
+		/// handed to the fixed-point atan2 (presumably to evaluate asin/acos - confirm against the callers).
+		/// \param abs half-precision floating-point value
+		/// \return \a abs and sqrt(1 - \a abs^2) as Q0.30
+		inline std::pair<uint32,uint32> atan2_args(unsigned int abs)
+		{
+			// normalize subnormal inputs: shift until bit 10 is set, adjusting the exponent accordingly
+			int exp = -15;
+			for(; abs<0x400; abs<<=1,--exp) ;
+			exp += abs >> 10;
+			// my: 11-bit mantissa moved up by 5 bits; r: its square
+			uint32 my = ((abs&0x3FF)|0x400) << 5, r = my * my;
+			int rexp = 2 * exp;
+			// r = 1 - abs^2; the squared term is shifted into place with a sticky bit for the discarded bits
+			r = 0x40000000 - ((rexp>-31) ? ((r>>-rexp)|((r&((static_cast<uint32>(1)<<-rexp)-1))!=0)) : 1);
+			// renormalize r so that bit 30 is set, tracking the shift in rexp
+			for(rexp=0; r<0x40000000; r<<=1,--rexp) ;
+			uint32 mx = sqrt<30>(r, rexp);
+			// d is the exponent difference between abs and the square root; the smaller value is shifted down
+			int d = exp - rexp;
+			if(d < 0)
+				return std::make_pair((d<-14) ? ((my>>(-d-14))+((my>>(-d-15))&1)) : (my<<(14+d)), (mx<<14)+(r<<13)/mx);
+			if(d > 0)
+				return std::make_pair(my<<14, (d>14) ? ((mx>>(d-14))+((mx>>(d-15))&1)) : ((d==14) ? mx : ((mx<<(14-d))+(r<<(13-d))/mx)));
+			return std::make_pair(my<<13, (mx<<13)+(r<<12)/mx);
+		}
+
+		/// Get exponentials for hyperbolic computation.
+		/// The argument is first multiplied by 0xB8AA3B29 (log2(e) as Q1.31), split into an integer part, which is
+		/// stored in \a exp, and a fractional part, which is passed to the fixed-point exp2.
+		/// \param abs half-precision floating-point value
+		/// \param exp variable to take unbiased exponent of larger result
+		/// \param n number of BKM iterations (at most 32)
+		/// \return exp(\a abs) and exp(-\a abs) as Q1.31 with same exponent
+		inline std::pair<uint32,uint32> hyperbolic_args(unsigned int abs, int &exp, unsigned int n = 32)
+		{
+			// mantissa (with implicit bit for normalized inputs) times log2(e)
+			uint32 mx = detail::multiply64(static_cast<uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29), my;
+			// biased exponent; subnormal inputs are treated as having exponent field 1
+			int e = (abs>>10) + (abs<=0x3FF);
+			if(e < 14)
+			{
+				// product is below one: no integer part, only shift the fraction into place
+				exp = 0;
+				mx >>= 14 - e;
+			}
+			else
+			{
+				// separate integer part (-> exp) from the fractional part (-> mx)
+				exp = mx >> (45-e);
+				mx = (mx<<(e-14)) & 0x7FFFFFFF;
+			}
+			mx = exp2(mx, n);
+			// d: shift needed to express the smaller exponential with the exponent of the larger one
+			int d = exp << 1, s;
+			if(mx > 0x80000000)
+			{
+				// reciprocal of the mantissa, with the division's sticky flag folded into the lowest bit
+				my = divide64(0x80000000, mx, s);
+				my |= s;
+				++d;
+			}
+			else
+				my = mx;
+			// shift the smaller value down, keeping a sticky bit for everything shifted out
+			return std::make_pair(mx, (d<31) ? ((my>>d)|((my&((static_cast<uint32>(1)<<d)-1))!=0)) : 1);
+		}
+
+		/// Postprocessing for binary exponential.
+		/// Takes an exponent already split into integer and fractional parts, evaluates 2^fraction in fixed point and
+		/// assembles the half-precision result, handling overflow and underflow of the integer part up front.
+		/// \tparam R rounding mode to use
+		/// \param m fractional part of the exponent as Q0.31
+		/// \param exp absolute value of unbiased exponent
+		/// \param esign sign of actual exponent
+		/// \param sign sign bit of result
+		/// \param n number of BKM iterations (at most 32)
+		/// \return value converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		///            (NOTE(review): the previous text also mentioned a flag `I`, which is not a parameter of this function)
+		template<std::float_round_style R> unsigned int exp2_post(uint32 m, int exp, bool esign, unsigned int sign = 0, unsigned int n = 32)
+		{
+			if(esign)
+			{
+				// negative exponent: negate, and go one further down if there is a fractional part
+				exp = -exp - (m!=0);
+				if(exp < -25)
+					return underflow<R>(sign);
+				else if(exp == -25)
+					return rounded<R,false>(sign, 1, m!=0);
+			}
+			else if(exp > 15)
+				return overflow<R>(sign);
+			// no fractional part: the result is an exact power of two, built directly (normal or subnormal)
+			if(!m)
+				return sign | (((exp+=15)>0) ? (exp<<10) : check_underflow(0x200>>-exp));
+			m = exp2(m, n);
+			int s = 0;
+			// for negative exponents the reciprocal of 2^fraction is used
+			if(esign)
+				m = divide64(0x80000000, m, s);
+			return fixed2half<R,31,false,false,true>(m, exp+14, sign, s);
+		}
+
+		/// Postprocessing for binary logarithm.
+		/// Combines integer and fractional part of a binary logarithm into one magnitude, normalizes it, divides it by
+		/// the base-transformation constant \a L and converts the quotient to half-precision.
+		/// \tparam R rounding mode to use
+		/// \tparam L logarithm for base transformation as Q1.31
+		/// \param m fractional part of logarithm as Q0.31
+		/// \param ilog signed integer part of logarithm
+		/// \param exp biased exponent of result
+		/// \param sign sign bit of result
+		/// \return value base-transformed and converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,uint32 L> unsigned int log2_post(uint32 m, int ilog, int exp, unsigned int sign = 0)
+		{
+			// mask derived from the sign of the integer part
+			uint32 msign = sign_mask(ilog);
+			// merge integer part (upper bits) and fraction (lower bits), then take the absolute value
+			m = (((static_cast<uint32>(ilog)<<27)+(m>>4))^msign) - msign;
+			// a logarithm of exactly zero yields positive zero
+			if(!m)
+				return 0;
+			// normalize so that the top bit is set
+			for(; m<0x80000000; m<<=1,--exp) ;
+			// pre-shift the dividend by one more bit if it is not smaller than the divisor
+			int i = m >= L, s;
+			exp += i;
+			m >>= 1 + i;
+			// a negative logarithm flips the requested sign bit
+			sign ^= msign & 0x8000;
+			if(exp < -11)
+				return underflow<R>(sign);
+			m = divide64(m, L, s);
+			// the sticky argument is fixed to 1 here; the flag s returned by divide64 is not used
+			return fixed2half<R,30,false,false,true>(m, exp, sign, 1);
+		}
+
+		/// Square root and final conversion for the hypotenuse functions.
+		/// A mantissa that has overflowed into the top bit is first renormalized; afterwards the range of the exponent
+		/// is checked and the fixed-point square root is converted to half-precision.
+		/// \tparam R rounding mode to use
+		/// \param r mantissa as Q2.30
+		/// \param exp biased exponent
+		/// \return square root converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int hypot_post(uint32 r, int exp)
+		{
+			// 1 if the mantissa occupies the top bit, 0 otherwise
+			int carry = r >> 31;
+			exp += carry;
+			if(exp > 46)
+				return overflow<R>();
+			if(exp < -34)
+				return underflow<R>();
+			// shift down by the carry, keeping the dropped bit as sticky bit
+			r = (r>>carry) | (r&carry);
+			exp += 15;
+			uint32 root = sqrt<30>(r, exp);
+			return fixed2half<R,15,false,false,false>(root, exp-1, 0, r!=0);
+		}
+
+		/// Final division and conversion for the tangent functions.
+		/// The dividend is pre-shifted depending on whether it reaches the divisor, the exponent range is checked and
+		/// the fixed-point quotient is converted to half-precision.
+		/// \tparam R rounding mode to use
+		/// \param my dividend as Q1.31
+		/// \param mx divisor as Q1.31
+		/// \param exp biased exponent of result
+		/// \param sign sign bit of result
+		/// \return quotient converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R> unsigned int tangent_post(uint32 my, uint32 mx, int exp, unsigned int sign = 0)
+		{
+			// 1 if the dividend is not smaller than the divisor, 0 otherwise
+			int carry = my >= mx;
+			exp += carry;
+			if(exp > 29)
+				return overflow<R>(sign);
+			if(exp < -11)
+				return underflow<R>(sign);
+			// sticky flag, filled in by the division
+			int sticky;
+			uint32 quotient = divide64(my>>(carry+1), mx, sticky);
+			return fixed2half<R,30,false,false,true>(quotient, exp, sign, sticky);
+		}
+
+		/// Area function and postprocessing.
+		/// This computes the value directly in Q2.30 using the representation `asinh|acosh(x) = log(x+sqrt(x^2+|-1))`.
+		/// The steps are: square the mantissa, add (asinh) or subtract (acosh) one, take the square root, add the
+		/// argument itself and finally take the logarithm, which is converted by log2_post.
+		/// \tparam R rounding mode to use
+		/// \tparam S `true` for asinh, `false` for acosh
+		/// \param arg half-precision argument
+		/// \return asinh|acosh(\a arg) converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,bool S> unsigned int area(unsigned int arg)
+		{
+			// expx: unbiased exponent of |arg| (subnormals use exponent field 1); expy: exponent after normalization
+			int abs = arg & 0x7FFF, expx = (abs>>10) + (abs<=0x3FF) - 15, expy = -15, ilog, i;
+			// mx: mantissa of |arg| (implicit bit only for normalized inputs) moved up by 20 bits
+			uint32 mx = static_cast<uint32>((abs&0x3FF)|((abs>0x3FF)<<10)) << 20, my, r;
+			// normalize subnormal inputs
+			for(; abs<0x400; abs<<=1,--expy) ;
+			expy += abs >> 10;
+			// r = squared mantissa, renormalized if the square reaches the top bit
+			r = ((abs&0x3FF)|0x400) << 5;
+			r *= r;
+			i = r >> 31;
+			expy = 2*expy + i;
+			r >>= i;
+			if(S)
+			{
+				// asinh: r = x^2 + 1
+				if(expy < 0)
+				{
+					// x^2 < 1: align x^2 to the one, keeping a sticky bit for the bits shifted out
+					r = 0x40000000 + ((expy>-30) ? ((r>>-expy)|((r&((static_cast<uint32>(1)<<-expy)-1))!=0)) : 1);
+					expy = 0;
+				}
+				else
+				{
+					// x^2 >= 1: align the one to x^2 and renormalize on carry
+					r += 0x40000000 >> expy;
+					i = r >> 31;
+					r = (r>>i) | (r&i);
+					expy += i;
+				}
+			}
+			else
+			{
+				// acosh: r = x^2 - 1, renormalized after cancellation
+				r -= 0x40000000 >> expy;
+				for(; r<0x40000000; r<<=1,--expy) ;
+			}
+			my = sqrt<30>(r, expy);
+			// extend the root using the remainder left in r
+			my = (my<<15) + (r<<14)/my;
+			// bring x and the square root to a common exponent, which becomes the integer part of the logarithm
+			if(S)
+			{
+				mx >>= expy - expx;
+				ilog = expy;
+			}
+			else
+			{
+				my >>= expx - expy;
+				ilog = expx;
+			}
+			my += mx;
+			i = my >> 31;
+			// G selects additional iterations and a small bias for asinh in round-to-nearest mode
+			static const int G = S && (R==std::round_to_nearest);
+			return log2_post<R,0xB8AA3B2A>(log2(my>>i, 26+S+G)+(G<<3), ilog+i, 17, arg&(static_cast<unsigned>(S)<<15));
+		}
+
+		/// Class for 1.31 unsigned floating-point computation.
+		/// A value is stored as a 32-bit mantissa \a m in 1.31 format together with a separate binary exponent \a exp.
+		/// The operators below keep the mantissa normalized (top bit set), with the exception of an exact zero
+		/// produced by subtraction.
+		struct f31
+		{
+			/// Constructor.
+			/// \param mant mantissa as 1.31
+			/// \param e exponent
+			HALF_CONSTEXPR f31(uint32 mant, int e) : m(mant), exp(e) {}
+
+			/// Constructor.
+			/// Subnormal inputs are normalized first, so the resulting mantissa always has its top bit set.
+			/// \param abs unsigned half-precision value
+			f31(unsigned int abs) : exp(-15)
+			{
+				// shift subnormals up until bit 10 is set, adjusting the exponent
+				for(; abs<0x400; abs<<=1,--exp) ;
+				m = static_cast<uint32>((abs&0x3FF)|0x400) << 21;
+				exp += (abs>>10);
+			}
+
+			/// Addition operator.
+			/// The operand with the smaller exponent is shifted down to the larger one before adding.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a + \a b
+			friend f31 operator+(f31 a, f31 b)
+			{
+				// make a the operand with the larger exponent
+				if(b.exp > a.exp)
+					std::swap(a, b);
+				int d = a.exp - b.exp;
+				uint32 m = a.m + ((d<32) ? (b.m>>d) : 0);
+				// a wrapped sum indicates a carry out of the top bit
+				int i = (m&0xFFFFFFFF) < a.m;
+				return f31(((m+i)>>i)|0x80000000, a.exp+i);
+			}
+
+			/// Subtraction operator.
+			/// NOTE(review): unlike addition the operands are not swapped, so this appears to rely on \a a not being
+			/// smaller than \a b - confirm against the callers.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a - \a b
+			friend f31 operator-(f31 a, f31 b)
+			{
+				int d = a.exp - b.exp, exp = a.exp;
+				uint32 m = a.m - ((d<32) ? (b.m>>d) : 0);
+				// exact cancellation is represented by a zero mantissa with exponent -32
+				if(!m)
+					return f31(0, -32);
+				// renormalize after cancellation
+				for(; m<0x80000000; m<<=1,--exp) ;
+				return f31(m, exp);
+			}
+
+			/// Multiplication operator.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a * \a b
+			friend f31 operator*(f31 a, f31 b)
+			{
+				uint32 m = multiply64(a.m, b.m);
+				// i is 1 if the product already has its top bit set, otherwise it is shifted up by one
+				int i = m >> 31;
+				return f31(m<<(1-i), a.exp + b.exp + i);
+			}
+
+			/// Division operator.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a / \a b
+			friend f31 operator/(f31 a, f31 b)
+			{
+				// pre-shift the dividend if it is not smaller than the divisor; the sticky flag s is not used
+				int i = a.m >= b.m, s;
+				uint32 m = divide64((a.m+i)>>i, b.m, s);
+				return f31(m, a.exp - b.exp + i - 1);
+			}
+
+			uint32 m;			///< mantissa as 1.31.
+			int exp;			///< exponent.
+		};
+
+		/// Error function and postprocessing.
+		/// This computes the value directly in Q1.31 using the approximations given
+		/// [here](https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions).
+		/// Internally the complementary value `e` is evaluated as a polynomial in `t` divided by a power of two
+		/// derived from `x^2`; the plain error function is then obtained as `1 - e`.
+		/// \tparam R rounding mode to use
+		/// \tparam C `true` for complementary error function, `false` else
+		/// \param arg half-precision function argument
+		/// \return approximated value of error function in half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,bool C> unsigned int erf(unsigned int arg)
+		{
+			unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000;
+			// x2 = x^2 scaled by 0xB8AA3B29 (log2(e) as 1.31), so the exponential can be taken with exp2; t = 1 / (1 + p*x)
+			f31 x(abs), x2 = x * x * f31(0xB8AA3B29, 0), t = f31(0x80000000, 0) / (f31(0x80000000, 0)+f31(0xA7BA054A, -2)*x), t2 = t * t;
+			// polynomial in t, divided by 2^x2 (fractional part via exp2, integer part as exponent)
+			f31 e = ((f31(0x87DC2213, 0)*t2+f31(0xB5F0E2AE, 0))*t2+f31(0x82790637, -2)-(f31(0xBA00E2B8, 0)*t2+f31(0x91A98E62, -2))*t) * t /
+					((x2.exp<0) ? f31(exp2((x2.exp>-32) ? (x2.m>>-x2.exp) : 0, 30), 0) : f31(exp2((x2.m<<x2.exp)&0x7FFFFFFF, 22), x2.m>>(31-x2.exp)));
+			// erf, or erfc of a negative argument: convert 1 - e (the sign bit is only kept for erf);
+			// erfc of a non-negative argument: convert e itself, underflowing for very small values
+			return (!C || sign) ? fixed2half<R,31,false,true,true>(0x80000000-(e.m>>(C-e.exp)), 14+C, sign&(C-1U)) :
+					(e.exp<-25) ? underflow<R>() : fixed2half<R,30,false,false,true>(e.m>>1, e.exp+14, 0, e.m&1);
+		}
+
+		/// Gamma function and postprocessing.
+		/// This approximates the value of either the gamma function or its logarithm directly in Q1.31.
+		/// The commented-out double-precision snippet at the top of the body shows the series that the f31 code
+		/// evaluates. Negative arguments are additionally routed through a sine term built from pi and the
+		/// fractional part of the argument (presumably the reflection formula - confirm).
+		/// \tparam R rounding mode to use
+		/// \tparam L `true` for logarithm of gamma function, `false` for gamma function
+		/// \param arg half-precision floating-point value
+		/// \return lgamma/tgamma(\a arg) in half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if \a arg is not a positive integer
+		template<std::float_round_style R,bool L> unsigned int gamma(unsigned int arg)
+		{
+/*			static const double p[] ={ 2.50662827563479526904, 225.525584619175212544, -268.295973841304927459, 80.9030806934622512966, -5.00757863970517583837, 0.0114684895434781459556 };
+			double t = arg + 4.65, s = p[0];
+			for(unsigned int i=0; i<5; ++i)
+				s += p[i+1] / (arg+i);
+			return std::log(s) + (arg-0.5)*std::log(t) - t;
+*/			static const f31 pi(0xC90FDAA2, 1), lbe(0xB8AA3B29, 0);
+			unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000;
+			bool bsign = sign != 0;
+			// z = |arg|; x = z + 1 for negative arguments, z otherwise; t = x + 4.65; s = series sum
+			f31 z(abs), x = sign ? (z+f31(0x80000000, 0)) : z, t = x + f31(0x94CCCCCD, 2), s =
+				f31(0xA06C9901, 1) + f31(0xBBE654E2, -7)/(x+f31(0x80000000, 2)) + f31(0xA1CE6098, 6)/(x+f31(0x80000000, 1))
+				+ f31(0xE1868CB7, 7)/x - f31(0x8625E279, 8)/(x+f31(0x80000000, 0)) - f31(0xA03E158F, 2)/(x+f31(0xC0000000, 1));
+			// s = log(s): binary logarithm (exponent as integer part, log2 of mantissa as fraction) divided by log2(e)
+			int i = (s.exp>=2) + (s.exp>=4) + (s.exp>=8) + (s.exp>=16);
+			s = f31((static_cast<uint32>(s.exp)<<(31-i))+(log2(s.m>>1, 28)>>i), i) / lbe;
+			// add (x - 0.5) * log(t), unless x is exactly 0.5 and the term vanishes
+			if(x.exp != -1 || x.m != 0x80000000)
+			{
+				i = (t.exp>=2) + (t.exp>=4) + (t.exp>=8);
+				f31 l = f31((static_cast<uint32>(t.exp)<<(31-i))+(log2(t.m>>1, 30)>>i), i) / lbe;
+				s = (x.exp<-1) ? (s-(f31(0x80000000, -1)-x)*l) : (s+(x-f31(0x80000000, -1))*l);
+			}
+			// subtract t; the operand order is chosen by x.exp (f31 is unsigned, the sign is tracked separately)
+			s = x.exp ? (s-t) : (t-s);
+			if(bsign)
+			{
+				// negative argument: reduce z to its fractional part and derive the sine term from it
+				if(z.exp >= 0)
+				{
+					// result sign depends on the parity of the integer part (always cleared for lgamma)
+					sign &= (L|((z.m>>(31-z.exp))&1)) - 1;
+					for(z=f31((z.m<<(1+z.exp))&0xFFFFFFFF, -1); z.m<0x80000000; z.m<<=1,--z.exp) ;
+				}
+				if(z.exp == -1)
+					z = f31(0x80000000, 0) - z;
+				if(z.exp < -1)
+				{
+					z = z * pi;
+					z.m = sincos(z.m>>(1-z.exp), 30).first;
+					for(z.exp=1; z.m<0x80000000; z.m<<=1,--z.exp) ;
+				}
+				else
+					z = f31(0x80000000, 0);
+			}
+			if(L)
+			{
+				// logarithm of gamma
+				if(bsign)
+				{
+					f31 l(0x92868247, 0);
+					if(z.exp < 0)
+					{
+						// l += -log(z), computed from the binary logarithm of z
+						uint32 m = log2((z.m+1)>>1, 27);
+						z = f31(-((static_cast<uint32>(z.exp)<<26)+(m>>5)), 5);
+						for(; z.m<0x80000000; z.m<<=1,--z.exp) ;
+						l = l + z / lbe;
+					}
+					// choose the result sign and the subtraction order by comparing the magnitudes of l and s
+					sign = static_cast<unsigned>(x.exp&&(l.exp<s.exp||(l.exp==s.exp&&l.m<s.m))) << 15;
+					s = sign ? (s-l) : x.exp ? (l-s) : (l+s);
+				}
+				else
+				{
+					sign = static_cast<unsigned>(x.exp==0) << 15;
+					if(s.exp < -24)
+						return underflow<R>(sign);
+					if(s.exp > 15)
+						return overflow<R>(sign);
+				}
+			}
+			else
+			{
+				// gamma itself: exponentiate s via exp2 after scaling by log2(e)
+				s = s * lbe;
+				uint32 m;
+				if(s.exp < 0)
+				{
+					m = s.m >> -s.exp;
+					s.exp = 0;
+				}
+				else
+				{
+					// split into fractional part (m) and integer part (new exponent)
+					m = (s.m<<s.exp) & 0x7FFFFFFF;
+					s.exp = (s.m>>(31-s.exp));
+				}
+				s.m = exp2(m, 27);
+				if(!x.exp)
+					s = f31(0x80000000, 0) / s;
+				if(bsign)
+				{
+					if(z.exp < 0)
+						s = s * z;
+					s = pi / s;
+					if(s.exp < -24)
+						return underflow<R>(sign);
+				}
+				// positive argument without fractional bits: return the truncated result directly, bypassing rounding
+				else if(z.exp > 0 && !(z.m&((1<<(31-z.exp))-1)))
+					return ((s.exp+14)<<10) + (s.m>>21);
+				if(s.exp > 15)
+					return overflow<R>(sign);
+			}
+			return fixed2half<R,31,false,false,true>(s.m, s.exp+14, sign);
+		}
+		/// \}
+
+		template<typename,typename,std::float_round_style> struct half_caster;
+	}
+
+	/// Half-precision floating-point type.
+	/// This class implements an IEEE-conformant half-precision floating-point type with the usual arithmetic
+	/// operators and conversions. It is implicitly convertible to single-precision floating-point, which makes arithmetic
+	/// expressions and functions with mixed-type operands to be of the most precise operand type.
+	///
+	/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and
+	/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which
+	/// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the
+	/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be of
+	/// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most
+	/// probably not involve any additional "magic" or padding beyond the simple binary representation of the underlying 16-bit
+	/// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if
+	/// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on
+	/// nearly any reasonable platform.
+	///
+	/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable
+	/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation.
+	class half
+	{
+	public:
+		/// \name Construction and assignment
+		/// \{
+
+		/// Default constructor.
+		/// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
+		/// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
+		HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
+
+		/// Conversion constructor.
+		/// \param rhs float to convert
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		explicit half(float rhs) : data_(static_cast<detail::uint16>(detail::float2half<round_style>(rhs))) {}
+
+		/// Conversion to single-precision.
+		/// \return single precision value representing expression value
+		operator float() const { return detail::half2float<float>(data_); }
+
+		/// Assignment operator.
+		/// \param rhs single-precision value to copy from
+		/// \return reference to this half
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		half& operator=(float rhs) { data_ = static_cast<detail::uint16>(detail::float2half<round_style>(rhs)); return *this; }
+
+		/// \}
+		/// \name Arithmetic updates
+		/// \{
+
+		/// Arithmetic assignment.
+		/// \param rhs half value to add
+		/// \return reference to this half
+		/// \exception FE_... according to operator+(half,half)
+		half& operator+=(half rhs) { return *this = *this + rhs; }
+
+		/// Arithmetic assignment.
+		/// \param rhs half value to subtract
+		/// \return reference to this half
+		/// \exception FE_... according to operator-(half,half)
+		half& operator-=(half rhs) { return *this = *this - rhs; }
+
+		/// Arithmetic assignment.
+		/// \param rhs half value to multiply with
+		/// \return reference to this half
+		/// \exception FE_... according to operator*(half,half)
+		half& operator*=(half rhs) { return *this = *this * rhs; }
+
+		/// Arithmetic assignment.
+		/// \param rhs half value to divide by
+		/// \return reference to this half
+		/// \exception FE_... according to operator/(half,half)
+		half& operator/=(half rhs) { return *this = *this / rhs; }
+
+		/// Arithmetic assignment.
+		/// \param rhs single-precision value to add
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator+=(float rhs) { return *this = *this + rhs; }
+
+		/// Arithmetic assignment.
+		/// \param rhs single-precision value to subtract
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator-=(float rhs) { return *this = *this - rhs; }
+
+		/// Arithmetic assignment.
+		/// \param rhs single-precision value to multiply with
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator*=(float rhs) { return *this = *this * rhs; }
+
+		/// Arithmetic assignment.
+		/// \param rhs single-precision value to divide by
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator/=(float rhs) { return *this = *this / rhs; }
+
+		/// \}
+		/// \name Increment and decrement
+		/// \{
+
+		/// Prefix increment.
+		/// Adds the half-precision constant 1.0 (0x3C00).
+		/// \return incremented half value
+		/// \exception FE_... according to operator+(half,half)
+		half& operator++() { return *this = *this + half(detail::binary, 0x3C00); }
+
+		/// Prefix decrement.
+		/// Implemented as an addition of the half-precision constant -1.0 (0xBC00).
+		/// \return decremented half value
+		/// \exception FE_... according to operator+(half,half)
+		half& operator--() { return *this = *this + half(detail::binary, 0xBC00); }
+
+		/// Postfix increment.
+		/// \return non-incremented half value
+		/// \exception FE_... according to operator+(half,half)
+		half operator++(int) { half out(*this); ++*this; return out; }
+
+		/// Postfix decrement.
+		/// \return non-decremented half value
+		/// \exception FE_... according to operator+(half,half)
+		half operator--(int) { half out(*this); --*this; return out; }
+		/// \}
+
+	private:
+		/// Rounding mode to use
+		static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE);
+
+		/// Constructor.
+		/// Sets the raw bit pattern without any conversion; the tag argument only selects this overload.
+		/// \param bits binary representation to set half to
+		HALF_CONSTEXPR half(detail::binary_t, unsigned int bits) HALF_NOEXCEPT : data_(static_cast<detail::uint16>(bits)) {}
+
+		/// Internal binary representation
+		detail::uint16 data_;
+
+		// All free functions operating on the raw representation are friends, as data_ and the binary constructor are private.
+	#ifndef HALF_DOXYGEN_ONLY
+		friend HALF_CONSTEXPR_NOERR bool operator==(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator!=(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator<(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator>(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator<=(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator>=(half, half);
+		friend HALF_CONSTEXPR half operator-(half);
+		friend half operator+(half, half);
+		friend half operator-(half, half);
+		friend half operator*(half, half);
+		friend half operator/(half, half);
+		template<typename charT,typename traits> friend std::basic_ostream<charT,traits>& operator<<(std::basic_ostream<charT,traits>&, half);
+		template<typename charT,typename traits> friend std::basic_istream<charT,traits>& operator>>(std::basic_istream<charT,traits>&, half&);
+		friend HALF_CONSTEXPR half fabs(half);
+		friend half fmod(half, half);
+		friend half remainder(half, half);
+		friend half remquo(half, half, int*);
+		friend half fma(half, half, half);
+		friend HALF_CONSTEXPR_NOERR half fmax(half, half);
+		friend HALF_CONSTEXPR_NOERR half fmin(half, half);
+		friend half fdim(half, half);
+		friend half nanh(const char*);
+		friend half exp(half);
+		friend half exp2(half);
+		friend half expm1(half);
+		friend half log(half);
+		friend half log10(half);
+		friend half log2(half);
+		friend half log1p(half);
+		friend half sqrt(half);
+		friend half rsqrt(half);
+		friend half cbrt(half);
+		friend half hypot(half, half);
+		friend half hypot(half, half, half);
+		friend half pow(half, half);
+		friend void sincos(half, half*, half*);
+		friend half sin(half);
+		friend half cos(half);
+		friend half tan(half);
+		friend half asin(half);
+		friend half acos(half);
+		friend half atan(half);
+		friend half atan2(half, half);
+		friend half sinh(half);
+		friend half cosh(half);
+		friend half tanh(half);
+		friend half asinh(half);
+		friend half acosh(half);
+		friend half atanh(half);
+		friend half erf(half);
+		friend half erfc(half);
+		friend half lgamma(half);
+		friend half tgamma(half);
+		friend half ceil(half);
+		friend half floor(half);
+		friend half trunc(half);
+		friend half round(half);
+		friend long lround(half);
+		friend half rint(half);
+		friend long lrint(half);
+		friend half nearbyint(half);
+		// Tested by value like the other feature switches, so that defining the macro to 0 really drops the long long friends.
+	#if HALF_ENABLE_CPP11_LONG_LONG
+		friend long long llround(half);
+		friend long long llrint(half);
+	#endif
+		friend half frexp(half, int*);
+		friend half scalbln(half, long);
+		friend half modf(half, half*);
+		friend int ilogb(half);
+		friend half logb(half);
+		friend half nextafter(half, half);
+		friend half nexttoward(half, long double);
+		friend HALF_CONSTEXPR half copysign(half, half);
+		friend HALF_CONSTEXPR int fpclassify(half);
+		friend HALF_CONSTEXPR bool isfinite(half);
+		friend HALF_CONSTEXPR bool isinf(half);
+		friend HALF_CONSTEXPR bool isnan(half);
+		friend HALF_CONSTEXPR bool isnormal(half);
+		friend HALF_CONSTEXPR bool signbit(half);
+		friend HALF_CONSTEXPR bool isgreater(half, half);
+		friend HALF_CONSTEXPR bool isgreaterequal(half, half);
+		friend HALF_CONSTEXPR bool isless(half, half);
+		friend HALF_CONSTEXPR bool islessequal(half, half);
+		friend HALF_CONSTEXPR bool islessgreater(half, half);
+		template<typename,typename,std::float_round_style> friend struct detail::half_caster;
+		friend class std::numeric_limits<half>;
+	#if HALF_ENABLE_CPP11_HASH
+		friend struct std::hash<half>;
+	#endif
+	#if HALF_ENABLE_CPP11_USER_LITERALS
+		friend half literal::operator "" _h(long double);
+	#endif
+	#endif
+	};
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+	namespace literal
+	{
+		/// Half literal.
+		/// The returned half-precision value is properly rounded, but half literals can unfortunately not be constant
+		/// expressions, because the conversion involved is rather complex. The conversion therefore happens at runtime;
+		/// the literal is a convenience feature, not a performance optimization.
+		/// \param value literal value
+		/// \return half holding the given value (possibly rounded)
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		inline half operator "" _h(long double value)
+		{
+			// convert using the rounding mode of the half type, then wrap the raw bits
+			unsigned int bits = detail::float2half<half::round_style>(value);
+			return half(detail::binary, bits);
+		}
+	}
+#endif
+
+	namespace detail
+	{
+		/// Helper class for half casts.
+		/// This class template has to be specialized for all valid cast arguments to define an appropriate static
+		/// `cast` member function and a corresponding `type` member denoting its return type.
+		/// The primary template is intentionally empty, so unsupported combinations provide no `cast` function.
+		/// \tparam T destination type
+		/// \tparam U source type
+		/// \tparam R rounding mode to use
+		template<typename T,typename U,std::float_round_style R=(std::float_round_style)(HALF_ROUND_STYLE)> struct half_caster {};
+
+		/// Cast from an arithmetic type to half.
+		/// Dispatches on is_float<U> to either the floating-point or the integer conversion.
+		/// \tparam U source type
+		/// \tparam R rounding mode to use
+		template<typename U,std::float_round_style R> struct half_caster<half,U,R>
+		{
+		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+			static_assert(std::is_arithmetic<U>::value, "half_cast from non-arithmetic type unsupported");
+		#endif
+
+			/// Convert value to half.
+			/// \param arg value to convert
+			/// \return \a arg converted to half using rounding mode \a R
+			static half cast(U arg) { return cast_impl(arg, is_float<U>()); }
+
+		private:
+			static half cast_impl(U arg, true_type) { return half(binary, float2half<R>(arg)); }
+			static half cast_impl(U arg, false_type) { return half(binary, int2half<R>(arg)); }
+		};
+
+		/// Cast from half to an arithmetic type.
+		/// Dispatches on is_float<T> to either the floating-point or the integer conversion.
+		/// \tparam T destination type
+		/// \tparam R rounding mode to use (only passed on to the integer conversion)
+		template<typename T,std::float_round_style R> struct half_caster<T,half,R>
+		{
+		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+			static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type unsupported");
+		#endif
+
+			/// Convert half to destination type.
+			/// \param arg half value to convert
+			/// \return \a arg converted to \a T
+			static T cast(half arg) { return cast_impl(arg, is_float<T>()); }
+
+		private:
+			static T cast_impl(half arg, true_type) { return half2float<T>(arg.data_); }
+			static T cast_impl(half arg, false_type) { return half2int<R,true,true,T>(arg.data_); }
+		};
+
+		/// Cast from half to half, which returns the argument unchanged.
+		/// \tparam R rounding mode (unused)
+		template<std::float_round_style R> struct half_caster<half,half,R>
+		{
+			static half cast(half arg) { return arg; }
+		};
+	}
+}
+
+/// Extensions to the C++ standard library.
+namespace std
+{
+	/// Numeric limits for half-precision floats.
+	/// **See also:** Documentation for [std::numeric_limits](https://en.cppreference.com/w/cpp/types/numeric_limits)
+	template<> class numeric_limits<half_float::half>
+	{
+	public:
+		/// Is template specialization.
+		static HALF_CONSTEXPR_CONST bool is_specialized = true;
+
+		/// Supports signed values.
+		static HALF_CONSTEXPR_CONST bool is_signed = true;
+
+		/// Is not an integer type.
+		static HALF_CONSTEXPR_CONST bool is_integer = false;
+
+		/// Is not exact.
+		static HALF_CONSTEXPR_CONST bool is_exact = false;
+
+		/// Doesn't provide modulo arithmetic.
+		static HALF_CONSTEXPR_CONST bool is_modulo = false;
+
+		/// Has a finite set of values.
+		static HALF_CONSTEXPR_CONST bool is_bounded = true;
+
+		/// IEEE conformant.
+		static HALF_CONSTEXPR_CONST bool is_iec559 = true;
+
+		/// Supports infinity.
+		static HALF_CONSTEXPR_CONST bool has_infinity = true;
+
+		/// Supports quiet NaNs.
+		static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
+
+		/// Supports signaling NaNs.
+		static HALF_CONSTEXPR_CONST bool has_signaling_NaN = true;
+
+		/// Supports subnormal values.
+		static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
+
+		/// Supports no denormalization detection.
+		static HALF_CONSTEXPR_CONST bool has_denorm_loss = false;
+
+	#if HALF_ERRHANDLING_THROWS
+		static HALF_CONSTEXPR_CONST bool traps = true;
+	#else
+		/// Traps only if [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID) is acitvated.
+		static HALF_CONSTEXPR_CONST bool traps = false;
+	#endif
+
+		/// Does not support no pre-rounding underflow detection.
+		static HALF_CONSTEXPR_CONST bool tinyness_before = false;
+
+		/// Rounding mode.
+		static HALF_CONSTEXPR_CONST float_round_style round_style = half_float::half::round_style;
+
+		/// Significant digits.
+		static HALF_CONSTEXPR_CONST int digits = 11;
+
+		/// Significant decimal digits.
+		static HALF_CONSTEXPR_CONST int digits10 = 3;
+
+		/// Required decimal digits to represent all possible values.
+		static HALF_CONSTEXPR_CONST int max_digits10 = 5;
+
+		/// Number base.
+		static HALF_CONSTEXPR_CONST int radix = 2;
+
+		/// One more than smallest exponent.
+		static HALF_CONSTEXPR_CONST int min_exponent = -13;
+
+		/// Smallest normalized representable power of 10.
+		static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
+
+		/// One more than largest exponent
+		static HALF_CONSTEXPR_CONST int max_exponent = 16;
+
+		/// Largest finitely representable power of 10.
+		static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
+
+		/// Smallest positive normal value.
+		static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); }
+
+		/// Smallest finite value.
+		static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); }
+
+		/// Largest finite value.
+		static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); }
+
+		/// Difference between 1 and next representable value (2^-10, consistent with digits = 11).
+		static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); }
+
+		/// Maximum rounding error in ULP (units in the last place): 0.5 (0x3800) for round-to-nearest, 1 (0x3C00) for any other rounding mode.
+		static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
+			{ return half_float::half(half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00); }
+
+		/// Positive infinity (all exponent bits set, mantissa zero).
+		static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); }
+
+		/// Quiet NaN (positive sign, all exponent and mantissa bits set).
+		static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); }
+
+		/// Signaling NaN (all exponent bits set, most significant mantissa bit 0x0200 clear, remaining mantissa bits set).
+		static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); }
+
+		/// Smallest positive subnormal value (only the least significant mantissa bit set, 2^-24).
+		static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); }
+	};
+
+#if HALF_ENABLE_CPP11_HASH
+	/// Hash function for half-precision floats.
+	/// This is only defined if C++11 `std::hash` is supported and enabled (`HALF_ENABLE_CPP11_HASH`).
+	///
+	/// **See also:** Documentation for [std::hash](https://en.cppreference.com/w/cpp/utility/hash)
+	template<> struct hash<half_float::half>
+	{
+		/// Type of function argument.
+		typedef half_float::half argument_type;
+
+		/// Function return type.
+		typedef size_t result_type;
+
+		/// Compute hash function by hashing the raw 16-bit representation.
+		/// \param arg half to hash
+		/// \return hash value; -0 (bits 0x8000) is masked to 0 first so that it hashes like +0, matching operator==
+		result_type operator()(argument_type arg) const { return hash<half_float::detail::uint16>()(arg.data_&-static_cast<unsigned>(arg.data_!=0x8000)); }
+	};
+#endif
+}
+
+namespace half_float
+{
+	/// \anchor compop
+	/// \name Comparison operators
+	/// \{
+
+	/// Comparison for equality.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if operands equal (+0 and -0 compare equal)
+	/// \retval false else, in particular whenever detail::compsignal() returns true (presumably for NaN operands - helper not visible here)
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator==(half x, half y)
+	{
+		return !detail::compsignal(x.data_, y.data_) && (x.data_==y.data_ || !((x.data_|y.data_)&0x7FFF));	// identical bits, or both magnitudes are zero
+	}
+
+	/// Comparison for inequality (exact logical negation of operator==).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if operands not equal, or whenever detail::compsignal() returns true (presumably for NaN operands)
+	/// \retval false else (+0 and -0 are not unequal)
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator!=(half x, half y)
+	{
+		return detail::compsignal(x.data_, y.data_) || (x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF));	// bits differ and at least one magnitude is non-zero
+	}
+
+	/// Comparison for less than.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less than \a y
+	/// \retval false else, including whenever detail::compsignal() returns true
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator<(half x, half y)
+	{
+		return !detail::compsignal(x.data_, y.data_) &&
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) < ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15));	// sign-magnitude bits mapped to an ordered integer key: positives get 0x8000 flipped, negatives are bit-inverted plus 1, so -0 and +0 share one key
+	}
+
+	/// Comparison for greater than.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater than \a y
+	/// \retval false else, including whenever detail::compsignal() returns true
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator>(half x, half y)
+	{
+		return !detail::compsignal(x.data_, y.data_) &&
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) > ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15));	// same ordered integer key as operator<: positives get 0x8000 flipped, negatives are bit-inverted plus 1
+	}
+
+	/// Comparison for less than or equal.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less than or equal to \a y
+	/// \retval false else, including whenever detail::compsignal() returns true
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator<=(half x, half y)
+	{
+		return !detail::compsignal(x.data_, y.data_) &&
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) <= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15));	// ordered integer keys; -0 and +0 share one key and therefore satisfy <=
+	}
+
+	/// Comparison for greater than or equal.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater than or equal to \a y
+	/// \retval false else, including whenever detail::compsignal() returns true
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator>=(half x, half y)
+	{
+		return !detail::compsignal(x.data_, y.data_) &&
+			((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) >= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15));	// ordered integer keys; -0 and +0 share one key and therefore satisfy >=
+	}
+
+	/// \}
+	/// \anchor arithmetics
+	/// \name Arithmetic operators
+	/// \{
+
+	/// Identity (unary plus).
+	/// \param arg operand
+	/// \return unchanged operand (bits are passed through as they are, no NaN handling is performed)
+	inline HALF_CONSTEXPR half operator+(half arg) { return arg; }
+
+	/// Negation (unary minus).
+	/// \param arg operand
+	/// \return negated operand, obtained by flipping only the sign bit 0x8000 (this applies to zeros, infinities and NaNs alike)
+	inline HALF_CONSTEXPR half operator-(half arg) { return half(detail::binary, arg.data_^0x8000); }
+
+	/// Addition.
+	/// This operation is exact to rounding for all rounding modes.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return sum of half expressions
+	/// \exception FE_INVALID if \a x and \a y are infinities with different signs or signaling NaNs
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator+(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)+detail::half2float<detail::internal_t>(y.data_)));	// convert both to detail::internal_t, add there, convert back with the configured rounding
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF;	// magnitudes with the sign bit stripped
+		bool sub = ((x.data_^y.data_)&0x8000) != 0;	// signs differ -> effectively a subtraction of magnitudes
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinite (0x7C00) or NaN (>0x7C00)
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) : (absy!=0x7C00) ? x.data_ :
+										(sub && absx==0x7C00) ? detail::invalid() : y.data_);	// NaN -> signal(); only x infinite -> x; inf + (-inf) -> invalid(); else y
+		if(!absx)	// x is +/-0
+			return absy ? y : half(detail::binary, (half::round_style==std::round_toward_neg_infinity) ? (x.data_|y.data_) : (x.data_&y.data_));	// 0+0: sign bits OR-ed when rounding toward -inf, AND-ed otherwise
+		if(!absy)	// y is +/-0, x is finite and non-zero
+			return x;
+		unsigned int sign = ((sub && absy>absx) ? y.data_ : x.data_) & 0x8000;	// result sign is taken from the operand of larger magnitude
+		if(absy > absx)
+			std::swap(absx, absy);	// from here on absx >= absy
+		int exp = (absx>>10) + (absx<=0x3FF), d = exp - (absy>>10) - (absy<=0x3FF), mx = ((absx&0x3FF)|((absx>0x3FF)<<10)) << 3, my;	// exp: exponent field (subnormals count as 1); d: exponent difference; mx: significand incl. implicit bit, with 3 extra low bits
+		if(d < 13)
+		{
+			my = ((absy&0x3FF)|((absy>0x3FF)<<10)) << 3;
+			my = (my>>d) | ((my&((1<<d)-1))!=0);	// align to the larger exponent; any bits shifted out are OR-ed into the lowest bit
+		}
+		else
+			my = 1;	// smaller operand lies entirely below the kept bits: only the lowest (sticky) bit remains
+		if(sub)
+		{
+			if(!(mx-=my))	// exact cancellation
+				return half(detail::binary, static_cast<unsigned>(half::round_style==std::round_toward_neg_infinity)<<15);	// zero result: -0 only when rounding toward -inf, +0 otherwise
+			for(; mx<0x2000 && exp>1; mx<<=1,--exp) ;	// shift left until bit 0x2000 is set again or the exponent reaches 1
+		}
+		else
+		{
+			mx += my;
+			int i = mx >> 14;	// 1 if the sum carried out into bit 14, else 0
+			if((exp+=i) > 30)
+				return half(detail::binary, detail::overflow<half::round_style>(sign));
+			mx = (mx>>i) | (mx&i);	// undo the carry; the bit shifted out is OR-ed back into the lowest bit
+		}
+		return half(detail::binary, detail::rounded<half::round_style,false>(sign+((exp-1)<<10)+(mx>>3), (mx>>2)&1, (mx&0x3)!=0));	// passes bit 2 and "any of bits 0-1" separately to detail::rounded (presumably guard and sticky - helper not visible here)
+	#endif
+	}
+
+	/// Subtraction.
+	/// This operation is exact to rounding for all rounding modes.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return difference of half expressions
+	/// \exception FE_INVALID if \a x and \a y are infinities with equal signs or signaling NaNs
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator-(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)-detail::half2float<detail::internal_t>(y.data_)));	// subtract in detail::internal_t, convert back with the configured rounding
+	#else
+		return x + -y;	// implemented as binary addition of the sign-flipped right operand
+	#endif
+	}
+
+	/// Multiplication.
+	/// This operation is exact to rounding for all rounding modes.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return product of half expressions
+	/// \exception FE_INVALID if multiplying 0 with infinity or if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator*(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)*detail::half2float<detail::internal_t>(y.data_)));	// multiply in detail::internal_t, convert back with the configured rounding
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -16;	// magnitudes; exp starts at a fixed offset and accumulates both exponent fields below
+		unsigned int sign = (x.data_^y.data_) & 0x8000;	// product sign is the XOR of the operand signs
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinite or NaN
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										((absx==0x7C00 && !absy)||(absy==0x7C00 && !absx)) ? detail::invalid() : (sign|0x7C00));	// NaN -> signal(); inf*0 -> invalid(); else signed infinity
+		if(!absx || !absy)
+			return half(detail::binary, sign);	// a zero factor yields a signed zero
+		for(; absx<0x400; absx<<=1,--exp) ;	// normalize a subnormal x until bit 0x400 is set
+		for(; absy<0x400; absy<<=1,--exp) ;	// likewise for y
+		detail::uint32 m = static_cast<detail::uint32>((absx&0x3FF)|0x400) * static_cast<detail::uint32>((absy&0x3FF)|0x400);	// product of the two 11-bit significands
+		int i = m >> 21, s = m & i;	// i: 1 if the product reached bit 21; s: the low bit that the shift by i below discards
+		exp += (absx>>10) + (absy>>10) + i;
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -11)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		return half(detail::binary, detail::fixed2half<half::round_style,20,false,false,false>(m>>i, exp, sign, s));	// final rounding/packing is delegated to detail::fixed2half (not visible here)
+	#endif
+	}
+
+	/// Division.
+	/// This operation is exact to rounding for all rounding modes.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return quotient of half expressions
+	/// \exception FE_INVALID if dividing 0s or infinities with each other or if \a x or \a y is signaling NaN
+	/// \exception FE_DIVBYZERO if dividing finite value by 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator/(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)/detail::half2float<detail::internal_t>(y.data_)));	// divide in detail::internal_t, convert back with the configured rounding
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = 14;	// magnitudes; exp starts at a fixed offset and accumulates the exponent difference below
+		unsigned int sign = (x.data_^y.data_) & 0x8000;	// quotient sign is the XOR of the operand signs
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinite or NaN
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										(absx==absy) ? detail::invalid() : (sign|((absx==0x7C00) ? 0x7C00 : 0)));	// NaN -> signal(); inf/inf -> invalid(); inf/finite -> inf; finite/inf -> signed zero
+		if(!absx)
+			return half(detail::binary, absy ? sign : detail::invalid());	// 0/y -> signed zero; 0/0 -> invalid()
+		if(!absy)
+			return half(detail::binary, detail::pole(sign));	// finite non-zero / 0 -> detail::pole()
+		for(; absx<0x400; absx<<=1,--exp) ;	// normalize a subnormal dividend
+		for(; absy<0x400; absy<<=1,++exp) ;	// normalize a subnormal divisor (raises the result exponent)
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400;	// 11-bit significands including the implicit bit
+		int i = mx < my;	// 1 if the significand quotient is below 1 and needs one extra bit of shift
+		exp += (absx>>10) - (absy>>10) - i;
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -11)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		mx <<= 12 + i;
+		my <<= 1;
+		return half(detail::binary, detail::fixed2half<half::round_style,11,false,false,false>(mx/my, exp, sign, mx%my!=0));	// integer quotient; a non-zero remainder is passed on as a flag for detail::fixed2half
+	#endif
+	}
+
+	/// \}
+	/// \anchor streaming
+	/// \name Input and output
+	/// \{
+
+	/// Output operator.
+	///	This uses the built-in functionality for streaming out floating-point numbers: the value is converted to `float` (or to `detail::internal_t` if HALF_ARITHMETIC_TYPE is defined) and that value is inserted into the stream.
+	/// \param out output stream to write into
+	/// \param arg half expression to write
+	/// \return reference to output stream
+	template<typename charT,typename traits> std::basic_ostream<charT,traits>& operator<<(std::basic_ostream<charT,traits> &out, half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return out << detail::half2float<detail::internal_t>(arg.data_);
+	#else
+		return out << detail::half2float<float>(arg.data_);
+	#endif
+	}
+
+	/// Input operator.
+	///	The text is parsed by the stream's built-in floating-point extraction into a `double` (or into
+	/// `detail::internal_t` when [HALF_ARITHMETIC_TYPE](\ref HALF_ARITHMETIC_TYPE) is defined) and only afterwards
+	/// converted to half-precision with the library's rounding mode, so the value is rounded twice.
+	/// \a arg is left untouched if the extraction fails.
+	/// \param in input stream to read from
+	/// \param arg half to read into
+	/// \return reference to input stream
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename charT,typename traits> std::basic_istream<charT,traits>& operator>>(std::basic_istream<charT,traits> &in, half &arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t parsed;
+	#else
+		double parsed;
+	#endif
+		if(!(in >> parsed).fail())
+			arg.data_ = detail::float2half<half::round_style>(parsed);
+		return in;
+	}
+
+	/// \}
+	/// \anchor basic
+	/// \name Basic mathematical operations
+	/// \{
+
+	/// Absolute value, obtained by clearing the sign bit (mask 0x7FFF); no NaN handling is performed.
+	/// **See also:** Documentation for [std::fabs](https://en.cppreference.com/w/cpp/numeric/math/fabs).
+	/// \param arg operand
+	/// \return absolute value of \a arg
+	inline HALF_CONSTEXPR half fabs(half arg) { return half(detail::binary, arg.data_&0x7FFF); }
+
+	/// Absolute value (simply forwards to fabs()).
+	/// **See also:** Documentation for [std::abs](https://en.cppreference.com/w/cpp/numeric/math/fabs).
+	/// \param arg operand
+	/// \return absolute value of \a arg
+	inline HALF_CONSTEXPR half abs(half arg) { return fabs(arg); }
+
+	/// Floating-point remainder whose sign is taken from \a x.
+	/// **See also:** Documentation for [std::fmod](https://en.cppreference.com/w/cpp/numeric/math/fmod).
+	/// \param x dividend
+	/// \param y divisor
+	/// \return remainder of \a x / \a y; \a x itself if \a x is zero or \a y is infinite, signed zero if both magnitudes are equal
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half fmod(half x, half y)
+	{
+		unsigned int magx = x.data_ & 0x7FFF, magy = y.data_ & 0x7FFF, signx = x.data_ & 0x8000;
+		if(magx >= 0x7C00 || magy >= 0x7C00)
+			return half(detail::binary,	(magx>0x7C00 || magy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										(magx!=0x7C00) ? x.data_ : detail::invalid());
+		if(magy == 0)
+			return half(detail::binary, detail::invalid());
+		if(magx == 0)
+			return x;
+		if(magx == magy)
+			return half(detail::binary, signx);
+		return half(detail::binary, detail::mod<false,false>(magx, magy)|signx);
+	}
+
+	/// Floating-point remainder; the sign of \a x is XOR-ed with the value returned by detail::mod<false,true>().
+	/// **See also:** Documentation for [std::remainder](https://en.cppreference.com/w/cpp/numeric/math/remainder).
+	/// \param x dividend
+	/// \param y divisor
+	/// \return remainder of \a x / \a y; \a x itself if \a y is infinite, signed zero if both magnitudes are equal
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half remainder(half x, half y)
+	{
+		unsigned int magx = x.data_ & 0x7FFF, magy = y.data_ & 0x7FFF, signx = x.data_ & 0x8000;
+		if(magx >= 0x7C00 || magy >= 0x7C00)
+			return half(detail::binary,	(magx>0x7C00 || magy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										(magx!=0x7C00) ? x.data_ : detail::invalid());
+		if(magy == 0)
+			return half(detail::binary, detail::invalid());
+		if(magx == magy)
+			return half(detail::binary, signx);
+		return half(detail::binary, detail::mod<false,true>(magx, magy)^signx);
+	}
+
+	/// Remainder of division, additionally reporting some bits of the quotient.
+	/// **See also:** Documentation for [std::remquo](https://en.cppreference.com/w/cpp/numeric/math/remquo).
+	/// \param x first operand
+	/// \param y second operand
+	/// \param quo address to store some bits of quotient at (must be valid; it is not written on the NaN, infinite-\a x and zero-\a y paths)
+	/// \return remainder of floating-point division.
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half remquo(half x, half y, int *quo)
+	{
+		unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, value = x.data_ & 0x8000;	// value starts out as the sign of x
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinite or NaN
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										(absx==0x7C00) ? detail::invalid() : (*quo = 0, x.data_));	// finite x with infinite y: quotient 0, remainder x
+		if(!absy)
+			return half(detail::binary, detail::invalid());
+		bool qsign = ((value^y.data_)&0x8000) != 0;	// quotient is negative when the operand signs differ
+		int q = 1;	// quotient for the equal-magnitude case, where the remainder is a signed zero
+		if(absx != absy)
+			value ^= detail::mod<true, true>(absx, absy, &q);	// detail::mod also receives &q (presumably storing the quotient bits there - helper not visible here)
+		return *quo = qsign ? -q : q, half(detail::binary, value);	// comma operator: store the signed quotient first, then return the remainder
+	}
+
+	/// Fused multiply add.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::fma](https://en.cppreference.com/w/cpp/numeric/math/fma).
+	/// \param x first operand
+	/// \param y second operand
+	/// \param z third operand
+	/// \return ( \a x * \a y ) + \a z rounded as one operation.
+	/// \exception FE_INVALID according to operator*() and operator+() unless any argument is a quiet NaN and no argument is a signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding the final addition
+	inline half fma(half x, half y, half z)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_), fz = detail::half2float<detail::internal_t>(z.data_);
+		#if HALF_ENABLE_CPP11_CMATH && FP_FAST_FMA
+			return half(detail::binary, detail::float2half<half::round_style>(std::fma(fx, fy, fz)));
+		#else
+			return half(detail::binary, detail::float2half<half::round_style>(fx*fy+fz));	// plain multiply followed by add in detail::internal_t
+		#endif
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, exp = -15;	// magnitudes; exp accumulates the product exponent below
+		unsigned int sign = (x.data_^y.data_) & 0x8000;	// sign of the product x*y
+		bool sub = ((sign^z.data_)&0x8000) != 0;	// product and z differ in sign -> effective subtraction
+		if(absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00)	// any operand infinite or NaN
+			return	(absx>0x7C00 || absy>0x7C00 || absz>0x7C00) ? half(detail::binary, detail::signal(x.data_, y.data_, z.data_)) :
+					(absx==0x7C00) ? half(detail::binary, (!absy || (sub && absz==0x7C00)) ? detail::invalid() : (sign|0x7C00)) :
+					(absy==0x7C00) ? half(detail::binary, (!absx || (sub && absz==0x7C00)) ? detail::invalid() : (sign|0x7C00)) : z;	// inf*0 and inf-inf -> invalid(); infinite product -> signed inf; otherwise only z is infinite
+		if(!absx || !absy)	// product is exactly zero
+			return absz ? z : half(detail::binary, (half::round_style==std::round_toward_neg_infinity) ? (z.data_|sign) : (z.data_&sign));	// 0+0: sign bits OR-ed when rounding toward -inf, AND-ed otherwise
+		for(; absx<0x400; absx<<=1,--exp) ;	// normalize a subnormal x
+		for(; absy<0x400; absy<<=1,--exp) ;	// normalize a subnormal y
+		detail::uint32 m = static_cast<detail::uint32>((absx&0x3FF)|0x400) * static_cast<detail::uint32>((absy&0x3FF)|0x400);	// product of the two 11-bit significands
+		int i = m >> 21;	// 1 if the product reached bit 21
+		exp += (absx>>10) + (absy>>10) + i;
+		m <<= 3 - i;	// align so the leading bit sits at bit 23 in both cases
+		if(absz)
+		{
+			int expz = 0;
+			for(; absz<0x400; absz<<=1,--expz) ;	// normalize a subnormal z
+			expz += absz >> 10;
+			detail::uint32 mz = static_cast<detail::uint32>((absz&0x3FF)|0x400) << 13;	// significand of z with its leading bit at bit 23
+			if(expz > exp || (expz == exp && mz > m))	// make (m, exp) the operand of larger magnitude
+			{
+				std::swap(m, mz);
+				std::swap(exp, expz);
+				if(sub)
+					sign = z.data_ & 0x8000;	// z dominates the subtraction, so the result takes its sign
+			}
+			int d = exp - expz;
+			mz = (d<23) ? ((mz>>d)|((mz&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;	// align the smaller operand; shifted-out bits are OR-ed into the lowest bit
+			if(sub)
+			{
+				m = m - mz;
+				if(!m)	// exact cancellation
+					return half(detail::binary, static_cast<unsigned>(half::round_style==std::round_toward_neg_infinity)<<15);	// zero result: -0 only when rounding toward -inf
+				for(; m<0x800000; m<<=1,--exp) ;	// renormalize until bit 23 is set again
+			}
+			else
+			{
+				m += mz;
+				i = m >> 24;	// 1 if the sum carried out into bit 24
+				m = (m>>i) | (m&i);	// undo the carry; the bit shifted out is OR-ed back into the lowest bit
+				exp += i;
+			}
+		}
+		if(exp > 30)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -10)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		return half(detail::binary, detail::fixed2half<half::round_style,23,false,false,false>(m, exp-1, sign));	// single final rounding is delegated to detail::fixed2half (not visible here)
+	#endif
+	}
+
+	/// Maximum of half expressions.
+	/// **See also:** Documentation for [std::fmax](https://en.cppreference.com/w/cpp/numeric/math/fmax).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return maximum of operands, ignoring quiet NaNs (\a y is chosen if \a x is NaN and \a y is not, or if \a y orders above \a x)
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	inline HALF_CONSTEXPR_NOERR half fmax(half x, half y)
+	{
+		return half(detail::binary, (!isnan(y) && (isnan(x) || (x.data_^(0x8000|(0x8000-(x.data_>>15)))) < 
+			(y.data_^(0x8000|(0x8000-(y.data_>>15)))))) ? detail::select(y.data_, x.data_) : detail::select(x.data_, y.data_));	// ordered integer keys (here -0 orders below +0); the chosen operand is passed first to detail::select (not visible here)
+	}
+
+	/// Minimum of half expressions.
+	/// **See also:** Documentation for [std::fmin](https://en.cppreference.com/w/cpp/numeric/math/fmin).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return minimum of operands, ignoring quiet NaNs (\a y is chosen if \a x is NaN and \a y is not, or if \a y orders below \a x)
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	inline HALF_CONSTEXPR_NOERR half fmin(half x, half y)
+	{
+		return half(detail::binary, (!isnan(y) && (isnan(x) || (x.data_^(0x8000|(0x8000-(x.data_>>15)))) >
+			(y.data_^(0x8000|(0x8000-(y.data_>>15)))))) ? detail::select(y.data_, x.data_) : detail::select(x.data_, y.data_));	// ordered integer keys (here -0 orders below +0); the chosen operand is passed first to detail::select (not visible here)
+	}
+
+	/// Positive difference of two half values.
+	/// NaN operands are forwarded to detail::signal(); otherwise the operands are compared through ordered integer keys.
+	///
+	/// **See also:** Documentation for [std::fdim](https://en.cppreference.com/w/cpp/numeric/math/fdim).
+	/// \param x minuend
+	/// \param y subtrahend
+	/// \return \a x - \a y if \a x orders above \a y, positive zero otherwise
+	/// \exception FE_... according to operator-(half,half)
+	inline half fdim(half x, half y)
+	{
+		const bool unordered = isnan(x) || isnan(y);
+		const unsigned int keyx = x.data_^(0x8000|(0x8000-(x.data_>>15))), keyy = y.data_^(0x8000|(0x8000-(y.data_>>15)));
+		return unordered ? half(detail::binary, detail::signal(x.data_, y.data_)) : (keyx>keyy) ? (x-y) : half(detail::binary, 0);
+	}
+
+	/// Build a NaN from a string code: starts at 0x7FFF and XORs the low 8 bits of each character into it.
+	/// **See also:** Documentation for [std::nan](https://en.cppreference.com/w/cpp/numeric/math/nan).
+	/// \param arg null-terminated string code (must not be a null pointer, it is dereferenced unconditionally)
+	/// \return quiet NaN
+	inline half nanh(const char *arg)
+	{
+		unsigned int bits = 0x7FFF;
+		for(const char *p=arg; *p; ++p)
+			bits ^= static_cast<unsigned>(*p) & 0xFF;
+		return half(detail::binary, bits);
+	}
+
+	/// \}
+	/// \anchor exponential
+	/// \name Exponential functions
+	/// \{
+
+	/// Exponential function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::exp](https://en.cppreference.com/w/cpp/numeric/math/exp).
+	/// \param arg function argument
+	/// \return e raised to \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half exp(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::exp(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate std::exp in detail::internal_t, convert back with the configured rounding
+	#else
+		int abs = arg.data_ & 0x7FFF, e = (abs>>10) + (abs<=0x3FF), exp;	// abs: magnitude bits; e: exponent field (subnormals count as 1)
+		if(!abs)
+			return half(detail::binary, 0x3C00);	// exp(+/-0) = 1
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00&((arg.data_>>15)-1U)) : detail::signal(arg.data_));	// +inf -> +inf, -inf -> +0 (mask is all ones only for a clear sign bit); NaN -> signal()
+		if(abs >= 0x4C80)	// |arg| >= 18: result cannot be represented
+			return half(detail::binary, (arg.data_&0x8000) ? detail::underflow<half::round_style>() : detail::overflow<half::round_style>());
+		detail::uint32 m = detail::multiply64(static_cast<detail::uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29);	// significand times 0xB8AA3B29 (presumably log2(e) in fixed point, turning the problem into 2^t - TODO confirm against detail::multiply64)
+		if(e < 14)
+		{
+			exp = 0;	// no integer part is extracted
+			m >>= 14 - e;
+		}
+		else
+		{
+			exp = m >> (45-e);	// upper bits become the integer part
+			m = (m<<(e-14)) & 0x7FFFFFFF;	// remaining 31 bits are kept as the fractional part
+		}
+		return half(detail::binary, detail::exp2_post<half::round_style>(m, exp, (arg.data_&0x8000)!=0, 0, 26));	// detail::exp2_post (not visible here) receives fraction, integer part and the sign flag
+	#endif
+	}
+
+	/// Binary exponential.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::exp2](https://en.cppreference.com/w/cpp/numeric/math/exp2).
+	/// \param arg function argument
+	/// \return 2 raised to \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half exp2(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::exp2(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate std::exp2 in detail::internal_t, convert back with the configured rounding
+	#else
+		int abs = arg.data_ & 0x7FFF, e = (abs>>10) + (abs<=0x3FF), exp = (abs&0x3FF) + ((abs>0x3FF)<<10);	// abs: magnitude bits; e: exponent field (subnormals count as 1); exp: significand incl. implicit bit
+		if(!abs)
+			return half(detail::binary, 0x3C00);	// 2^(+/-0) = 1
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00&((arg.data_>>15)-1U)) : detail::signal(arg.data_));	// +inf -> +inf, -inf -> +0; NaN -> signal()
+		if(abs >= 0x4E40)	// |arg| >= 25: result cannot be represented
+			return half(detail::binary, (arg.data_&0x8000) ? detail::underflow<half::round_style>() : detail::overflow<half::round_style>());
+		return half(detail::binary, detail::exp2_post<half::round_style>(
+			(static_cast<detail::uint32>(exp)<<(6+e))&0x7FFFFFFF, exp>>(25-e), (arg.data_&0x8000)!=0, 0, 28));	// first argument: low 31 bits of the shifted significand (fraction); second: its upper bits (integer part)
+	#endif
+	}
+
+	/// Exponential minus one.
+	/// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for `std::round_to_nearest` 
+	/// and in <1% of inputs for any other rounding mode.
+	///
+	/// **See also:** Documentation for [std::expm1](https://en.cppreference.com/w/cpp/numeric/math/expm1).
+	/// \param arg function argument
+	/// \return e raised to \a arg and subtracted by 1
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half expm1(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::expm1(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate std::expm1 in detail::internal_t, convert back with the configured rounding
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000, e = (abs>>10) + (abs<=0x3FF), exp;	// abs: magnitude bits; e: exponent field (subnormals count as 1)
+		if(!abs)
+			return arg;	// expm1(+/-0) returns the zero unchanged
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00+(sign>>1)) : detail::signal(arg.data_));	// +inf -> +inf (0x7C00); -inf -> 0x7C00+0x4000 = 0xBC00 (-1); NaN -> signal()
+		if(abs >= 0x4A00)	// |arg| >= 12
+			return half(detail::binary, (arg.data_&0x8000) ? detail::rounded<half::round_style,true>(0xBBFF, 1, 1) : detail::overflow<half::round_style>());	// negative: the value just above -1 (0xBBFF) handed to detail::rounded; positive: overflow
+		detail::uint32 m = detail::multiply64(static_cast<detail::uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29);	// significand times 0xB8AA3B29 (presumably log2(e) in fixed point - TODO confirm against detail::multiply64)
+		if(e < 14)
+		{
+			exp = 0;	// no integer part is extracted
+			m >>= 14 - e;
+		}
+		else
+		{
+			exp = m >> (45-e);	// upper bits become the integer part
+			m = (m<<(e-14)) & 0x7FFFFFFF;	// remaining 31 bits are kept as the fractional part
+		}
+		m = detail::exp2(m);	// detail::exp2 is not visible here; presumably 2^fraction in fixed point
+		if(sign)
+		{
+			int s = 0;
+			if(m > 0x80000000)
+			{
+				++exp;
+				m = detail::divide64(0x80000000, m, s);	// negative argument: the reciprocal is used; s receives an extra flag from detail::divide64
+			}
+			m = 0x80000000 - ((m>>exp)|((m&((static_cast<detail::uint32>(1)<<exp)-1))!=0)|s);	// subtract the shifted value from 0x80000000, with shifted-out bits and s OR-ed into the lowest bit
+			exp = 0;
+		}
+		else
+			m -= (exp<31) ? (0x80000000>>exp) : 1;	// positive argument: subtract the fixed-point 1 scaled down by exp
+		for(exp+=14; m<0x80000000 && exp; m<<=1,--exp) ;	// normalize until the top bit is set or the exponent reaches 0
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>());
+		return half(detail::binary, detail::rounded<half::round_style,true>(sign+(exp<<10)+(m>>21), (m>>20)&1, (m&0xFFFFF)!=0));	// bit 20 and "any of bits 0-19" are passed separately to detail::rounded
+	#endif
+	}
+
+	/// Natural logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log](https://en.cppreference.com/w/cpp/numeric/math/log).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base e
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::log(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate std::log in detail::internal_t, convert back with the configured rounding
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: magnitude bits; exp: exponent, starting from a fixed offset
+		if(!abs)
+			return half(detail::binary, detail::pole(0x8000));	// log(+/-0): detail::pole with the negative sign bit
+		if(arg.data_ & 0x8000)
+			return half(detail::binary, (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));	// negative finite or -inf -> invalid(); negative NaN -> signal()
+		if(abs >= 0x7C00)
+			return (abs==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));	// +inf is returned unchanged; NaN -> signal()
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument
+		exp += abs >> 10;
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(
+			detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 27)+8, exp, 17));	// log2 of the significand plus exponent, rescaled by detail::log2_post (template constant presumably log2(e) in fixed point - TODO confirm)
+	#endif
+	}
+
+	/// Common logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log10](https://en.cppreference.com/w/cpp/numeric/math/log10).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base 10
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log10(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::log10(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate std::log10 in detail::internal_t, convert back with the configured rounding
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: magnitude bits; exp: exponent, starting from a fixed offset
+		if(!abs)
+			return half(detail::binary, detail::pole(0x8000));	// log10(+/-0): detail::pole with the negative sign bit
+		if(arg.data_ & 0x8000)
+			return half(detail::binary, (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));	// negative finite or -inf -> invalid(); negative NaN -> signal()
+		if(abs >= 0x7C00)
+			return (abs==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));	// +inf is returned unchanged; NaN -> signal()
+		switch(abs)	// exact powers of ten are answered directly
+		{
+			case 0x4900: return half(detail::binary, 0x3C00);	// log10(10) = 1
+			case 0x5640: return half(detail::binary, 0x4000);	// log10(100) = 2
+			case 0x63D0: return half(detail::binary, 0x4200);	// log10(1000) = 3
+			case 0x70E2: return half(detail::binary, 0x4400);	// log10(10000) = 4
+		}
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument
+		exp += abs >> 10;
+		return half(detail::binary, detail::log2_post<half::round_style,0xD49A784C>(
+			detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 27)+8, exp, 16));	// log2 of the significand plus exponent, rescaled by detail::log2_post (template constant presumably log2(10) in fixed point - TODO confirm)
+	#endif
+	}
+
+	/// Binary logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log2](https://en.cppreference.com/w/cpp/numeric/math/log2).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base 2
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log2(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::log2(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate std::log2 in detail::internal_t, convert back with the configured rounding
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15, s = 0;	// abs: magnitude bits; exp: exponent from a fixed offset; s: collects bits shifted out below
+		if(!abs)
+			return half(detail::binary, detail::pole(0x8000));	// log2(+/-0): detail::pole with the negative sign bit
+		if(arg.data_ & 0x8000)
+			return half(detail::binary, (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));	// negative finite or -inf -> invalid(); negative NaN -> signal()
+		if(abs >= 0x7C00)
+			return (abs==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));	// +inf is returned unchanged; NaN -> signal()
+		if(abs == 0x3C00)
+			return half(detail::binary, 0);	// log2(1) = +0
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument
+		exp += (abs>>10);
+		if(!(abs&0x3FF))	// mantissa bits all zero: the argument is an exact power of two
+		{
+			unsigned int value = static_cast<unsigned>(exp<0) << 15, m = std::abs(exp) << 6;	// value: sign of the integer result; m: its magnitude, pre-shifted
+			for(exp=18; m<0x400; m<<=1,--exp) ;	// normalize the integer into a half significand
+			return half(detail::binary, value+(exp<<10)+m);
+		}
+		detail::uint32 ilog = exp, sign = detail::sign_mask(ilog), m = 
+			(((ilog<<27)+(detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 28)>>4))^sign) - sign;	// exponent plus log2 of the significand in fixed point; (v^sign)-sign negates v when sign is all ones
+		if(!m)
+			return half(detail::binary, 0);
+		for(exp=14; m<0x8000000 && exp; m<<=1,--exp) ;	// normalize upwards until bit 27 is set or the exponent reaches 0
+		for(; m>0xFFFFFFF; m>>=1,++exp)	// normalize downwards if the value exceeds 28 bits
+			s |= m & 1;	// remember any bit shifted out
+		return half(detail::binary, detail::fixed2half<half::round_style,27,false,false,true>(m, exp, sign&0x8000, s));	// final rounding/packing is delegated to detail::fixed2half (not visible here)
+	#endif
+	}
+
+	/// Natural logarithm plus one.
+	/// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for `std::round_to_nearest` 
+	/// and in ~1% of inputs for any other rounding mode.
+	///
+	/// **See also:** Documentation for [std::log1p](https://en.cppreference.com/w/cpp/numeric/math/log1p).
+	/// \param arg function argument
+	/// \return logarithm of \a arg plus 1 to base e
+	/// \exception FE_INVALID for signaling NaN or argument <-1
+	/// \exception FE_DIVBYZERO for -1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log1p(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::log1p(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate std::log1p in detail::internal_t, convert back with the configured rounding
+	#else
+		if(arg.data_ >= 0xBC00)	// bit pattern of -1 (0xBC00) or above: arg <= -1, -inf or a negative NaN
+			return half(detail::binary, (arg.data_==0xBC00) ? detail::pole(0x8000) : (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));	// -1 -> pole(); below -1 incl. -inf -> invalid(); negative NaN -> signal()
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: magnitude bits; exp: exponent, starting from a fixed offset
+		if(!abs || abs >= 0x7C00)
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// +/-0 and +inf are returned unchanged; NaN -> signal()
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument
+		exp += abs >> 10;
+		detail::uint32 m = static_cast<detail::uint32>((abs&0x3FF)|0x400) << 20;	// significand with the implicit bit at bit 30
+		if(arg.data_ & 0x8000)	// -1 < arg < 0, so exp is negative here
+		{
+			m = 0x40000000 - (m>>-exp);	// 1 - |arg| in fixed point (0x40000000 represents 1)
+			for(exp=0; m<0x40000000; m<<=1,--exp) ;	// renormalize the result
+		}
+		else
+		{
+			if(exp < 0)	// 0 < arg < 1
+			{
+				m = 0x40000000 + (m>>-exp);	// 1 + arg in fixed point
+				exp = 0;
+			}
+			else	// arg >= 1
+			{
+				m += 0x40000000 >> exp;	// add 1, scaled to the argument's exponent
+				int i = m >> 31;	// 1 if the addition carried into bit 31
+				m >>= i;
+				exp += i;
+			}
+		}
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(detail::log2(m), exp, 17));	// same detail::log2 / detail::log2_post pipeline and template constant as log()
+	#endif
+	}
+
+	/// \}
+	/// \anchor power
+	/// \name Power functions
+	/// \{
+
+	/// Square root.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sqrt](https://en.cppreference.com/w/cpp/numeric/math/sqrt).
+	/// \param arg function argument
+	/// \return square root of \a arg
+	/// \exception FE_INVALID for signaling NaN and negative arguments
+	/// \exception FE_INEXACT according to rounding
+	inline half sqrt(half arg)	// square root; works on the raw binary16 bits unless a native arithmetic type is configured
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::sqrt in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = 15;	// abs: magnitude bits; exp starts at the binary16 exponent bias
+		if(!abs || arg.data_ >= 0x7C00)	// +/-0, +inf, NaN, or any value with the sign bit set
+			return half(detail::binary, (abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_>0x8000) ? detail::invalid() : arg.data_);	// NaN -> signal(); negative non-zero (incl. -inf) -> invalid(); +/-0 and +inf are returned unchanged
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize subnormals until the implicit-one position (0x400) is set
+		detail::uint32 r = static_cast<detail::uint32>((abs&0x3FF)|0x400) << 10, m = detail::sqrt<20>(r, exp+=abs>>10);	// NOTE(review): r and exp are read again below, so detail::sqrt presumably updates both in place (remainder and result exponent) - confirm
+		return half(detail::binary, detail::rounded<half::round_style,false>((exp<<10)+(m&0x3FF), r>m, r!=0));	// reassemble exponent field and 10 fraction bits; r>m and r!=0 presumably serve as guard and sticky inputs to rounded() - confirm
+	#endif
+	}
+
+	/// Inverse square root.
+	/// This function is exact to rounding for all rounding modes and thus generally more accurate than directly computing 
+	/// 1 / sqrt(\a arg) in half-precision, in addition to also being faster.
+	/// \param arg function argument
+	/// \return reciprocal of square root of \a arg
+	/// \exception FE_INVALID for signaling NaN and negative arguments
+	/// \exception FE_INEXACT according to rounding
+	inline half rsqrt(half arg)	// 1 / sqrt(arg); works on the raw binary16 bits unless a native arithmetic type is configured
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: reciprocal of std::sqrt in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(detail::internal_t(1)/std::sqrt(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, bias = 0x4000;	// abs: magnitude bits; bias is added to the exponent field further down
+		if(!abs || arg.data_ >= 0x7C00)	// +/-0, +inf, NaN, or any value with the sign bit set
+			return half(detail::binary,	(abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_>0x8000) ?	// NaN -> signal(); negative non-zero -> invalid() ...
+										detail::invalid() : !abs ? detail::pole(arg.data_&0x8000) : 0);	// ... +/-0 -> pole() with the zero's sign; +inf -> +0
+		for(; abs<0x400; abs<<=1,bias-=0x400) ;	// normalize subnormals; each shift lowers the exponent contribution by one
+		unsigned int frac = (abs+=bias) & 0x7FF;	// lowest exponent bit together with the 10 fraction bits
+		if(frac == 0x400)	// zero fraction with that exponent bit set, e.g. 1.0 or 4.0: the result is an exact power of two
+			return half(detail::binary, 0x7A00-(abs>>1));	// result built directly by halving and negating the exponent (1.0 -> 1.0, 4.0 -> 0.5)
+		if((half::round_style == std::round_to_nearest && (frac == 0x3FE || frac == 0x76C)) ||	// NOTE(review): hard-coded inputs, presumably those the iteration below would misround - confirm
+		   (half::round_style != std::round_to_nearest && (frac == 0x15A || frac == 0x3FC || frac == 0x401 || frac == 0x402 || frac == 0x67B)))
+			return pow(arg, half(detail::binary, 0xB800));	// fall back to pow(arg, -0.5); 0xB800 is -0.5
+		detail::uint32 f = 0x17376 - abs, mx = (abs&0x3FF) | 0x400, my = ((f>>1)&0x3FF) | 0x400, mz = my * my;	// f: initial estimate from a magic constant; mx: input significand; my: estimate significand; mz: my squared
+		int expy = (f>>11) - 31, expx = 32 - (abs>>10), i = mz >> 21;	// exponents of estimate and input; i is 1 if my*my reached bit 21
+		for(mz=0x60000000-(((mz>>i)*mx)>>(expx-2*expy-i)); mz<0x40000000; mz<<=1,--expy) ;	// NOTE(review): looks like a Newton-Raphson correction factor (1.5 - 0.5*x*y^2 scaled), then renormalized - confirm
+		i = (my*=mz>>10) >> 31;	// apply the correction to the estimate; i is 1 if the product reached bit 31
+		expy += i;
+		my = (my>>(20+i)) + 1;	// reduce the refined estimate to a short significand again (plus one unit)
+		i = (mz=my*my) >> 21;	// second refinement step: square the refined estimate
+		for(mz=0x60000000-(((mz>>i)*mx)>>(expx-2*expy-i)); mz<0x40000000; mz<<=1,--expy) ;	// same correction factor computation as in the first step
+		i = (my*=(mz>>10)+1) >> 31;	// apply the second correction; i is 1 if the product reached bit 31
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,false,true>(my>>i, expy+i+14));	// convert the fixed-point result with its exponent to binary16 using the active rounding mode
+	#endif
+	}
+
+	/// Cubic root.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cbrt](https://en.cppreference.com/w/cpp/numeric/math/cbrt).
+	/// \param arg function argument
+	/// \return cubic root of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT according to rounding
+	inline half cbrt(half arg)	// cube root; the bit-level path evaluates it as exp2(log2|arg| / 3) with the sign of arg restored
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH	// native path: std::cbrt in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::cbrt(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: magnitude bits; exp starts at minus the binary16 exponent bias
+		if(!abs || abs == 0x3C00 || abs >= 0x7C00)	// +/-0, +/-1, +/-inf or NaN
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// NaN goes through signal(); the other inputs are their own cube root
+		for(; abs<0x400; abs<<=1, --exp);	// normalize subnormals until the implicit-one position (0x400) is set
+		detail::uint32 ilog = exp + (abs>>10), sign = detail::sign_mask(ilog), f, m = 	// ilog: unbiased exponent; NOTE(review): sign_mask presumably yields all ones for a negative exponent, else 0 - confirm
+			(((ilog<<27)+(detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 24)>>4))^sign) - sign;	// fixed-point log2|arg| (integer part above bit 27); the xor/subtract with the mask takes its magnitude
+		for(exp=2; m<0x80000000; m<<=1,--exp) ;	// normalize the logarithm so that bit 31 is set, tracking its exponent
+		m = detail::multiply64(m, 0xAAAAAAAB);	// NOTE(review): 0xAAAAAAAB is about 2^33/3, so this presumably scales the logarithm by one third - confirm multiply64 scaling
+		int i = m >> 31, s;	// i: 1 if the product still has bit 31 set; s receives an output of divide64 below
+		exp += i;
+		m <<= 1 - i;	// renormalize the product
+		if(exp < 0)	// scaled logarithm below 1: it is all fraction
+		{
+			f = m >> -exp;
+			exp = 0;
+		}
+		else	// split the scaled logarithm into integer part (exp) and fraction (f)
+		{
+			f = (m<<exp) & 0x7FFFFFFF;
+			exp = m >> (31-exp);
+		}
+		m = detail::exp2(f, (half::round_style==std::round_to_nearest) ? 29 : 26);	// 2^fraction; the second argument depends on the rounding mode (presumably an iteration/precision count - confirm)
+		if(sign)	// the logarithm was negative, so the result is the reciprocal
+		{
+			if(m > 0x80000000)	// significand above 1.0: invert it and account for that in the exponent
+			{
+				m = detail::divide64(0x80000000, m, s);
+				++exp;
+			}
+			exp = -exp;
+		}
+		return half(detail::binary, (half::round_style==std::round_to_nearest) ?	// final conversion carries the sign bit of arg
+			detail::fixed2half<half::round_style,31,false,false,false>(m, exp+14, arg.data_&0x8000) :	// round-to-nearest: convert the full 31-bit significand
+			detail::fixed2half<half::round_style,23,false,false,false>((m+0x80)>>8, exp+14, arg.data_&0x8000));	// other modes: first reduce to 23 bits, adding 0x80 (half of the dropped range) before the shift
+	#endif
+	}
+
+	/// Hypotenuse function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot).
+	/// \param x first argument
+	/// \param y second argument
+	/// \return square root of sum of squares without internal over- or underflows
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root
+	inline half hypot(half x, half y)	// sqrt(x*x + y*y); the bit-level path adds the squared significands in fixed point
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: work in the internal arithmetic type
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_);
+		#if HALF_ENABLE_CPP11_CMATH	// std::hypot is only used when C++11 <cmath> is enabled
+			return half(detail::binary, detail::float2half<half::round_style>(std::hypot(fx, fy)));
+		#else
+			return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(fx*fx+fy*fy)));
+		#endif
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, expx = 0, expy = 0;	// magnitudes (signs are irrelevant) and per-operand exponent adjustments
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinity or NaN
+			return half(detail::binary,	(absx==0x7C00) ? detail::select(0x7C00, y.data_) :	// an infinite operand selects +inf; NOTE(review): select() presumably still signals if the other operand is a signaling NaN - confirm
+				(absy==0x7C00) ? detail::select(0x7C00, x.data_) : detail::signal(x.data_, y.data_));	// no infinity involved: NaN handling via signal()
+		if(!absx)	// x is zero: the result is |y| (or +0 if both are zero)
+			return half(detail::binary, absy ? detail::check_underflow(absy) : 0);
+		if(!absy)	// y is zero: the result is |x|
+			return half(detail::binary, detail::check_underflow(absx));
+		if(absy > absx)	// keep the larger magnitude in x
+			std::swap(absx, absy);
+		for(; absx<0x400; absx<<=1,--expx) ;	// normalize subnormal x
+		for(; absy<0x400; absy<<=1,--expy) ;	// normalize subnormal y
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400;	// 11-bit significands with explicit leading one
+		mx *= mx;	// squared significands occupy up to 22 bits
+		my *= my;
+		int ix = mx >> 21, iy = my >> 21;	// 1 if the square reached bit 21 (i.e. is >= 2.0)
+		expx = 2*(expx+(absx>>10)) - 15 + ix;	// exponent of x*x
+		expy = 2*(expy+(absy>>10)) - 15 + iy;	// exponent of y*y
+		mx <<= 10 - ix;	// move the leading one of each square to bit 30
+		my <<= 10 - iy;
+		int d = expx - expy;	// how far y*y must be shifted right to line up with x*x
+		my = (d<30) ? ((my>>d)|((my&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;	// aligned y*y with a sticky bit for the shifted-out part; only the sticky bit remains when d >= 30
+		return half(detail::binary, detail::hypot_post<half::round_style>(mx+my, expx));	// NOTE(review): hypot_post presumably takes the square root of the sum and rounds - confirm
+	#endif
+	}
+
+	/// Hypotenuse function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot).
+	/// \param x first argument
+	/// \param y second argument
+	/// \param z third argument
+	/// \return square root of sum of squares without internal over- or underflows
+	/// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root
+	inline half hypot(half x, half y, half z)	// sqrt(x*x + y*y + z*z); the bit-level path adds the three squared significands in fixed point
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: plain sum of squares in the internal arithmetic type
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_), fz = detail::half2float<detail::internal_t>(z.data_);
+		return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(fx*fx+fy*fy+fz*fz)));
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, expx = 0, expy = 0, expz = 0;	// magnitudes and per-operand exponent adjustments
+		if(!absx)	// a zero operand reduces the problem to the two-argument overload
+			return hypot(y, z);
+		if(!absy)
+			return hypot(x, z);
+		if(!absz)
+			return hypot(x, y);
+		if(absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00)	// at least one operand is infinity or NaN
+			return half(detail::binary,	(absx==0x7C00) ? detail::select(0x7C00, detail::select(y.data_, z.data_)) :	// an infinite operand selects +inf; the nested select() combines the remaining two operands
+										(absy==0x7C00) ? detail::select(0x7C00, detail::select(x.data_, z.data_)) :
+										(absz==0x7C00) ? detail::select(0x7C00, detail::select(x.data_, y.data_)) :
+										detail::signal(x.data_, y.data_, z.data_));	// no infinity involved: NaN handling via signal()
+		if(absz > absy)	// three compare-and-swap steps order the magnitudes as absx >= absy >= absz
+			std::swap(absy, absz);
+		if(absy > absx)
+			std::swap(absx, absy);
+		if(absz > absy)
+			std::swap(absy, absz);
+		for(; absx<0x400; absx<<=1,--expx) ;	// normalize subnormal operands
+		for(; absy<0x400; absy<<=1,--expy) ;
+		for(; absz<0x400; absz<<=1,--expz) ;
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400, mz = (absz&0x3FF) | 0x400;	// 11-bit significands with explicit leading one
+		mx *= mx;	// squared significands occupy up to 22 bits
+		my *= my;
+		mz *= mz;
+		int ix = mx >> 21, iy = my >> 21, iz = mz >> 21;	// 1 if the square reached bit 21 (i.e. is >= 2.0)
+		expx = 2*(expx+(absx>>10)) - 15 + ix;	// exponents of the three squares
+		expy = 2*(expy+(absy>>10)) - 15 + iy;
+		expz = 2*(expz+(absz>>10)) - 15 + iz;
+		mx <<= 10 - ix;	// move the leading one of each square to bit 30
+		my <<= 10 - iy;
+		mz <<= 10 - iz;
+		int d = expy - expz;	// first add the two smaller squares: align z*z to y*y
+		mz = (d<30) ? ((mz>>d)|((mz&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;	// aligned z*z with a sticky bit for the shifted-out part
+		my += mz;
+		if(my & 0x80000000)	// the partial sum carried into bit 31
+		{
+			my = (my>>1) | (my&1);	// shift back down, folding the dropped bit into the lowest bit as sticky
+			if(++expy > expx)	// the partial sum now has the larger exponent: exchange roles so x stays the bigger term
+			{
+				std::swap(mx, my);
+				std::swap(expx, expy);
+			}
+		}
+		d = expx - expy;	// align the partial sum to x*x
+		my = (d<30) ? ((my>>d)|((my&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;	// same sticky-preserving shift as above
+		return half(detail::binary, detail::hypot_post<half::round_style>(mx+my, expx));	// NOTE(review): hypot_post presumably takes the square root of the sum and rounds - confirm
+	#endif
+	}
+
+	/// Power function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in ~0.00025% of inputs.
+	///
+	/// **See also:** Documentation for [std::pow](https://en.cppreference.com/w/cpp/numeric/math/pow).
+	/// \param x base
+	/// \param y exponent
+	/// \return \a x raised to \a y
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN or if \a x is finite and negative and \a y is finite and not integral
+	/// \exception FE_DIVBYZERO if \a x is 0 and \a y is negative
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half pow(half x, half y)	// x raised to y; the bit-level path evaluates exp2(y * log2|x|) after handling the special cases
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::pow in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::pow(detail::half2float<detail::internal_t>(x.data_), detail::half2float<detail::internal_t>(y.data_))));
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -15;	// magnitudes; exp starts at minus the binary16 exponent bias
+		if(!absy || x.data_ == 0x3C00)	// y is +/-0 or x is exactly +1
+			return half(detail::binary, detail::select(0x3C00, (x.data_==0x3C00) ? y.data_ : x.data_));	// select() between 1.0 (0x3C00) and the other operand; NOTE(review): presumably yields 1 unless that operand is a signaling NaN - confirm
+		bool is_int = absy >= 0x6400 || (absy>=0x3C00 && !(absy&((1<<(25-(absy>>10)))-1)));	// y is integral: |y| >= 1024 (0x6400), or |y| >= 1 with no fraction bits below the units position
+		unsigned int sign = x.data_ & (static_cast<unsigned>((absy<0x6800)&&is_int&&((absy>>(25-(absy>>10)))&1))<<15);	// result sign: the sign bit of x, kept only when y is an odd integer (from 0x6800 = 2048 on, every value is even)
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinity or NaN
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :	// any NaN: handled by signal()
+										(absy==0x7C00) ? ((absx==0x3C00) ? 0x3C00 : (!absx && y.data_==0xFC00) ? detail::pole() :	// infinite y: |x| == 1 -> 1; zero to the power -inf -> pole()
+										(0x7C00&-((y.data_>>15)^(absx>0x3C00)))) : (sign|(0x7C00&((y.data_>>15)-1U))));	// otherwise +inf or +0 depending on sign of y and |x| > 1; infinite x: signed inf for y > 0, signed zero for y < 0
+		if(!absx)	// x is zero
+			return half(detail::binary, (y.data_&0x8000) ? detail::pole(sign) : sign);	// negative y -> pole(); positive y -> zero carrying the computed sign
+		if((x.data_&0x8000) && !is_int)	// negative base with a non-integral exponent
+			return half(detail::binary, detail::invalid());
+		if(x.data_ == 0xBC00)	// x is -1: result is +/-1 according to the computed sign
+			return half(detail::binary, sign|0x3C00);
+		switch(y.data_)	// shortcuts for a few exponents that have a direct evaluation
+		{
+			case 0x3800: return sqrt(x);	// y == 0.5
+			case 0x3C00: return half(detail::binary, detail::check_underflow(x.data_));	// y == 1
+			case 0x4000: return x * x;	// y == 2
+			case 0xBC00: return half(detail::binary, 0x3C00) / x;	// y == -1
+		}
+		for(; absx<0x400; absx<<=1,--exp) ;	// normalize subnormal x
+		detail::uint32 ilog = exp + (absx>>10), msign = detail::sign_mask(ilog), f, m = 	// ilog: unbiased exponent of x; NOTE(review): sign_mask presumably yields all ones for a negative exponent, else 0 - confirm
+			(((ilog<<27)+((detail::log2(static_cast<detail::uint32>((absx&0x3FF)|0x400)<<20)+8)>>4))^msign) - msign;	// fixed-point log2|x| (integer part above bit 27, +8 rounds before the shift); the xor/subtract with the mask takes its magnitude
+		for(exp=-11; m<0x80000000; m<<=1,--exp) ;	// normalize the logarithm so that bit 31 is set, tracking its exponent
+		for(; absy<0x400; absy<<=1,--exp) ;	// normalize subnormal y, folding the shifts into the same exponent
+		m = detail::multiply64(m, static_cast<detail::uint32>((absy&0x3FF)|0x400)<<21);	// |y| * |log2|x||, with the significand of y moved up to bit 31
+		int i = m >> 31;	// 1 if the product still has bit 31 set
+		exp += (absy>>10) + i;	// add the exponent field of y
+		m <<= 1 - i;	// renormalize the product
+		if(exp < 0)	// product below 1: it is all fraction
+		{
+			f = m >> -exp;
+			exp = 0;
+		}
+		else	// split the product into integer part (exp) and fraction (f)
+		{
+			f = (m<<exp) & 0x7FFFFFFF;
+			exp = m >> (31-exp);
+		}
+		return half(detail::binary, detail::exp2_post<half::round_style>(f, exp, ((msign&1)^(y.data_>>15))!=0, sign));	// the flag is set when the signs of log2|x| and y differ, i.e. the overall exponent is negative; sign is the result sign
+	#endif
+	}
+
+	/// \}
+	/// \anchor trigonometric
+	/// \name Trigonometric functions
+	/// \{
+
+	/// Compute sine and cosine simultaneously.
+	///	This returns the same results as sin() and cos() but is faster than calling each function individually.
+	///
+	/// This function is exact to rounding for all rounding modes.
+	/// \param arg function argument
+	/// \param sin variable to take sine of \a arg
+	/// \param cos variable to take cosine of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline void sincos(half arg, half *sin, half *cos)	// writes sin(arg) to *sin and cos(arg) to *cos; both pointers are dereferenced unconditionally
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::sin and std::cos in the internal arithmetic type
+		detail::internal_t f = detail::half2float<detail::internal_t>(arg.data_);
+		*sin = half(detail::binary, detail::float2half<half::round_style>(std::sin(f)));
+		*cos = half(detail::binary, detail::float2half<half::round_style>(std::cos(f)));
+	#else
+		int abs = arg.data_ & 0x7FFF, sign = arg.data_ >> 15, k;	// abs: magnitude bits; sign: 1 for negative arg; k is filled in by angle_arg() below
+		if(abs >= 0x7C00)	// infinity or NaN: both outputs get the same value
+			*sin = *cos = half(detail::binary, (abs==0x7C00) ? detail::invalid() : detail::signal(arg.data_));	// infinity -> invalid(); NaN -> signal()
+		else if(!abs)	// +/-0: sin keeps the signed zero, cos is exactly 1
+		{
+			*sin = arg;
+			*cos = half(detail::binary, 0x3C00);
+		}
+		else if(abs < 0x2500)	// tiny argument: no series evaluation needed
+		{
+			*sin = half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));	// rounds from the value one step below arg in magnitude; NOTE(review): the two trailing 1s are presumably guard and sticky bits - confirm
+			*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1));	// rounds from 0x3BFF, the largest value below 1.0
+		}
+		else
+		{
+			if(half::round_style != std::round_to_nearest)	// hard-coded results for four inputs, used only with directed rounding modes
+			{
+				switch(abs)	// NOTE(review): presumably inputs the generic path would misround in these modes - confirm
+				{
+				case 0x48B7:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x1D07, 1, 1));	// sine has the opposite sign of arg here
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0xBBFF, 1, 1));
+					return;
+				case 0x598C:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1));	// sine has the sign of arg here
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x80FC, 1, 1));
+					return;
+				case 0x6A64:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x3BFE, 1, 1));	// opposite sign of arg
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x27FF, 1, 1));
+					return;
+				case 0x6D8C:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x0FE6, 1, 1));	// sign of arg
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1));
+					return;
+				}
+			}
+			std::pair<detail::uint32,detail::uint32> sc = detail::sincos(detail::angle_arg(abs, k), 28);	// NOTE(review): angle_arg presumably reduces |arg| and stores the quadrant in k; sc holds (sine, cosine) in fixed point - confirm
+			switch(k & 3)	// rotate the pair by k quarter turns; the unary minus on uint32 negates in two's complement
+			{
+				case 1: sc = std::make_pair(sc.second, -sc.first); break;	// sin -> cos, cos -> -sin
+				case 2: sc = std::make_pair(-sc.first, -sc.second); break;	// both negated
+				case 3: sc = std::make_pair(-sc.second, sc.first); break;	// sin -> -cos, cos -> sin
+			}
+			*sin = half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((sc.first^-static_cast<detail::uint32>(sign))+sign));	// sine is odd: negate the value (xor with all ones, plus one) when arg is negative
+			*cos = half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>(sc.second));	// cosine is even: the sign of arg is ignored
+		}
+	#endif
+	}
+
+	/// Sine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sin](https://en.cppreference.com/w/cpp/numeric/math/sin).
+	/// \param arg function argument
+	/// \return sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half sin(half arg)	// sine of arg; bit-level evaluation unless a native arithmetic type is configured
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::sin in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::sin(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int magnitude = arg.data_ & 0x7FFF;	// argument without its sign bit
+		int quadrant;	// filled in by detail::angle_arg() below
+		if(magnitude >= 0x7C00)	// NaN is handed to signal(), infinity is invalid
+			return half(detail::binary, (magnitude!=0x7C00) ? detail::signal(arg.data_) : detail::invalid());
+		if(!magnitude)	// signed zero is returned unchanged
+			return arg;
+		if(magnitude < 0x2900)	// tiny argument: round from the value one step below arg in magnitude
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));
+		if(half::round_style != std::round_to_nearest && magnitude == 0x48B7)	// hard-coded results for directed rounding modes
+			return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x1D07, 1, 1));
+		if(half::round_style != std::round_to_nearest && magnitude == 0x6A64)
+			return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x3BFE, 1, 1));
+		if(half::round_style != std::round_to_nearest && magnitude == 0x6D8C)
+			return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x0FE6, 1, 1));
+		const std::pair<detail::uint32,detail::uint32> parts = detail::sincos(detail::angle_arg(magnitude, quadrant), 28);	// fixed-point (first, second) pair for the reduced angle
+		const detail::uint32 negate = -static_cast<detail::uint32>((arg.data_>>15)^((quadrant>>1)&1)), value = (quadrant&1) ? parts.second : parts.first;	// negate: all ones when the result must change sign; value: member picked by quadrant parity
+		return half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((value^negate) - negate));	// conditional two's complement negation, then conversion
+	#endif
+	}
+
+	/// Cosine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cos](https://en.cppreference.com/w/cpp/numeric/math/cos).
+	/// \param arg function argument
+	/// \return cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half cos(half arg)	// cosine of arg; bit-level evaluation unless a native arithmetic type is configured
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::cos in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::cos(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int quadrant, magnitude = arg.data_ & 0x7FFF;	// quadrant is filled in by detail::angle_arg() below; magnitude drops the sign bit
+		if(magnitude >= 0x7C00)	// NaN is handed to signal(), infinity is invalid
+			return half(detail::binary, (magnitude!=0x7C00) ? detail::signal(arg.data_) : detail::invalid());
+		if(!magnitude)	// cos(+/-0) is exactly 1
+			return half(detail::binary, 0x3C00);
+		if(half::round_style != std::round_to_nearest && magnitude == 0x598C)	// hard-coded result for directed rounding modes
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x80FC, 1, 1));
+		if(magnitude < 0x2500)	// tiny argument: round from 0x3BFF, the largest value below 1.0
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1));
+		const std::pair<detail::uint32,detail::uint32> parts = detail::sincos(detail::angle_arg(magnitude, quadrant), 28);	// fixed-point (first, second) pair for the reduced angle
+		const detail::uint32 negate = -static_cast<detail::uint32>((quadrant^(quadrant>>1))&1), value = (quadrant&1) ? parts.first : parts.second;	// negate: all ones when the two low quadrant bits differ; value: member picked by quadrant parity
+		return half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((value^negate) - negate));	// conditional two's complement negation, then conversion
+	#endif
+	}
+
+	/// Tangent function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::tan](https://en.cppreference.com/w/cpp/numeric/math/tan).
+	/// \param arg function argument
+	/// \return tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tan(half arg)	// tangent of arg; the bit-level path forms the quotient of the fixed-point sine and cosine
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::tan in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::tan(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = 13, k;	// abs: magnitude bits; exp: running exponent of the quotient; k is filled in by angle_arg() below
+		if(!abs)	// signed zero is returned unchanged
+			return arg;
+		if(abs >= 0x7C00)	// infinity -> invalid(); NaN -> signal()
+			return half(detail::binary, (abs==0x7C00) ? detail::invalid() : detail::signal(arg.data_));
+		if(abs < 0x2700)	// tiny argument: round starting from arg itself
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1));	// NOTE(review): the trailing 0, 1 are presumably guard and sticky bits - confirm
+		if(half::round_style != std::round_to_nearest)	// hard-coded results for two inputs, used only with directed rounding modes
+			switch(abs)
+			{
+				case 0x658C: return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x07E6, 1, 1));	// result keeps the sign of arg
+				case 0x7330: return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x4B62, 1, 1));	// result has the opposite sign of arg
+			}
+		std::pair<detail::uint32,detail::uint32> sc = detail::sincos(detail::angle_arg(abs, k), 30);	// NOTE(review): presumably (sine, cosine) of the reduced angle, with the quadrant stored in k - confirm
+		if(k & 1)	// odd quadrant: replace (s, c) by (-c, s)
+			sc = std::make_pair(-sc.second, sc.first);
+		detail::uint32 signy = detail::sign_mask(sc.first), signx = detail::sign_mask(sc.second);	// sign masks of numerator and denominator
+		detail::uint32 my = (sc.first^signy) - signy, mx = (sc.second^signx) - signx;	// magnitudes of numerator and denominator via xor/subtract with the masks
+		for(; my<0x80000000; my<<=1,--exp) ;	// normalize the numerator to bit 31; each shift lowers the quotient exponent
+		for(; mx<0x80000000; mx<<=1,++exp) ;	// normalize the denominator to bit 31; each shift raises the quotient exponent
+		return half(detail::binary, detail::tangent_post<half::round_style>(my, mx, exp, (signy^signx^arg.data_)&0x8000));	// sign bit combines both operand signs with the sign of arg; NOTE(review): tangent_post presumably divides my by mx and rounds - confirm
+	#endif
+	}
+
+	/// Arc sine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::asin](https://en.cppreference.com/w/cpp/numeric/math/asin).
+	/// \param arg function argument
+	/// \return arc sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half asin(half arg)	// arc sine of arg; bit-level evaluation unless a native arithmetic type is configured
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::asin in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::asin(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int magnitude = arg.data_ & 0x7FFF, signbit = arg.data_ & 0x8000;	// argument split into magnitude bits and sign bit
+		if(magnitude >= 0x3C00)	// |arg| >= 1, infinity or NaN
+			return half(detail::binary, (magnitude==0x3C00) ? detail::rounded<half::round_style,true>(signbit|0x3E48, 0, 1) :	// |arg| == 1: rounded from 0x3E48 (about pi/2) with the sign of arg
+										(magnitude<=0x7C00) ? detail::invalid() : detail::signal(arg.data_));	// 1 < |arg| <= inf is invalid; NaN is handed to signal()
+		if(!magnitude)	// signed zero is returned unchanged
+			return arg;
+		if(half::round_style != std::round_to_nearest && (magnitude == 0x2DC3 || magnitude == 0x2B44))	// hard-coded results for directed rounding modes
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_+1, 1, 1));
+		if(magnitude < 0x2900)	// tiny argument: round starting from arg itself
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1));
+		std::pair<detail::uint32,detail::uint32> operands = detail::atan2_args(magnitude);	// operand pair for the fixed-point arc tangent
+		detail::uint32 angle = detail::atan2(operands.first, operands.second, (half::round_style!=std::round_to_nearest) ? 26 : 27);	// last argument depends on the rounding mode
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,true,true>(angle, 14, signbit));	// convert and reattach the sign of arg
+	#endif
+	}
+
+	/// Arc cosine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::acos](https://en.cppreference.com/w/cpp/numeric/math/acos).
+	/// \param arg function argument
+	/// \return arc cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half acos(half arg)	// arc cosine of arg; bit-level evaluation unless a native arithmetic type is configured
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::acos in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::acos(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int magnitude = arg.data_ & 0x7FFF, negative = arg.data_ >> 15;	// magnitude bits and a 0/1 flag for the sign
+		if(magnitude >= 0x3C00)	// |arg| >= 1, infinity or NaN
+			return half(detail::binary,	(magnitude==0x3C00) ? (negative ? detail::rounded<half::round_style,true>(0x4248, 0, 1) : 0) :	// -1: rounded from 0x4248 (about pi); +1: exactly +0
+										(magnitude<=0x7C00) ? detail::invalid() : detail::signal(arg.data_));	// 1 < |arg| <= inf is invalid; NaN is handed to signal()
+		if(!magnitude)	// acos(+/-0): rounded from 0x3E48 (about pi/2)
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x3E48, 0, 1));
+		std::pair<detail::uint32,detail::uint32> operands = detail::atan2_args(magnitude);	// operand pair for the fixed-point arc tangent
+		detail::uint32 angle = detail::atan2(operands.second, operands.first, 28);	// the pair members are passed in swapped order here
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,true,true>(negative ? (0xC90FDAA2-angle) : angle, 15, 0, negative));	// negative arg: subtract from 0xC90FDAA2 (pi * 2^30)
+	#endif
+	}
+
+	/// Arc tangent function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::atan](https://en.cppreference.com/w/cpp/numeric/math/atan).
+	/// \param arg function argument
+	/// \return arc tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atan(half arg)	// arc tangent of arg; the bit-level path feeds a scaled numerator/denominator pair to detail::atan2
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::atan in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::atan(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000;	// magnitude bits and sign bit of the argument
+		if(!abs)	// signed zero is returned unchanged
+			return arg;
+		if(abs >= 0x7C00)	// infinity: rounded from 0x3E48 (about pi/2) with the sign of arg; NaN -> signal()
+			return half(detail::binary, (abs==0x7C00) ? detail::rounded<half::round_style,true>(sign|0x3E48, 0, 1) : detail::signal(arg.data_));
+		if(abs <= 0x2700)	// tiny argument: round from the value one step below arg in magnitude
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));
+		int exp = (abs>>10) + (abs<=0x3FF);	// exponent field; subnormals are treated as having exponent field 1
+		detail::uint32 my = (abs&0x3FF) | ((abs>0x3FF)<<10);	// significand; the implicit one is added only for normal numbers
+		detail::uint32 m = (exp>15) ?	detail::atan2(my<<19, 0x20000000>>(exp-15), (half::round_style==std::round_to_nearest) ? 26 : 24) :	// |arg| >= 2: keep the numerator fixed and scale the denominator down
+										detail::atan2(my<<(exp+4), 0x20000000, (half::round_style==std::round_to_nearest) ? 30 : 28);	// |arg| < 2: scale the numerator against a fixed denominator
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,true,true>(m, 14, sign));	// convert the fixed-point angle and reattach the sign of arg
+	#endif
+	}
+
+	/// Arc tangent function.
+	/// This function may be 1 ULP off the correctly rounded exact result in ~0.005% of inputs for `std::round_to_nearest`, 
+	/// in ~0.1% of inputs for `std::round_toward_zero` and in ~0.02% of inputs for any other rounding mode.
+	///
+	/// **See also:** Documentation for [std::atan2](https://en.cppreference.com/w/cpp/numeric/math/atan2).
+	/// \param y numerator
+	/// \param x denominator
+	/// \return arc tangent value
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atan2(half y, half x)	// arc tangent of y/x using the signs of both arguments to select the quadrant
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::atan2 in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::atan2(detail::half2float<detail::internal_t>(y.data_), detail::half2float<detail::internal_t>(x.data_))));
+	#else
+		unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, signx = x.data_ >> 15, signy = y.data_ & 0x8000;	// magnitudes; signx is a 0/1 flag, signy is kept as the sign bit itself
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinity or NaN
+		{
+			if(absx > 0x7C00 || absy > 0x7C00)	// any NaN: handled by signal()
+				return half(detail::binary, detail::signal(x.data_, y.data_));
+			if(absy == 0x7C00)	// y is infinite
+				return half(detail::binary, (absx<0x7C00) ?	detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1) :	// finite x: about pi/2 (0x3E48) with the sign of y
+													signx ?	detail::rounded<half::round_style,true>(signy|0x40B6, 0, 1) :	// x == -inf: about 3*pi/4 (0x40B6) with the sign of y
+															detail::rounded<half::round_style,true>(signy|0x3A48, 0, 1));	// x == +inf: about pi/4 (0x3A48) with the sign of y
+			return (x.data_==0x7C00) ? half(detail::binary, signy) : half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1));	// finite y: x == +inf gives a zero with the sign of y, x == -inf gives about pi (0x4248) with the sign of y
+		}
+		if(!absy)	// y is zero: about pi with the sign of y for negative x, otherwise y itself
+			return signx ? half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1)) : y;
+		if(!absx)	// x is zero, y is not: about pi/2 with the sign of y
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1));
+		int d = (absy>>10) + (absy<=0x3FF) - (absx>>10) - (absx<=0x3FF);	// difference of the exponent fields; subnormals count as exponent field 1
+		if(d > (signx ? 18 : 12))	// |y| much larger than |x|: the result is about pi/2 with the sign of y
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1));
+		if(signx && d < -11)	// x negative and |y| much smaller than |x|: the result is about pi with the sign of y
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1));
+		if(!signx && d < ((half::round_style==std::round_toward_zero) ? -15 : -9))	// x positive and |y| much smaller than |x|: the quotient y/x is computed directly
+		{
+			for(; absy<0x400; absy<<=1,--d) ;	// normalize subnormal y, adjusting the exponent difference
+			detail::uint32 mx = ((absx<<1)&0x7FF) | 0x800, my = ((absy<<1)&0x7FF) | 0x800;	// significands shifted up by one bit with the leading one set explicitly
+			int i = my < mx;	// 1 if the quotient significand would fall below 1
+			d -= i;
+			if(d < -25)	// quotient too small to represent
+				return half(detail::binary, detail::underflow<half::round_style>(signy));
+			my <<= 11 + i;	// scale the numerator so the integer division yields the quotient significand
+			return half(detail::binary, detail::fixed2half<half::round_style,11,false,false,true>(my/mx, d+14, signy, my%mx!=0));	// the last argument flags a non-zero division remainder
+		}
+		detail::uint32 m = detail::atan2(	((absy&0x3FF)|((absy>0x3FF)<<10))<<(19+((d<0) ? d : (d>0) ? 0 : -1)),	// general case: y significand, shifted down when y has the smaller exponent
+											((absx&0x3FF)|((absx>0x3FF)<<10))<<(19-((d>0) ? d : (d<0) ? 0 : 1)));	// x significand, shifted down when x has the smaller exponent
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,true,true>(signx ? (0xC90FDAA2-m) : m, 15, signy, signx));	// negative x: subtract from 0xC90FDAA2 (pi * 2^30); the result carries the sign of y
+	#endif
+	}
+
+	/// \}
+	/// \anchor hyperbolic
+	/// \name Hyperbolic functions
+	/// \{
+
+	/// Hyperbolic sine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sinh](https://en.cppreference.com/w/cpp/numeric/math/sinh).
+	/// \param arg function argument
+	/// \return hyperbolic sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half sinh(half arg)	// hyperbolic sine of arg; the bit-level path subtracts the two values delivered by detail::hyperbolic_args
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::sinh in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::sinh(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int abs = arg.data_ & 0x7FFF, exp;	// abs: magnitude bits; exp is filled in by hyperbolic_args() below
+		if(!abs || abs >= 0x7C00)	// +/-0, +/-inf or NaN
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// NaN goes through signal(); zero and infinity are returned unchanged
+		if(abs <= 0x2900)	// tiny argument: round starting from arg itself
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1));	// NOTE(review): the trailing 0, 1 are presumably guard and sticky bits - confirm
+		std::pair<detail::uint32,detail::uint32> mm = detail::hyperbolic_args(abs, exp, (half::round_style==std::round_to_nearest) ? 29 : 27);	// NOTE(review): presumably the significands of e^|arg| and e^-|arg| on a common exponent stored in exp - confirm
+		detail::uint32 m = mm.first - mm.second;	// difference of the two terms
+		for(exp+=13; m<0x80000000 && exp; m<<=1,--exp) ;	// offset the exponent by 13, then normalize to bit 31 while the exponent stays above zero
+		unsigned int sign = arg.data_ & 0x8000;	// the result takes the sign of arg
+		if(exp > 29)	// exponent too large for the conversion below: report overflow with the result sign
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,false,true>(m, exp, sign));	// convert significand and exponent to binary16 using the active rounding mode
+	#endif
+	}
+
+	/// Hyperbolic cosine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cosh](https://en.cppreference.com/w/cpp/numeric/math/cosh).
+	/// \param arg function argument
+	/// \return hyperbolic cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half cosh(half arg)	// hyperbolic cosine of arg; bit-level evaluation unless a native arithmetic type is configured
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::cosh in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::cosh(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int exp, magnitude = arg.data_ & 0x7FFF;	// exp is filled in by detail::hyperbolic_args() below; magnitude drops the sign bit
+		if(magnitude >= 0x7C00)	// +/-inf yields +inf, NaN is handed to signal()
+			return half(detail::binary, (magnitude<=0x7C00) ? 0x7C00 : detail::signal(arg.data_));
+		if(!magnitude)	// cosh(+/-0) is exactly 1
+			return half(detail::binary, 0x3C00);
+		std::pair<detail::uint32,detail::uint32> terms = detail::hyperbolic_args(magnitude, exp, (half::round_style!=std::round_to_nearest) ? 26 : 23);	// the two terms to add; last argument depends on the rounding mode
+		detail::uint32 sum = terms.first + terms.second, carry = (~sum&0xFFFFFFFF) >> 31;	// carry is 1 when bit 31 of the sum is clear
+		sum = (sum>>carry) | (sum&carry) | 0x80000000;	// shift by carry, keep the dropped bit as sticky, force bit 31
+		if((exp+=13+carry) > 29)	// exponent too large for the conversion below: report overflow
+			return half(detail::binary, detail::overflow<half::round_style>());
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,false,true>(sum, exp));	// convert significand and exponent to binary16
+	#endif
+	}
+
+	/// Hyperbolic tangent.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::tanh](https://en.cppreference.com/w/cpp/numeric/math/tanh).
+	/// \param arg function argument
+	/// \return hyperbolic tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tanh(half arg)	// hyperbolic tangent of arg; the bit-level path divides difference by sum of the hyperbolic_args terms
+	{
+	#ifdef HALF_ARITHMETIC_TYPE	// native path: std::tanh in the internal arithmetic type
+		return half(detail::binary, detail::float2half<half::round_style>(std::tanh(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int abs = arg.data_ & 0x7FFF, exp;	// abs: magnitude bits; exp is filled in by hyperbolic_args() and then reused for the quotient
+		if(!abs)	// signed zero is returned unchanged
+			return arg;
+		if(abs >= 0x7C00)	// NaN -> signal(); +/-inf -> +/-1 (subtracting 0x4000 turns the infinity pattern 0x7C00 into 0x3C00)
+			return half(detail::binary, (abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_-0x4000));
+		if(abs >= 0x4500)	// |arg| >= 5: rounded from 0x3BFF (largest value below 1.0) with the sign of arg
+			return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1));
+		if(abs < 0x2700)	// tiny argument: round from the value one step below arg in magnitude
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));
+		if(half::round_style != std::round_to_nearest && abs == 0x2D3F)	// hard-coded result for one input, used only with directed rounding modes
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-3, 0, 1));
+		std::pair<detail::uint32,detail::uint32> mm = detail::hyperbolic_args(abs, exp, 27);	// NOTE(review): presumably the significands of e^|arg| and e^-|arg| on a common exponent - confirm
+		detail::uint32 my = mm.first - mm.second - (half::round_style!=std::round_to_nearest), mx = mm.first + mm.second, i = (~mx&0xFFFFFFFF) >> 31;	// my: difference (one less for directed rounding); mx: sum; i is 1 when bit 31 of the sum is clear
+		for(exp=13; my<0x80000000; my<<=1,--exp) ;	// exp is restarted at 13; normalize the numerator to bit 31
+		mx = (mx>>i) | 0x80000000;	// shift the denominator by i and force bit 31
+		return half(detail::binary, detail::tangent_post<half::round_style>(my, mx, exp-i, arg.data_&0x8000));	// the result carries the sign of arg; NOTE(review): tangent_post presumably divides my by mx and rounds - confirm
+	#endif
+	}
+
+	/// Hyperbolic area sine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::asinh](https://en.cppreference.com/w/cpp/numeric/math/asinh).
+	/// \param arg function argument
+	/// \return area sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half asinh(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::asinh(detail::half2float<detail::internal_t>(arg.data_))));	// delegate to std::asinh on detail::internal_t
+	#else
+		int abs = arg.data_ & 0x7FFF;	// bit pattern with the sign bit cleared
+		if(!abs || abs >= 0x7C00)	// zero, infinity and NaN
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// NaN goes through signal(); +/-0 and +/-inf are returned unchanged
+		if(abs <= 0x2900)	// small |arg| (<= 1.25*2^-5): derived directly from the argument's own bit pattern
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1));
+		if(half::round_style != std::round_to_nearest)	// two hard-coded inputs, only for modes other than round-to-nearest
+			switch(abs)
+			{
+				case 0x32D4: return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-13, 1, 1));
+				case 0x3B5B: return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-197, 1, 1));
+			}
+		return half(detail::binary, detail::area<half::round_style,true>(arg.data_));	// general case: shared detail::area helper
+	#endif
+	}
+
+	/// Hyperbolic area cosine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::acosh](https://en.cppreference.com/w/cpp/numeric/math/acosh).
+	/// \param arg function argument
+	/// \return area cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or arguments <1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half acosh(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::acosh(detail::half2float<detail::internal_t>(arg.data_))));	// delegate to std::acosh on detail::internal_t
+	#else
+		int abs = arg.data_ & 0x7FFF;	// bit pattern with the sign bit cleared
+		if((arg.data_&0x8000) || abs < 0x3C00)	// sign bit set, or magnitude below 1.0 (0x3C00)
+			return half(detail::binary, (abs<=0x7C00) ? detail::invalid() : detail::signal(arg.data_));	// NaN goes through signal(); everything else here yields invalid()
+		if(abs == 0x3C00)	// acosh(1) is exactly +0
+			return half(detail::binary, 0);
+		if(arg.data_ >= 0x7C00)	// the sign bit is known to be clear here, so this is +inf or a positive NaN
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// +inf is returned unchanged
+		return half(detail::binary, detail::area<half::round_style,false>(arg.data_));	// general case: shared detail::area helper
+	#endif
+	}
+
+	/// Hyperbolic area tangent.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::atanh](https://en.cppreference.com/w/cpp/numeric/math/atanh).
+	/// \param arg function argument
+	/// \return area tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_DIVBYZERO for +/-1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atanh(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::atanh(detail::half2float<detail::internal_t>(arg.data_))));	// delegate to std::atanh on detail::internal_t
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = 0;	// abs: bit pattern with the sign bit cleared
+		if(!abs)	// +/-0 is returned unchanged
+			return arg;
+		if(abs >= 0x3C00)	// |arg| >= 1, including infinity and NaN: +/-1 -> pole(), NaN -> signal(), anything else -> invalid()
+			return half(detail::binary, (abs==0x3C00) ? detail::pole(arg.data_&0x8000) : (abs<=0x7C00) ? detail::invalid() : detail::signal(arg.data_));
+		if(abs < 0x2700)	// small |arg| (< 1.75*2^-6): derived directly from the argument's own bit pattern
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1));
+		detail::uint32 m = static_cast<detail::uint32>((abs&0x3FF)|((abs>0x3FF)<<10)) << ((abs>>10)+(abs<=0x3FF)+6), my = 0x80000000 + m, mx = 0x80000000 - m;	// m: mantissa (implicit bit added for normal values) shifted by the exponent
+		for(; mx<0x80000000; mx<<=1,++exp) ;	// normalize mx so bit 31 is set; exp counts the shifts
+		int i = my >= mx, s;	// i halves my in the division below when my >= mx
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(detail::log2(
+			(detail::divide64(my>>i, mx, s)+1)>>1, 27)+0x10, exp+i-1, 16, arg.data_&0x8000));	// the argument's sign bit is passed on
+	#endif
+	}
+
+	/// \}
+	/// \anchor special
+	/// \name Error and gamma functions
+	/// \{
+
+	/// Error function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% of inputs.
+	///
+	/// **See also:** Documentation for [std::erf](https://en.cppreference.com/w/cpp/numeric/math/erf).
+	/// \param arg function argument
+	/// \return error function value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half erf(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::erf(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		const unsigned int mag = arg.data_ & 0x7FFF;	// magnitude bits, sign stripped
+		if(!mag) return arg;	// +/-0 is returned unchanged
+		if(mag >= 0x7C00) return half(detail::binary, (mag==0x7C00) ? (arg.data_-0x4000) : detail::signal(arg.data_));	// +/-inf -> +/-1, NaN -> signal()
+		if(mag >= 0x4200)	// |arg| >= 3: start from 0x3BFF (largest half below 1) and let rounded() finish
+			return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1));
+		return half(detail::binary, detail::erf<half::round_style,false>(arg.data_));	// general case: shared detail::erf helper
+	#endif
+	}
+
+	/// Complementary error function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% of inputs.
+	///
+	/// **See also:** Documentation for [std::erfc](https://en.cppreference.com/w/cpp/numeric/math/erfc).
+	/// \param arg function argument
+	/// \return 1 minus error function value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half erfc(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::erfc(detail::half2float<detail::internal_t>(arg.data_))));	// delegate to std::erfc on detail::internal_t
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000;	// magnitude bits and isolated sign bit
+		if(abs >= 0x7C00)	// +inf -> +0, -inf -> 2 (0x8000>>1 == 0x4000), NaN -> signal()
+			return half(detail::binary, (abs==0x7C00) ? (sign>>1) : detail::signal(arg.data_));
+		if(!abs)	// erfc(+/-0) is exactly 1 (0x3C00)
+			return half(detail::binary, 0x3C00);
+		if(abs >= 0x4400)	// |arg| >= 4: rounded() starts from 0 for positive and from 0x3FFF (just below 2) for negative arguments
+			return half(detail::binary, detail::rounded<half::round_style,true>((sign>>1)-(sign>>15), sign>>15, 1));
+		return half(detail::binary, detail::erf<half::round_style,true>(arg.data_));	// general case: shared detail::erf helper
+	#endif
+	}
+
+	/// Natural logarithm of gamma function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in ~0.025% of inputs.
+	///
+	/// **See also:** Documentation for [std::lgamma](https://en.cppreference.com/w/cpp/numeric/math/lgamma).
+	/// \param arg function argument
+	/// \return natural logarithm of gamma function for \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_DIVBYZERO for 0 or negative integer arguments
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half lgamma(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::lgamma(detail::half2float<detail::internal_t>(arg.data_))));	// delegate to std::lgamma on detail::internal_t
+	#else
+		int abs = arg.data_ & 0x7FFF;	// bit pattern with the sign bit cleared
+		if(abs >= 0x7C00)	// +/-inf yields +inf (0x7C00); NaN goes through signal()
+			return half(detail::binary, (abs==0x7C00) ? 0x7C00 : detail::signal(arg.data_));
+		if(!abs || arg.data_ >= 0xE400 || (arg.data_ >= 0xBC00 && !(abs&((1<<(25-(abs>>10)))-1))))	// zero, values <= -1024 (0xE400), or values <= -1 with no fraction bits set
+			return half(detail::binary, detail::pole());
+		if(arg.data_ == 0x3C00 || arg.data_ == 0x4000)	// arguments 1 and 2 give exactly +0
+			return half(detail::binary, 0);
+		return half(detail::binary, detail::gamma<half::round_style,true>(arg.data_));	// general case: shared detail::gamma helper
+	#endif
+	}
+
+	/// Gamma function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.25% of inputs.
+	///
+	/// **See also:** Documentation for [std::tgamma](https://en.cppreference.com/w/cpp/numeric/math/tgamma).
+	/// \param arg function argument
+	/// \return gamma function value of \a arg
+	/// \exception FE_INVALID for signaling NaN, negative infinity or negative integer arguments
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tgamma(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::tgamma(detail::half2float<detail::internal_t>(arg.data_))));	// delegate to std::tgamma on detail::internal_t
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF;	// bit pattern with the sign bit cleared
+		if(!abs)	// zero: pole() receives the argument's bits, i.e. just its sign
+			return half(detail::binary, detail::pole(arg.data_));
+		if(abs >= 0x7C00)	// +inf is returned unchanged; -inf and NaN go through signal()
+			return (arg.data_==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));
+		if(arg.data_ >= 0xE400 || (arg.data_ >= 0xBC00 && !(abs&((1<<(25-(abs>>10)))-1))))	// values <= -1024 (0xE400), or values <= -1 with no fraction bits set
+			return half(detail::binary, detail::invalid());
+		if(arg.data_ >= 0xCA80)	// remaining arguments <= -13: underflow, sign taken from the lowest integer bit of |arg|
+			return half(detail::binary, detail::underflow<half::round_style>((1-((abs>>(25-(abs>>10)))&1))<<15));
+		if(arg.data_ <= 0x100 || (arg.data_ >= 0x4900 && arg.data_ < 0x8000))	// positive arguments <= 0x100 (subnormal) or >= 10 (0x4900): overflow
+			return half(detail::binary, detail::overflow<half::round_style>());
+		if(arg.data_ == 0x3C00)	// argument 1 is returned unchanged
+			return arg;
+		return half(detail::binary, detail::gamma<half::round_style,false>(arg.data_));	// general case: shared detail::gamma helper
+	#endif
+	}
+
+	/// \}
+	/// \anchor rounding
+	/// \name Rounding
+	/// \{
+
+	/// Nearest integer not less than half value.
+	/// **See also:** Documentation for [std::ceil](https://en.cppreference.com/w/cpp/numeric/math/ceil).
+	/// \param arg half to round
+	/// \return nearest integer not less than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half ceil(half arg) { return half(detail::binary, detail::integral<std::round_toward_infinity,true,true>(arg.data_)); }	// detail::integral, rounding toward +infinity
+
+	/// Nearest integer not greater than half value.
+	/// **See also:** Documentation for [std::floor](https://en.cppreference.com/w/cpp/numeric/math/floor).
+	/// \param arg half to round
+	/// \return nearest integer not greater than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half floor(half arg) { return half(detail::binary, detail::integral<std::round_toward_neg_infinity,true,true>(arg.data_)); }	// detail::integral, rounding toward -infinity
+
+	/// Nearest integer not greater in magnitude than half value.
+	/// **See also:** Documentation for [std::trunc](https://en.cppreference.com/w/cpp/numeric/math/trunc).
+	/// \param arg half to round
+	/// \return nearest integer not greater in magnitude than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half trunc(half arg) { return half(detail::binary, detail::integral<std::round_toward_zero,true,true>(arg.data_)); }	// detail::integral, rounding toward zero
+
+	/// Nearest integer.
+	/// **See also:** Documentation for [std::round](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half round(half arg) { return half(detail::binary, detail::integral<std::round_to_nearest,false,true>(arg.data_)); }	// second flag is false only here and in lround/llround; presumably it selects ties-away-from-zero — confirm in detail::integral
+
+	/// Nearest integer.
+	/// **See also:** Documentation for [std::lround](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID if value is not representable as `long`
+	inline long lround(half arg) { return detail::half2int<std::round_to_nearest,false,false,long>(arg.data_); }	// converts straight to long through detail::half2int, always using round-to-nearest
+
+	/// Nearest integer using half's internal rounding mode.
+	/// **See also:** Documentation for [std::rint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half rint(half arg) { return half(detail::binary, detail::integral<half::round_style,true,true>(arg.data_)); }	// detail::integral with the library-wide half::round_style
+
+	/// Nearest integer using half's internal rounding mode.
+	/// **See also:** Documentation for [std::lrint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID if value is not representable as `long`
+	/// \exception FE_INEXACT if value had to be rounded
+	inline long lrint(half arg) { return detail::half2int<half::round_style,true,true,long>(arg.data_); }	// converts straight to long through detail::half2int, using half::round_style
+
+	/// Nearest integer using half's internal rounding mode.
+	/// **See also:** Documentation for [std::nearbyint](https://en.cppreference.com/w/cpp/numeric/math/nearbyint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID for signaling NaN
+	inline half nearbyint(half arg) { return half(detail::binary, detail::integral<half::round_style,true,false>(arg.data_)); }	// same as rint() except for the last detail::integral flag (no FE_INEXACT is documented here)
+#if HALF_ENABLE_CPP11_LONG_LONG
+	/// Nearest integer.
+	/// **See also:** Documentation for [std::llround](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID if value is not representable as `long long`
+	inline long long llround(half arg) { return detail::half2int<std::round_to_nearest,false,false,long long>(arg.data_); }	// long long counterpart of lround(); only compiled when HALF_ENABLE_CPP11_LONG_LONG is set
+
+	/// Nearest integer using half's internal rounding mode.
+	/// **See also:** Documentation for [std::llrint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID if value is not representable as `long long`
+	/// \exception FE_INEXACT if value had to be rounded
+	inline long long llrint(half arg) { return detail::half2int<half::round_style,true,true,long long>(arg.data_); }	// long long counterpart of lrint(); only compiled when HALF_ENABLE_CPP11_LONG_LONG is set
+#endif
+
+	/// \}
+	/// \anchor float
+	/// \name Floating point manipulation
+	/// \{
+
+	/// Decompress floating-point number.
+	/// **See also:** Documentation for [std::frexp](https://en.cppreference.com/w/cpp/numeric/math/frexp).
+	/// \param arg number to decompress
+	/// \param exp address to store exponent at
+	/// \return significand in range [0.5, 1)
+	/// \exception FE_INVALID for signaling NaN
+	inline half frexp(half arg, int *exp)
+	{
+		unsigned int mant = arg.data_ & 0x7FFF;	// magnitude bits, sign stripped
+		*exp = 0;	// zero, infinity and NaN report an exponent of 0
+		if(!mant || mant >= 0x7C00)
+			return (mant>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// NaN goes through signal(); zero and infinity are returned unchanged
+		for(*exp=-14; mant<0x400; mant<<=1) --*exp;	// normalize subnormals, one exponent step per shift
+		*exp += static_cast<int>(mant >> 10);	// add the biased exponent field
+		return half(detail::binary, (arg.data_&0x8000)|0x3800|(mant&0x3FF));	// same sign and mantissa, exponent field forced to that of 0.5 (0x3800)
+	}
+
+	/// Multiply by power of two.
+	/// This function is exact to rounding for all rounding modes. Any `long` value of \a exp is accepted; extreme values saturate to overflow or underflow.
+	///
+	/// **See also:** Documentation for [std::scalbln](https://en.cppreference.com/w/cpp/numeric/math/scalbn).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half scalbln(half arg, long exp)
+	{
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000;	// magnitude bits and isolated sign bit
+		if(abs >= 0x7C00 || !abs)	// zero, infinity and NaN are not scaled (NaN goes through signal())
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;
+		for(exp=(exp>64) ? 64 : (exp<-64) ? -64 : exp; abs<0x400; abs<<=1,--exp) ;	// clamp exp so the adjustments cannot overflow `long` (results are unaffected), then normalize subnormals
+		exp += static_cast<long>(abs >> 10);	// exp is now the biased exponent of the scaled value
+		if(exp > 30)	// larger than the largest finite exponent field
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -10)	// magnitude below half of the smallest subnormal
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		else if(exp > 0)	// normal result: only the exponent field changes
+			return half(detail::binary, sign|(exp<<10)|(abs&0x3FF));
+		unsigned int m = (abs&0x3FF) | 0x400;	// subnormal result: full significand with the implicit bit, shifted right and rounded
+		return half(detail::binary, detail::rounded<half::round_style,false>(sign|(m>>(1-exp)), (m>>-exp)&1, (m&((1<<-exp)-1))!=0));
+	}
+
+	/// Multiply by power of two.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::scalbn](https://en.cppreference.com/w/cpp/numeric/math/scalbn).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half scalbn(half arg, int exp) { return scalbln(arg, exp); }	// forwards to scalbln(); exp is converted from int to long
+
+	/// Multiply by power of two.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::ldexp](https://en.cppreference.com/w/cpp/numeric/math/ldexp).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half ldexp(half arg, int exp) { return scalbln(arg, exp); }	// forwards to scalbln(); exp is converted from int to long
+
+	/// Extract integer and fractional parts.
+	/// **See also:** Documentation for [std::modf](https://en.cppreference.com/w/cpp/numeric/math/modf).
+	/// \param arg number to decompress
+	/// \param iptr address to store integer part at
+	/// \return fractional part
+	/// \exception FE_INVALID for signaling NaN
+	inline half modf(half arg, half *iptr)
+	{
+		unsigned int abs = arg.data_ & 0x7FFF;	// bit pattern with the sign bit cleared
+		if(abs > 0x7C00)	// NaN: the result of signal() is both stored in *iptr and returned
+		{
+			arg = half(detail::binary, detail::signal(arg.data_));
+			return *iptr = arg, arg;
+		}
+		if(abs >= 0x6400)	// |arg| >= 1024 (or infinity): no fraction bits, so *iptr = arg and a signed zero is returned
+			return *iptr = arg, half(detail::binary, arg.data_&0x8000);
+		if(abs < 0x3C00)	// |arg| < 1: integer part is a signed zero, the fraction is arg itself
+			return iptr->data_ = arg.data_ & 0x8000, arg;
+		unsigned int exp = abs >> 10, mask = (1<<(25-exp)) - 1, m = arg.data_ & mask;	// mask selects the mantissa bits below the binary point
+		iptr->data_ = arg.data_ & ~mask;	// integer part: arg with those bits cleared
+		if(!m)	// already an integer: the fraction is a signed zero
+			return half(detail::binary, arg.data_&0x8000);
+		for(; m<0x400; m<<=1,--exp) ;	// renormalize the fraction bits, lowering the exponent per shift
+		return half(detail::binary, (arg.data_&0x8000)|(exp<<10)|(m&0x3FF));
+	}
+
+	/// Extract exponent.
+	/// **See also:** Documentation for [std::ilogb](https://en.cppreference.com/w/cpp/numeric/math/ilogb).
+	/// \param arg number to query
+	/// \return floating-point exponent
+	/// \retval FP_ILOGB0 for zero
+	/// \retval FP_ILOGBNAN for NaN
+	/// \retval INT_MAX for infinity
+	/// \exception FE_INVALID for 0, infinite or NaN values
+	inline int ilogb(half arg)
+	{
+		int mag = arg.data_ & 0x7FFF, result = (mag>>10) - 15;	// magnitude bits and the exponent field minus 15
+		if(!mag || mag >= 0x7C00)	// zero, infinity and NaN all raise FE_INVALID
+		{
+			detail::raise(FE_INVALID);
+			return !mag ? FP_ILOGB0 : (mag==0x7C00) ? INT_MAX : FP_ILOGBNAN;
+		}
+		while(mag < 0x200) { mag <<= 1; --result; }	// small subnormals: one step down per shift needed to reach bit 9
+		return result;
+	}
+
+	/// Extract exponent.
+	/// **See also:** Documentation for [std::logb](https://en.cppreference.com/w/cpp/numeric/math/logb).
+	/// \param arg number to query
+	/// \return floating-point exponent
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_DIVBYZERO for 0
+	inline half logb(half arg)
+	{
+		int mag = arg.data_ & 0x7FFF;	// magnitude bits, sign stripped
+		if(!mag)	// zero: pole() called with the sign bit set
+			return half(detail::binary, detail::pole(0x8000));
+		if(mag >= 0x7C00)	// +/-inf yields +inf (0x7C00); NaN goes through signal()
+			return half(detail::binary, (mag==0x7C00) ? 0x7C00 : detail::signal(arg.data_));
+		int e = (mag>>10) - 15;	// exponent field minus 15
+		while(mag < 0x200) { mag <<= 1; --e; }	// small subnormals: one step down per shift needed to reach bit 9
+		if(!e)	// exponent 0 is returned as +0
+			return half(detail::binary, 0);
+		unsigned int sign = static_cast<unsigned>(e<0) << 15;	// sign bit of the result
+		unsigned int m = std::abs(e) << 6;	// |e| as a not yet normalized mantissa
+		int biased = 18;
+		while(m < 0x400) { m <<= 1; --biased; }	// normalize until bit 10 is set
+		return half(detail::binary, sign | ((biased<<10) + m));	// bit 10 of m carries into the exponent field
+	}
+
+	/// Next representable value.
+	/// **See also:** Documentation for [std::nextafter](https://en.cppreference.com/w/cpp/numeric/math/nextafter).
+	/// \param from value to compute next representable value for
+	/// \param to direction towards which to compute next value
+	/// \return next representable value after \a from in direction towards \a to
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW for infinite result from finite argument
+	/// \exception FE_UNDERFLOW for subnormal result
+	inline half nextafter(half from, half to)
+	{
+		int fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF;	// magnitudes of both operands
+		if(fabs > 0x7C00 || tabs > 0x7C00)	// either operand is NaN
+			return half(detail::binary, detail::signal(from.data_, to.data_));
+		if(from.data_ == to.data_ || !(fabs|tabs))	// identical bit patterns, or two zeros of any sign: return to
+			return to;
+		if(!fabs)	// from is zero: the result has magnitude bits 1 and to's sign
+		{
+			detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT);
+			return half(detail::binary, (to.data_&0x8000)+1);
+		}
+		unsigned int out = from.data_ + (((from.data_>>15)^static_cast<unsigned>(
+			(from.data_^(0x8000|(0x8000-(from.data_>>15))))<(to.data_^(0x8000|(0x8000-(to.data_>>15))))))<<1) - 1;	// adds +1 or -1 to the bit pattern, chosen from from's sign and whether from orders below to
+		detail::raise(FE_OVERFLOW, fabs<0x7C00 && (out&0x7C00)==0x7C00);	// finite input stepped onto infinity
+		detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && (out&0x7C00)<0x400);	// exponent field of the result is zero
+		return half(detail::binary, out);
+	}
+
+	/// Next representable value.
+	/// **See also:** Documentation for [std::nexttoward](https://en.cppreference.com/w/cpp/numeric/math/nexttoward).
+	/// \param from value to compute next representable value for
+	/// \param to direction towards which to compute next value
+	/// \return next representable value after \a from in direction towards \a to
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW for infinite result from finite argument
+	/// \exception FE_UNDERFLOW for subnormal result
+	inline half nexttoward(half from, long double to)
+	{
+		int fabs = from.data_ & 0x7FFF;	// magnitude of from
+		if(fabs > 0x7C00)	// from is NaN
+			return half(detail::binary, detail::signal(from.data_));
+		long double lfrom = static_cast<long double>(from);
+		if(detail::builtin_isnan(to) || lfrom == to)	// to is NaN or compares equal: return to, converted through float
+			return half(static_cast<float>(to));
+		if(!fabs)	// from is zero: the result has magnitude bits 1 and to's sign
+		{
+			detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT);
+			return half(detail::binary, (static_cast<unsigned>(detail::builtin_signbit(to))<<15)+1);
+		}
+		unsigned int out = from.data_ + (((from.data_>>15)^static_cast<unsigned>(lfrom<to))<<1) - 1;	// adds +1 or -1 to the bit pattern, chosen from from's sign and whether from < to
+		detail::raise(FE_OVERFLOW, (out&0x7FFF)==0x7C00);	// result landed on infinity
+		detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && (out&0x7FFF)<0x400);	// result is below the smallest normal magnitude
+		return half(detail::binary, out);
+	}
+
+	/// Take sign.
+	/// **See also:** Documentation for [std::copysign](https://en.cppreference.com/w/cpp/numeric/math/copysign).
+	/// \param x value to change sign for
+	/// \param y value to take sign from
+	/// \return value equal to \a x in magnitude and to \a y in sign
+	inline HALF_CONSTEXPR half copysign(half x, half y) { return half(detail::binary, x.data_^((x.data_^y.data_)&0x8000)); }	// x's bits with only the sign bit (0x8000) replaced by y's
+
+	/// \}
+	/// \anchor classification
+	/// \name Floating point classification
+	/// \{
+
+	/// Classify floating-point value.
+	/// **See also:** Documentation for [std::fpclassify](https://en.cppreference.com/w/cpp/numeric/math/fpclassify).
+	/// \param arg number to classify
+	/// \retval FP_ZERO for positive and negative zero
+	/// \retval FP_SUBNORMAL for subnormal numbers
+	/// \retval FP_INFINITE for positive and negative infinity
+	/// \retval FP_NAN for NaNs
+	/// \retval FP_NORMAL for all other (normal) values
+	inline HALF_CONSTEXPR int fpclassify(half arg)
+	{
+		return	!(arg.data_&0x7FFF) ? FP_ZERO :	// every test masks off the sign bit first
+				((arg.data_&0x7FFF)<0x400) ? FP_SUBNORMAL :	// exponent field is zero, mantissa is not
+				((arg.data_&0x7FFF)<0x7C00) ? FP_NORMAL :
+				((arg.data_&0x7FFF)==0x7C00) ? FP_INFINITE :	// exponent all ones, mantissa zero
+				FP_NAN;	// exponent all ones, mantissa nonzero
+	}
+
+	/// Check if finite number.
+	/// **See also:** Documentation for [std::isfinite](https://en.cppreference.com/w/cpp/numeric/math/isfinite).
+	/// \param arg number to check
+	/// \retval true if neither infinity nor NaN
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isfinite(half arg) { return (arg.data_&0x7C00) != 0x7C00; }	// finite iff the exponent field (0x7C00) is not all ones
+
+	/// Check for infinity.
+	/// **See also:** Documentation for [std::isinf](https://en.cppreference.com/w/cpp/numeric/math/isinf).
+	/// \param arg number to check
+	/// \retval true for positive or negative infinity
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isinf(half arg) { return (arg.data_&0x7FFF) == 0x7C00; }	// exponent all ones and mantissa zero; the sign bit is ignored
+
+	/// Check for NaN.
+	/// **See also:** Documentation for [std::isnan](https://en.cppreference.com/w/cpp/numeric/math/isnan).
+	/// \param arg number to check
+	/// \retval true for NaNs
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isnan(half arg) { return (arg.data_&0x7FFF) > 0x7C00; }	// exponent all ones and a nonzero mantissa; the sign bit is ignored
+
+	/// Check if normal number.
+	/// **See also:** Documentation for [std::isnormal](https://en.cppreference.com/w/cpp/numeric/math/isnormal).
+	/// \param arg number to check
+	/// \retval true if normal number
+	/// \retval false if either subnormal, zero, infinity or NaN
+	inline HALF_CONSTEXPR bool isnormal(half arg) { return ((arg.data_&0x7C00)!=0) & ((arg.data_&0x7C00)!=0x7C00); }	// exponent field neither all zeros nor all ones
+
+	/// Check sign.
+	/// **See also:** Documentation for [std::signbit](https://en.cppreference.com/w/cpp/numeric/math/signbit).
+	/// \param arg number to check
+	/// \retval true for negative number
+	/// \retval false for positive number
+	inline HALF_CONSTEXPR bool signbit(half arg) { return (arg.data_&0x8000) != 0; }	// tests bit 15 only, so it is also true for -0 and for NaNs with the sign bit set
+
+	/// \}
+	/// \anchor compfunc
+	/// \name Comparison
+	/// \{
+
+	/// Quiet comparison for greater than.
+	/// **See also:** Documentation for [std::isgreater](https://en.cppreference.com/w/cpp/numeric/math/isgreater).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater than \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isgreater(half x, half y)
+	{
+		return ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) > ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) && !isnan(x) && !isnan(y);	// both bit patterns are mapped to integers on which +0 and -0 coincide; any NaN operand gives false
+	}
+
+	/// Quiet comparison for greater equal.
+	/// **See also:** Documentation for [std::isgreaterequal](https://en.cppreference.com/w/cpp/numeric/math/isgreaterequal).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater equal \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isgreaterequal(half x, half y)
+	{
+		return ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) >= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) && !isnan(x) && !isnan(y);	// both bit patterns are mapped to integers on which +0 and -0 coincide; any NaN operand gives false
+	}
+
+	/// Quiet comparison for less than.
+	/// **See also:** Documentation for [std::isless](https://en.cppreference.com/w/cpp/numeric/math/isless).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less than \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isless(half x, half y)
+	{
+		return ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) < ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) && !isnan(x) && !isnan(y);	// both bit patterns are mapped to integers on which +0 and -0 coincide; any NaN operand gives false
+	}
+
+	/// Quiet comparison for less equal.
+	/// **See also:** Documentation for [std::islessequal](https://en.cppreference.com/w/cpp/numeric/math/islessequal).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less equal \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool islessequal(half x, half y)
+	{
+		return ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) <= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) && !isnan(x) && !isnan(y);	// both bit patterns are mapped to integers on which +0 and -0 coincide; any NaN operand gives false
+	}
+
+	/// Quiet comparison for less or greater.
+	/// **See also:** Documentation for [std::islessgreater](https://en.cppreference.com/w/cpp/numeric/math/islessgreater).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if either less or greater
+	/// \retval false else
+	inline HALF_CONSTEXPR bool islessgreater(half x, half y)
+	{
+		return x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF) && !isnan(x) && !isnan(y);	// bit patterns differ, the operands are not both zero, and neither is NaN
+	}
+
+	/// Quiet check if unordered.
+	/// **See also:** Documentation for [std::isunordered](https://en.cppreference.com/w/cpp/numeric/math/isunordered).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if unordered (one or two NaN operands)
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isunordered(half x, half y) { return isnan(x) || isnan(y); }	// true as soon as at least one operand is NaN
+
+	/// \}
+	/// \anchor casting
+	/// \name Casting
+	/// \{
+
+	/// Cast to or from half-precision floating-point number.
+	/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted 
+	/// directly using the default rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+	///
+	/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types 
+	/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler 
+	/// error and casting between [half](\ref half_float::half)s returns the argument unmodified.
+	/// \tparam T destination type (half or built-in arithmetic type)
+	/// \tparam U source type (half or built-in arithmetic type)
+	/// \param arg value to cast
+	/// \return \a arg converted to destination type
+	/// \exception FE_INVALID if \a T is integer type and result is not representable as \a T
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename T,typename U> T half_cast(U arg) { return detail::half_caster<T,U>::cast(arg); }	// all work is done by detail::half_caster<T,U> with its default rounding parameter
+
+	/// Cast to or from half-precision floating-point number.
+	/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted 
+	/// directly using the specified rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+	///
+	/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types 
+	/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler 
+	/// error and casting between [half](\ref half_float::half)s returns the argument unmodified.
+	/// \tparam T destination type (half or built-in arithmetic type)
+	/// \tparam R rounding mode to use.
+	/// \tparam U source type (half or built-in arithmetic type)
+	/// \param arg value to cast
+	/// \return \a arg converted to destination type
+	/// \exception FE_INVALID if \a T is integer type and result is not representable as \a T
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename T,std::float_round_style R,typename U> T half_cast(U arg) { return detail::half_caster<T,U,R>::cast(arg); }	// same dispatch as above, with R passed explicitly as the third detail::half_caster argument
+	/// \}
+
+	/// \}
+	/// \anchor errors
+	/// \name Error handling
+	/// \{
+
+	/// Clear exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::feclearexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feclearexcept).
+	/// \param excepts OR of exceptions to clear
+	/// \retval 0 all selected flags cleared successfully
+	inline int feclearexcept(int excepts) { detail::errflags() &= ~excepts; return 0; }	// clears the selected bits in detail::errflags(); always returns 0
+
+	/// Test exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fetestexcept](https://en.cppreference.com/w/cpp/numeric/fenv/fetestexcept).
+	/// \param excepts OR of exceptions to test
+	/// \return OR of selected exceptions if raised
+	inline int fetestexcept(int excepts) { return detail::errflags() & excepts; }	// masks detail::errflags() with excepts; does not modify the flags
+
+	/// Raise exception flags.
+	/// This raises the specified floating point exceptions and also invokes any additional automatic exception handling as 
+	/// configured with the [HALF_ERRHANDLING_...](\ref HALF_ERRHANDLING_ERRNO) preprocessor symbols.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::feraiseexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feraiseexcept).
+	/// \param excepts OR of exceptions to raise
+	/// \retval 0 all selected exceptions raised successfully
+	inline int feraiseexcept(int excepts) { detail::errflags() |= excepts; detail::raise(excepts); return 0; }	// sets the flags first, then runs detail::raise() on them; always returns 0
+
+	/// Save exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fegetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag).
+	/// \param flagp address to store flag state at
+	/// \param excepts OR of flags to save
+	/// \retval 0 for success
+	inline int fegetexceptflag(int *flagp, int excepts) { *flagp = detail::errflags() & excepts; return 0; }	// stores only the selected flag bits in *flagp; always returns 0
+
+	/// Restore exception flags.
+	/// This only copies the specified exception state (including unset flags) without incurring any additional exception handling.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fesetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag).
+	/// \param flagp address to take flag state from
+	/// \param excepts OR of flags to restore
+	/// \retval 0 for success
+	inline int fesetexceptflag(const int *flagp, int excepts) { detail::errflags() = (detail::errflags()|(*flagp&excepts)) & (*flagp|~excepts); return 0; }	// bits selected by excepts are copied from *flagp (set or cleared); all other bits keep their value
+
+	/// Throw C++ exceptions based on set exception flags.
+	/// This function manually throws a corresponding C++ exception if one of the specified flags is set, 
+	/// no matter if automatic throwing (via [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID)) is enabled or not.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	/// \param excepts OR of exceptions to test
+	/// \param msg error message to use for exception description
+	/// \throw std::domain_error if `FE_INVALID` or `FE_DIVBYZERO` is selected and set
+	/// \throw std::overflow_error if `FE_OVERFLOW` is selected and set
+	/// \throw std::underflow_error if `FE_UNDERFLOW` is selected and set
+	/// \throw std::range_error if `FE_INEXACT` is selected and set
+	inline void fethrowexcept(int excepts, const char *msg = "")
+	{
+		const int pending = detail::errflags() & excepts;	// selected flags that are currently set
+		if((pending & (FE_INVALID|FE_DIVBYZERO)) != 0)	// tested in a fixed order; the first match throws
+			throw std::domain_error(msg);
+		else if((pending & FE_OVERFLOW) != 0)
+			throw std::overflow_error(msg);
+		else if((pending & FE_UNDERFLOW) != 0)
+			throw std::underflow_error(msg);
+		else if((pending & FE_INEXACT) != 0)
+			throw std::range_error(msg);
+	}
+	/// \}
+}
+
+
+#undef HALF_UNUSED_NOERR	// from here on these helper macros are no longer defined
+#undef HALF_CONSTEXPR
+#undef HALF_CONSTEXPR_CONST
+#undef HALF_CONSTEXPR_NOERR
+#undef HALF_NOEXCEPT
+#undef HALF_NOTHROW
+#undef HALF_THREAD_LOCAL
+#undef HALF_TWOS_COMPLEMENT_INT
+#ifdef HALF_POP_WARNINGS	// warning state is popped only when this marker is defined (presumably set next to a matching push earlier in the file)
+	#pragma warning(pop)
+	#undef HALF_POP_WARNINGS
+#endif
+
+#endif
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/lodepng/lodepng.cpp b/c_cxx/ort_tutorial/30_syncstreams-cuda/lodepng/lodepng.cpp
new file mode 100644
index 000000000..43af8e73c
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/lodepng/lodepng.cpp
@@ -0,0 +1,6234 @@
+/*
+LodePNG version 20170917
+
+Copyright (c) 2005-2017 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+
+/*
+The manual and changelog are in the header file "lodepng.h"
+Rename this file to lodepng.cpp to use it for C++, or to lodepng.c to use it for C.
+*/
+
+#include "lodepng.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1310) /*Visual Studio: A few warning types are not desired here.*/
+#pragma warning( disable : 4244 ) /*implicit conversions: not warned by gcc -Wall -Wextra and requires too much casts*/
+#pragma warning( disable : 4996 ) /*VS does not like fopen, but fopen_s is not standard C so unusable here*/
+#endif /*_MSC_VER */
+
+const char* LODEPNG_VERSION_STRING = "20170917"; /*same version stamp as in the header comment of this file*/
+
+/*
+This source file is built up in the following large parts. The code sections
+with the "LODEPNG_COMPILE_" #defines divide this up further in an intermixed way.
+-Tools for C and common code for PNG and Zlib
+-C Code for Zlib (huffman, deflate, ...)
+-C Code for PNG (file format chunks, adam7, PNG filters, color conversions, ...)
+-The C++ wrapper around all of the above
+*/
+
+/*The malloc, realloc and free functions defined here with "lodepng_" in front
+of the name, so that you can easily change them to others related to your
+platform if needed. Everything else in the code calls these. Pass
+-DLODEPNG_NO_COMPILE_ALLOCATORS to the compiler, or comment out
+#define LODEPNG_COMPILE_ALLOCATORS in the header, to disable the ones here and
+define them in your own project's source files without needing to change
+lodepng source code. Don't forget to remove "static" if you copypaste them
+from here.*/
+
+#ifdef LODEPNG_COMPILE_ALLOCATORS
+static void* lodepng_malloc(size_t size) /*default hook: forwards unchanged to the C library malloc*/
+{
+    return malloc(size);
+}
+
+static void* lodepng_realloc(void* ptr, size_t new_size) /*default hook: forwards unchanged to the C library realloc*/
+{
+    return realloc(ptr, new_size);
+}
+
+static void lodepng_free(void* ptr) /*default hook: forwards unchanged to the C library free*/
+{
+    free(ptr);
+}
+#else /*LODEPNG_COMPILE_ALLOCATORS*/
+void* lodepng_malloc(size_t size); /*declaration only: the definition must come from another source file*/
+void* lodepng_realloc(void* ptr, size_t new_size); /*declaration only, see lodepng_malloc*/
+void lodepng_free(void* ptr); /*declaration only, see lodepng_malloc*/
+#endif /*LODEPNG_COMPILE_ALLOCATORS*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // Tools for C, and common code for PNG and Zlib.                       // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Often in case of an error a value is assigned to a variable and then it breaks
+out of a loop (to go to the cleanup phase of a function). This macro does that.
+It makes the error handling code shorter and more readable.
+
+The expansion is a brace block containing a "break" statement, so the macro
+only has the intended effect inside a loop (or switch). It must stay a plain
+brace block: wrapping it in do { } while(0) would make the "break" leave that
+wrapper instead of the caller's loop.
+
+Example: if(!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83);
+*/
+#define CERROR_BREAK(errorvar, code)\
+{\
+  errorvar = code;\
+  break;\
+}
+
+/*version of CERROR_BREAK that assumes the common case where the error variable is named "error"*/
+#define ERROR_BREAK(code) CERROR_BREAK(error, code)
+
+/*Set error var to the error code, and return it.
+Note that "code" appears twice in the expansion, so it is evaluated twice.*/
+#define CERROR_RETURN_ERROR(errorvar, code)\
+{\
+  errorvar = code;\
+  return code;\
+}
+
+/*Try the code, if it returns error, also return the error.
+The result of "call" is kept in a block-local variable named "error", which
+shadows any outer variable of that name for the duration of the check.*/
+#define CERROR_TRY_RETURN(call)\
+{\
+  unsigned error = call;\
+  if(error) return error;\
+}
+
+/*Set error var to the error code, and return from the void function.*/
+#define CERROR_RETURN(errorvar, code)\
+{\
+  errorvar = code;\
+  return;\
+}
+
+/*
+About uivector, ucvector and string:
+-All of them wrap dynamic arrays or text strings in a similar way.
+-LodePNG was originally written in C++. The vectors replace the std::vectors that were used in the C++ version.
+-The string tools are made to avoid problems with compilers that declare things like strncat as deprecated.
+-They're not used in the interface, only internally in this file as static functions.
+-As with many other structs in this file, the init and cleanup functions serve as ctor and dtor.
+*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*dynamic vector of unsigned ints*/
+typedef struct uivector
+{
+    unsigned* data; /*heap storage, obtained through lodepng_realloc; NULL while empty*/
+    size_t size; /*size in number of unsigned ints*/
+    size_t allocsize; /*allocated size in bytes*/
+} uivector;
+
+/*Releases the storage of a uivector and puts it back in the empty state.
+The argument is received as void* and treated as a uivector*.*/
+static void uivector_cleanup(void* p)
+{
+    uivector* vec = (uivector*)p;
+    vec->allocsize = 0;
+    vec->size = 0;
+    lodepng_free(vec->data);
+    vec->data = NULL;
+}
+
+/*Makes sure at least allocsize bytes are allocated for the vector.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_reserve(uivector* p, size_t allocsize)
+{
+    size_t grown;
+    void* moved;
+
+    if (allocsize <= p->allocsize) return 1; /*already large enough*/
+
+    /*a jump past double the current capacity is taken literally, smaller
+    growth gets 50% headroom on top of the request*/
+    grown = allocsize;
+    if (!(allocsize > p->allocsize * 2)) grown = allocsize * 3 / 2;
+
+    moved = lodepng_realloc(p->data, grown);
+    if (!moved) return 0; /*error: not enough memory*/
+
+    p->data = (unsigned*)moved;
+    p->allocsize = grown;
+    return 1;
+}
+
+/*Sets the element count to size, growing the allocation when needed.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_resize(uivector* p, size_t size)
+{
+    const size_t bytes = size * sizeof(unsigned);
+    if (uivector_reserve(p, bytes))
+    {
+        p->size = size;
+        return 1; /*success*/
+    }
+    return 0;
+}
+
+/*resize and give all new elements the value; returns 1 if success, 0 if the resize failed*/
+static unsigned uivector_resizev(uivector* p, size_t size, unsigned value)
+{
+    size_t next = p->size; /*first element that did not exist before the resize*/
+    if (!uivector_resize(p, size)) return 0;
+    while (next < size)
+    {
+        p->data[next] = value;
+        ++next;
+    }
+    return 1;
+}
+
+/*Puts a uivector in the empty state: no storage, zero elements.*/
+static void uivector_init(uivector* p)
+{
+    p->allocsize = 0;
+    p->size = 0;
+    p->data = NULL;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends c as the new last element.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_push_back(uivector* p, unsigned c)
+{
+    const size_t slot = p->size; /*index the new element will get*/
+    if (!uivector_resize(p, slot + 1)) return 0;
+    p->data[slot] = c;
+    return 1;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*dynamic vector of unsigned chars*/
+typedef struct ucvector
+{
+    unsigned char* data; /*the bytes; either obtained through lodepng_realloc or taken over by ucvector_init_buffer*/
+    size_t size; /*used size*/
+    size_t allocsize; /*allocated size*/
+} ucvector;
+
+/*Makes sure at least allocsize bytes are allocated for the vector.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_reserve(ucvector* p, size_t allocsize)
+{
+    size_t grown;
+    void* moved;
+
+    if (allocsize <= p->allocsize) return 1; /*already large enough*/
+
+    /*a jump past double the current capacity is taken literally, smaller
+    growth gets 50% headroom on top of the request*/
+    grown = allocsize;
+    if (!(allocsize > p->allocsize * 2)) grown = allocsize * 3 / 2;
+
+    moved = lodepng_realloc(p->data, grown);
+    if (!moved) return 0; /*error: not enough memory*/
+
+    p->data = (unsigned char*)moved;
+    p->allocsize = grown;
+    return 1;
+}
+
+/*Sets the used size to size bytes, growing the allocation when needed.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_resize(ucvector* p, size_t size)
+{
+    const size_t bytes = size * sizeof(unsigned char);
+    if (ucvector_reserve(p, bytes))
+    {
+        p->size = size;
+        return 1; /*success*/
+    }
+    return 0;
+}
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/*Releases the storage of a ucvector and puts it back in the empty state.
+The argument is received as void* and treated as a ucvector*.*/
+static void ucvector_cleanup(void* p)
+{
+    ucvector* vec = (ucvector*)p;
+    vec->allocsize = 0;
+    vec->size = 0;
+    lodepng_free(vec->data);
+    vec->data = NULL;
+}
+
+/*Puts a ucvector in the empty state: no storage, zero bytes used.*/
+static void ucvector_init(ucvector* p)
+{
+    p->allocsize = 0;
+    p->size = 0;
+    p->data = NULL;
+}
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*you can both convert from vector to buffer&size and vice versa. If you use
+init_buffer to take over a buffer and size, it is not needed to use cleanup.
+The vector simply adopts the given pointer; used and allocated size both become size.*/
+static void ucvector_init_buffer(ucvector* p, unsigned char* buffer, size_t size)
+{
+    p->size = size;
+    p->allocsize = size;
+    p->data = buffer;
+}
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#if (defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)
+/*Appends the byte c at the end of the vector.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_push_back(ucvector* p, unsigned char c)
+{
+    const size_t slot = p->size; /*index the new byte will get*/
+    if (!ucvector_resize(p, slot + 1)) return 0;
+    p->data[slot] = c;
+    return 1;
+}
+#endif /*(defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*Resizes the string in *out to hold size characters plus a terminating null.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned string_resize(char** out, size_t size)
+{
+    char* grown = (char*)lodepng_realloc(*out, size + 1);
+    if (!grown) return 0; /*the old string, if any, is still in *out*/
+
+    grown[size] = 0; /*null termination char*/
+    *out = grown;
+    return 1;
+}
+
+/*init a char* for use as string: it becomes an empty, null terminated string.
+If that allocation fails, *out is left as NULL.*/
+static void string_init(char** out)
+{
+    *out = NULL;
+    (void)string_resize(out, 0); /*result deliberately unused, see above*/
+}
+
+/*free the string again and leave a NULL pointer behind*/
+static void string_cleanup(char** out)
+{
+    char* owned = *out;
+    lodepng_free(owned);
+    *out = NULL;
+}
+
+/*Replaces the contents of *out with a copy of the null terminated string in.
+If the resize fails nothing is copied.*/
+static void string_set(char** out, const char* in)
+{
+    size_t remaining = strlen(in);
+    const char* src = in;
+    char* dst;
+
+    if (!string_resize(out, remaining)) return;
+
+    /*string_resize already wrote the terminator, only the characters are left*/
+    dst = *out;
+    while (remaining != 0)
+    {
+        *dst++ = *src++;
+        --remaining;
+    }
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Reads a 32-bit big endian unsigned integer from the first 4 bytes of buffer.
+Each byte is widened to unsigned before shifting: an unsigned char is promoted
+to (signed) int, and shifting a byte >= 0x80 left by 24 would move a bit into
+the sign bit of that int.*/
+unsigned lodepng_read32bitInt(const unsigned char* buffer)
+{
+    return ((unsigned)buffer[0] << 24) | ((unsigned)buffer[1] << 16)
+        | ((unsigned)buffer[2] << 8) | (unsigned)buffer[3];
+}
+
+#if defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)
+/*Stores value as 4 bytes in big endian order (most significant byte first).
+buffer must have at least 4 allocated bytes available*/
+static void lodepng_set32bitInt(unsigned char* buffer, unsigned value)
+{
+    int pos;
+    for (pos = 0; pos != 4; ++pos)
+    {
+        const int shift = 8 * (3 - pos); /*24, 16, 8, 0*/
+        buffer[pos] = (unsigned char)((value >> shift) & 0xff);
+    }
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends value as a 32-bit big endian integer at the end of buffer.
+If the vector cannot grow, nothing is written: the size is then unchanged, so
+writing at size - 4 would overwrite existing data or land before the start of
+the allocation.
+todo: give error if resize failed (the function has no way to report it)*/
+static void lodepng_add32bitInt(ucvector* buffer, unsigned value)
+{
+    if (!ucvector_resize(buffer, buffer->size + 4)) return;
+    lodepng_set32bitInt(&buffer->data[buffer->size - 4], value);
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / File IO                                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DISK
+
+/* Returns the size of the file in bytes, as reported by ftell after seeking to
+the end. returns negative value on error. This should be pure C compatible, so no fstat. */
+static long lodepng_filesize(const char* filename)
+{
+    long size = -1;
+    FILE* file = fopen(filename, "rb");
+    if (!file) return -1;
+
+    if (fseek(file, 0, SEEK_END) == 0)
+    {
+        size = ftell(file);
+        /* It may give LONG_MAX as directory size, this is invalid for us. */
+        if (size == LONG_MAX) size = -1;
+    }
+
+    fclose(file);
+    return size;
+}
+
+/* load file into buffer that already has the correct allocated size. Returns error code:
+0 when exactly size bytes were read, 78 when the file cannot be opened or
+yields a different number of bytes.*/
+static unsigned lodepng_buffer_file(unsigned char* out, size_t size, const char* filename)
+{
+    size_t got;
+    FILE* file = fopen(filename, "rb");
+    if (!file) return 78;
+
+    got = fread(out, 1, size, file);
+    fclose(file);
+
+    return (got == size) ? 0 : 78;
+}
+
+/*Loads a whole file into a newly allocated buffer.
+out: receives the buffer from lodepng_malloc (set even if reading fails afterwards)
+outsize: receives the file size in bytes
+return value: 0 on success, 78 if the file cannot be measured or read, 83 if the allocation failed*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename)
+{
+    size_t bytes;
+    const long measured = lodepng_filesize(filename);
+    if (measured < 0) return 78;
+
+    bytes = (size_t)measured;
+    *outsize = bytes;
+
+    *out = (unsigned char*)lodepng_malloc(bytes);
+    /*a NULL result is only an error when bytes were actually requested*/
+    if (bytes > 0 && !(*out)) return 83; /*the above malloc failed*/
+
+    return lodepng_buffer_file(*out, bytes, filename);
+}
+
+/*write given buffer to the file, overwriting the file, it doesn't append to it.
+return value: 0 when all buffersize bytes were written and the file was closed
+without error, 79 when the file cannot be opened for writing or the data could
+not be written out completely (for example because the device is full).*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename)
+{
+    size_t written;
+    int closefailed;
+    FILE* file = fopen(filename, "wb");
+    if (!file) return 79;
+
+    written = fwrite((const char*)buffer, 1, buffersize, file);
+    /*buffered data is flushed by fclose, so a write error can surface only there*/
+    closefailed = (fclose(file) != 0);
+
+    if (written != buffersize || closefailed) return 79;
+    return 0;
+}
+
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of common code and tools. Begin of Zlib related code.            // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends a single bit to the bit stream held in a ucvector. *bitpointer counts
+the bits written so far: whenever it is a multiple of 8 a fresh zero byte is
+pushed first, then the bit is OR-ed into the last byte at position
+(*bitpointer & 7) and *bitpointer is incremented.
+TODO: this ignores potential out of memory errors (the result of
+ucvector_push_back is not checked before the last byte is written to)*/
+#define addBitToStream(/*size_t**/ bitpointer, /*ucvector**/ bitstream, /*unsigned char*/ bit)\
+{\
+  /*add a new byte at the end*/\
+  if(((*bitpointer) & 7) == 0) ucvector_push_back(bitstream, (unsigned char)0);\
+  /*earlier bit of huffman code is in a lesser significant bit of an earlier byte*/\
+  (bitstream->data[bitstream->size - 1]) |= (bit << ((*bitpointer) & 0x7));\
+  ++(*bitpointer);\
+}
+
+/*Appends the nbits lowest bits of value to the stream, least significant bit first.*/
+static void addBitsToStream(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits)
+{
+    size_t shift = 0;
+    while (shift != nbits)
+    {
+        addBitToStream(bitpointer, bitstream, (unsigned char)((value >> shift) & 1));
+        ++shift;
+    }
+}
+
+/*Appends the nbits lowest bits of value to the stream, most significant of those bits first.*/
+static void addBitsToStreamReversed(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits)
+{
+    size_t shift = nbits;
+    while (shift != 0)
+    {
+        --shift; /*runs nbits - 1 down to 0*/
+        addBitToStream(bitpointer, bitstream, (unsigned char)((value >> shift) & 1));
+    }
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+#define READBIT(bitpointer, bitstream) ((bitstream[bitpointer >> 3] >> (bitpointer & 0x7)) & (unsigned char)1) /*bit number bitpointer of the stream: byte bitpointer / 8, counting bits from the least significant one*/
+
+/*Returns the bit at position *bitpointer (0 or 1) and advances *bitpointer by one.*/
+static unsigned char readBitFromStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+    const size_t position = (*bitpointer)++;
+    return (unsigned char)(READBIT(position, bitstream));
+}
+
+/*Reads nbits bits starting at *bitpointer and returns them as a number in which
+the first bit read is the least significant one. Advances *bitpointer by nbits.*/
+static unsigned readBitsFromStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+    unsigned value = 0, shift;
+    for (shift = 0; shift != nbits; ++shift, ++(*bitpointer))
+    {
+        /*every bit goes to its own position, so OR-ing accumulates the number*/
+        value |= ((unsigned)READBIT(*bitpointer, bitstream)) << shift;
+    }
+    return value;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflate - Huffman                                                      / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#define FIRST_LENGTH_CODE_INDEX 257 /*first symbol of the literal/length alphabet that stands for a length*/
+#define LAST_LENGTH_CODE_INDEX 285 /*last symbol that stands for a length*/
+/*256 literals, the end code, some length codes, and 2 unused codes*/
+#define NUM_DEFLATE_CODE_SYMBOLS 288
+/*the distance codes have their own symbols, 30 used, 2 unused*/
+#define NUM_DISTANCE_SYMBOLS 32
+/*the code length codes. 0-15: code lengths, 16: copy previous 3-6 times, 17: 3-10 zeros, 18: 11-138 zeros*/
+#define NUM_CODE_LENGTH_CODES 19
+
+/*the base lengths represented by codes 257-285
+(entry 0 belongs to code 257, i.e. the index is code - FIRST_LENGTH_CODE_INDEX)*/
+static const unsigned LENGTHBASE[29]
+= { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59,
+67, 83, 99, 115, 131, 163, 195, 227, 258 };
+
+/*the extra bits used by codes 257-285 (added to base length)*/
+static const unsigned LENGTHEXTRA[29]
+= { 0, 0, 0, 0, 0, 0, 0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+4,  4,  4,   4,   5,   5,   5,   5,   0 };
+
+/*the base backwards distances (the bits of distance codes appear after length codes and use their own huffman tree)*/
+static const unsigned DISTANCEBASE[30]
+= { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
+769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577 };
+
+/*the extra bits of backwards distances (added to base)*/
+static const unsigned DISTANCEEXTRA[30]
+= { 0, 0, 0, 0, 1, 1, 2,  2,  3,  3,  4,  4,  5,  5,   6,   6,   7,   7,   8,
+8,    9,    9,   10,   10,   11,   11,   12,    12,    13,    13 };
+
+/*the order in which "code length alphabet code lengths" are stored, out of this
+the huffman tree of the dynamic huffman tree lengths is generated*/
+static const unsigned CLCL_ORDER[NUM_CODE_LENGTH_CODES]
+= { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Huffman tree struct, containing multiple representations of the tree.
+The three arrays are heap allocated; HuffmanTree_init sets them to 0 and
+HuffmanTree_cleanup frees them.
+*/
+typedef struct HuffmanTree
+{
+    unsigned* tree2d; /*numcodes * 2 entries, filled in by HuffmanTree_make2DTree for the decoder*/
+    unsigned* tree1d; /*the code of each symbol, numcodes entries*/
+    unsigned* lengths; /*the lengths of the codes of the 1d-tree*/
+    unsigned maxbitlen; /*maximum number of bits a single code can get*/
+    unsigned numcodes; /*number of symbols in the alphabet = number of codes*/
+} HuffmanTree;
+
+/*function used for debug purposes to draw the tree in ascii art with C++*/
+/*
+static void HuffmanTree_draw(HuffmanTree* tree)
+{
+std::cout << "tree. length: " << tree->numcodes << " maxbitlen: " << tree->maxbitlen << std::endl;
+for(size_t i = 0; i != tree->tree1d.size; ++i)
+{
+if(tree->lengths.data[i])
+std::cout << i << " " << tree->tree1d.data[i] << " " << tree->lengths.data[i] << std::endl;
+}
+std::cout << std::endl;
+}*/
+
+/*Clears the three array pointers of the tree so that HuffmanTree_cleanup is
+safe to call. maxbitlen and numcodes are not touched here.*/
+static void HuffmanTree_init(HuffmanTree* tree)
+{
+    tree->lengths = NULL;
+    tree->tree1d = NULL;
+    tree->tree2d = NULL;
+}
+
+/*Frees the three arrays owned by the tree. The pointers themselves are not
+reset, so the tree must be initialized again before any reuse.*/
+static void HuffmanTree_cleanup(HuffmanTree* tree)
+{
+    unsigned* const table2d = tree->tree2d;
+    unsigned* const codes = tree->tree1d;
+    unsigned* const bitlengths = tree->lengths;
+
+    lodepng_free(table2d);
+    lodepng_free(codes);
+    lodepng_free(bitlengths);
+}
+
+/*the tree representation used by the decoder. return value is error:
+0 on success, 83 if tree2d cannot be allocated, 55 if the code lengths need
+more nodes than the numcodes - 1 the table has room for (oversubscribed).
+Reads tree->tree1d, tree->lengths and tree->numcodes; allocates and fills
+tree->tree2d (numcodes * 2 entries).*/
+static unsigned HuffmanTree_make2DTree(HuffmanTree* tree)
+{
+    unsigned nodefilled = 0; /*up to which node it is filled*/
+    unsigned treepos = 0; /*position in the tree (1 of the numcodes columns)*/
+    unsigned n, i;
+
+    tree->tree2d = (unsigned*)lodepng_malloc(tree->numcodes * 2 * sizeof(unsigned));
+    if (!tree->tree2d) return 83; /*alloc fail*/
+
+    /*
+    convert tree1d[] to tree2d[][]. In the 2D array, a value of 32767 means
+    uninited, a value >= numcodes is an address to another bit, a value < numcodes
+    is a code. The 2 rows are the 2 possible bit values (0 or 1), there are as
+    many columns as codes - 1.
+    A good huffman tree has N * 2 - 1 nodes, of which N - 1 are internal nodes.
+    Here, the internal nodes are stored (what their 0 and 1 option point to).
+    There is only memory for such good tree currently, if there are more nodes
+    (due to too long length codes), error 55 will happen
+    */
+    for (n = 0; n < tree->numcodes * 2; ++n)
+    {
+        tree->tree2d[n] = 32767; /*32767 here means the tree2d isn't filled there yet*/
+    }
+
+    for (n = 0; n < tree->numcodes; ++n) /*the codes*/
+    {
+        for (i = 0; i != tree->lengths[n]; ++i) /*the bits for this code*/
+        {
+            unsigned char bit = (unsigned char)((tree->tree1d[n] >> (tree->lengths[n] - i - 1)) & 1);
+            /*oversubscribed, see comment in lodepng_error_text*/
+            if (treepos > 2147483647 || treepos + 2 > tree->numcodes) return 55;
+            if (tree->tree2d[2 * treepos + bit] == 32767) /*not yet filled in*/
+            {
+                if (i + 1 == tree->lengths[n]) /*last bit*/
+                {
+                    tree->tree2d[2 * treepos + bit] = n; /*put the current code in it*/
+                    treepos = 0;
+                }
+                else
+                {
+                    /*put address of the next step in here, first that address has to be found of course
+                    (it's just nodefilled + 1)...*/
+                    ++nodefilled;
+                    /*addresses encoded with numcodes added to it*/
+                    tree->tree2d[2 * treepos + bit] = nodefilled + tree->numcodes;
+                    treepos = nodefilled;
+                }
+            }
+            else treepos = tree->tree2d[2 * treepos + bit] - tree->numcodes;
+        }
+    }
+
+    for (n = 0; n < tree->numcodes * 2; ++n)
+    {
+        if (tree->tree2d[n] == 32767) tree->tree2d[n] = 0; /*remove possible remaining 32767's*/
+    }
+
+    return 0;
+}
+
+/*
+Second step for the ...makeFromLengths and ...makeFromFrequencies functions.
+numcodes, lengths and maxbitlen must already be filled in correctly. return
+value is error.
+
+Assigns the codes to tree->tree1d following the algorithm of RFC 1951 section
+3.2.2 and then builds tree->tree2d with HuffmanTree_make2DTree. Symbols with
+length 0 do not take part in the code: their tree1d entry is set to 0.
+Errors: 83 if an allocation fails, otherwise whatever HuffmanTree_make2DTree returns.
+*/
+static unsigned HuffmanTree_makeFromLengths2(HuffmanTree* tree)
+{
+    uivector blcount;
+    uivector nextcode;
+    unsigned error = 0;
+    unsigned bits, n;
+
+    uivector_init(&blcount);
+    uivector_init(&nextcode);
+
+    tree->tree1d = (unsigned*)lodepng_malloc(tree->numcodes * sizeof(unsigned));
+    if (!tree->tree1d) error = 83; /*alloc fail*/
+
+    if (!error
+        && (!uivector_resizev(&blcount, tree->maxbitlen + 1, 0)
+            || !uivector_resizev(&nextcode, tree->maxbitlen + 1, 0)))
+        error = 83; /*alloc fail*/
+
+    if (!error)
+    {
+        /*step 1: count number of instances of each code length*/
+        for (n = 0; n != tree->numcodes; ++n) ++blcount.data[tree->lengths[n]];
+        /*length 0 marks an unused symbol. Those must not be counted, or every
+        code would carry extra bits above its own length*/
+        blcount.data[0] = 0;
+        /*step 2: generate the nextcode values*/
+        for (bits = 1; bits <= tree->maxbitlen; ++bits)
+        {
+            nextcode.data[bits] = (nextcode.data[bits - 1] + blcount.data[bits - 1]) << 1;
+        }
+        /*step 3: generate all the codes*/
+        for (n = 0; n != tree->numcodes; ++n)
+        {
+            if (tree->lengths[n] != 0) tree->tree1d[n] = nextcode.data[tree->lengths[n]]++;
+            else tree->tree1d[n] = 0; /*unused symbol: keep the entry defined*/
+        }
+    }
+
+    uivector_cleanup(&blcount);
+    uivector_cleanup(&nextcode);
+
+    if (!error) return HuffmanTree_make2DTree(tree);
+    else return error;
+}
+
+/*
+given the code lengths (as stored in the PNG file), generate the tree as defined
+by Deflate. maxbitlen is the maximum bits that a code in the tree can have.
+The numcodes lengths in bitlen are copied into a new array owned by the tree.
+return value is error.
+*/
+static unsigned HuffmanTree_makeFromLengths(HuffmanTree* tree, const unsigned* bitlen,
+    size_t numcodes, unsigned maxbitlen)
+{
+    size_t symbol;
+    unsigned* copy = (unsigned*)lodepng_malloc(numcodes * sizeof(unsigned));
+
+    tree->lengths = copy;
+    if (!copy) return 83; /*alloc fail*/
+
+    for (symbol = 0; symbol < numcodes; ++symbol) copy[symbol] = bitlen[symbol];
+
+    tree->maxbitlen = maxbitlen;
+    tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+    return HuffmanTree_makeFromLengths2(tree);
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*BPM: Boundary Package Merge, see "A Fast and Space-Economical Algorithm for Length-Limited Coding",
+Jyrki Katajainen, Alistair Moffat, Andrew Turpin, 1995.*/
+
+/*chain node for boundary package merge*/
+typedef struct BPMNode
+{
+    int weight; /*the sum of all weights in this chain*/
+    unsigned index; /*index of this leaf node (called "count" in the paper)*/
+    struct BPMNode* tail; /*the next nodes in this chain (null if last)*/
+    int in_use; /*mark written by the garbage collection pass of bpmnode_create: 1 if reachable from a chain head*/
+} BPMNode;
+
+/*lists of chains*/
+typedef struct BPMLists
+{
+    /*memory pool*/
+    unsigned memsize; /*number of nodes in memory, and of entries in freelist*/
+    BPMNode* memory; /*the pool that all chain nodes are taken from*/
+    unsigned numfree; /*number of valid entries in freelist*/
+    unsigned nextfree; /*index in freelist of the next node to hand out*/
+    BPMNode** freelist; /*pointers to the pool nodes that are available*/
+    /*two heads of lookahead chains per list*/
+    unsigned listsize; /*number of lists, i.e. entries in chains0 and chains1*/
+    BPMNode** chains0;
+    BPMNode** chains1;
+} BPMLists;
+
+/*creates a new chain node with the given parameters, from the memory in the lists.
+When the free list is used up, first a mark-and-collect pass runs: every node
+reachable from a chain head in chains0/chains1 is kept, all other pool nodes
+are put back on the free list.*/
+static BPMNode* bpmnode_create(BPMLists* lists, int weight, unsigned index, BPMNode* tail)
+{
+    BPMNode* fresh;
+
+    if (lists->nextfree >= lists->numfree)
+    {
+        /*memory full, so garbage collect*/
+        unsigned slot, list;
+        BPMNode* walker;
+
+        /*mark phase: clear all marks, then mark whatever the chains still reach*/
+        for (slot = 0; slot != lists->memsize; ++slot) lists->memory[slot].in_use = 0;
+        for (list = 0; list != lists->listsize; ++list)
+        {
+            walker = lists->chains0[list];
+            while (walker != 0)
+            {
+                walker->in_use = 1;
+                walker = walker->tail;
+            }
+            walker = lists->chains1[list];
+            while (walker != 0)
+            {
+                walker->in_use = 1;
+                walker = walker->tail;
+            }
+        }
+
+        /*collect phase: rebuild the free list from the unmarked nodes*/
+        lists->numfree = 0;
+        for (slot = 0; slot != lists->memsize; ++slot)
+        {
+            if (lists->memory[slot].in_use) continue;
+            lists->freelist[lists->numfree] = &lists->memory[slot];
+            ++lists->numfree;
+        }
+        lists->nextfree = 0;
+    }
+
+    fresh = lists->freelist[lists->nextfree];
+    ++lists->nextfree;
+    fresh->weight = weight;
+    fresh->index = index;
+    fresh->tail = tail;
+    return fresh;
+}
+
+/*sort the leaves by ascending weight with a stable mergesort.
+The merge needs a scratch array of num nodes. If that cannot be allocated the
+function falls back to an in-place insertion sort, which is stable as well and
+therefore gives the same order; there is no way to report an error from here.*/
+static void bpmnode_sort(BPMNode* leaves, size_t num)
+{
+    BPMNode* mem;
+    size_t width, counter = 0;
+
+    if (num < 2) return; /*already sorted, and no zero-size allocation*/
+
+    mem = (BPMNode*)lodepng_malloc(sizeof(*leaves) * num);
+    if (!mem)
+    {
+        size_t i;
+        for (i = 1; i < num; ++i)
+        {
+            BPMNode key = leaves[i];
+            size_t j = i;
+            /*strictly greater: equal weights keep their original order*/
+            while (j > 0 && leaves[j - 1].weight > key.weight)
+            {
+                leaves[j] = leaves[j - 1];
+                --j;
+            }
+            leaves[j] = key;
+        }
+        return;
+    }
+
+    for (width = 1; width < num; width *= 2)
+    {
+        /*source and destination swap roles on every pass*/
+        BPMNode* a = (counter & 1) ? mem : leaves;
+        BPMNode* b = (counter & 1) ? leaves : mem;
+        size_t p;
+        for (p = 0; p < num; p += 2 * width)
+        {
+            size_t q = (p + width > num) ? num : (p + width);
+            size_t r = (p + 2 * width > num) ? num : (p + 2 * width);
+            size_t i = p, j = q, k;
+            for (k = p; k < r; k++)
+            {
+                if (i < q && (j >= r || a[i].weight <= a[j].weight)) b[k] = a[i++];
+                else b[k] = a[j++];
+            }
+        }
+        counter++;
+    }
+    /*after an odd number of passes the sorted data sits in the scratch array*/
+    if (counter & 1) memcpy(leaves, mem, sizeof(*leaves) * num);
+    lodepng_free(mem);
+}
+
+/*Boundary Package Merge step, numpresent is the amount of leaves, and c is the current chain.
+lists holds the two lookahead chain heads of every list and the node pool,
+leaves are the symbols sorted by weight, and num is the loop counter of the
+caller (it is only used to decide whether recursing is still needed).
+The function moves chains1[c] to chains0[c] and creates a new chains1[c] node:
+either the next leaf, or a package made of the two heads of list c - 1, in
+which case list c - 1 is advanced twice by recursion.*/
+static void boundaryPM(BPMLists* lists, BPMNode* leaves, size_t numpresent, int c, int num)
+{
+    unsigned lastindex = lists->chains1[c]->index;
+
+    if (c == 0)
+    {
+        if (lastindex >= numpresent) return;
+        lists->chains0[c] = lists->chains1[c];
+        lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, 0);
+    }
+    else
+    {
+        /*sum of the weights of the head nodes of the previous lookahead chains.*/
+        int sum = lists->chains0[c - 1]->weight + lists->chains1[c - 1]->weight;
+        lists->chains0[c] = lists->chains1[c];
+        if (lastindex < numpresent && sum > leaves[lastindex].weight)
+        {
+            lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, lists->chains1[c]->tail);
+            return;
+        }
+        lists->chains1[c] = bpmnode_create(lists, sum, lastindex, lists->chains1[c - 1]);
+        /*in the end we are only interested in the chain of the last list, so no
+        need to recurse if we're at the last one (this gives measurable speedup)*/
+        if (num + 1 < (int)(2 * numpresent - 2))
+        {
+            boundaryPM(lists, leaves, numpresent, c - 1, num);
+            boundaryPM(lists, leaves, numpresent, c - 1, num);
+        }
+    }
+}
+
+/*Computes length-limited Huffman code lengths with the boundary package merge algorithm.
+lengths: output, numcodes entries; symbols with frequency 0 get length 0
+  (except for the padding to two symbols described below)
+frequencies: input, numcodes entries
+numcodes: number of symbols in the alphabet
+maxbitlen: no code gets a length above this
+return value: 0 on success, 80 if numcodes is 0 or cannot be represented with
+  maxbitlen bits, 83 if an allocation failed*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+    size_t numcodes, unsigned maxbitlen)
+{
+    unsigned error = 0;
+    unsigned i;
+    size_t numpresent = 0; /*number of symbols with non-zero frequency*/
+    BPMNode* leaves; /*the symbols, only those with > 0 frequency*/
+
+    if (numcodes == 0) return 80; /*error: a tree of 0 symbols is not supposed to be made*/
+    if ((1u << maxbitlen) < numcodes) return 80; /*error: represent all symbols*/
+
+    leaves = (BPMNode*)lodepng_malloc(numcodes * sizeof(*leaves));
+    if (!leaves) return 83; /*alloc fail*/
+
+    for (i = 0; i != numcodes; ++i)
+    {
+        if (frequencies[i] > 0)
+        {
+            leaves[numpresent].weight = (int)frequencies[i];
+            leaves[numpresent].index = i;
+            ++numpresent;
+        }
+    }
+
+    for (i = 0; i != numcodes; ++i) lengths[i] = 0;
+
+    /*ensure at least two present symbols. There should be at least one symbol
+    according to RFC 1951 section 3.2.7. Some decoders incorrectly require two. To
+    make these work as well ensure there are at least two symbols. The
+    Package-Merge code below also doesn't work correctly if there's only one
+    symbol, it'd give it the theoretical 0 bits but in practice zlib wants 1 bit.
+    The second symbol is only added when the alphabet has room for it: with
+    numcodes == 1 there is no lengths[1] to write to.*/
+    if (numpresent == 0)
+    {
+        lengths[0] = 1; /*note that for RFC 1951 section 3.2.7, only lengths[0] = 1 is needed*/
+        if (numcodes > 1) lengths[1] = 1;
+    }
+    else if (numpresent == 1)
+    {
+        lengths[leaves[0].index] = 1;
+        if (numcodes > 1) lengths[leaves[0].index == 0 ? 1 : 0] = 1;
+    }
+    else
+    {
+        BPMLists lists;
+        BPMNode* node;
+
+        bpmnode_sort(leaves, numpresent);
+
+        lists.listsize = maxbitlen;
+        lists.memsize = 2 * maxbitlen * (maxbitlen + 1);
+        lists.nextfree = 0;
+        lists.numfree = lists.memsize;
+        lists.memory = (BPMNode*)lodepng_malloc(lists.memsize * sizeof(*lists.memory));
+        lists.freelist = (BPMNode**)lodepng_malloc(lists.memsize * sizeof(BPMNode*));
+        lists.chains0 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+        lists.chains1 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+        if (!lists.memory || !lists.freelist || !lists.chains0 || !lists.chains1) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            for (i = 0; i != lists.memsize; ++i) lists.freelist[i] = &lists.memory[i];
+
+            /*the first two pool nodes become the two lightest leaves; they are
+            the initial chain heads of every list*/
+            bpmnode_create(&lists, leaves[0].weight, 1, 0);
+            bpmnode_create(&lists, leaves[1].weight, 2, 0);
+
+            for (i = 0; i != lists.listsize; ++i)
+            {
+                lists.chains0[i] = &lists.memory[0];
+                lists.chains1[i] = &lists.memory[1];
+            }
+
+            /*each boundaryPM call adds one chain to the last list, and we need 2 * numpresent - 2 chains.*/
+            for (i = 2; i != 2 * numpresent - 2; ++i) boundaryPM(&lists, leaves, numpresent, (int)maxbitlen - 1, (int)i);
+
+            /*every node in the final chain adds one bit to its first "index" leaves*/
+            for (node = lists.chains1[maxbitlen - 1]; node; node = node->tail)
+            {
+                for (i = 0; i != node->index; ++i) ++lengths[leaves[i].index];
+            }
+        }
+
+        lodepng_free(lists.memory);
+        lodepng_free(lists.freelist);
+        lodepng_free(lists.chains0);
+        lodepng_free(lists.chains1);
+    }
+
+    lodepng_free(leaves);
+    return error;
+}
+
+/*Create the Huffman tree given the symbol frequencies.
+Trailing symbols with frequency 0 are dropped from the alphabet, but never
+below mincodes symbols. tree->lengths is (re)allocated for the remaining
+numcodes symbols.
+return value: 0 on success, 80 if no symbols are left, 83 if the allocation
+failed (tree->lengths is then freed and NULL), otherwise the error of the
+length computation or of HuffmanTree_makeFromLengths2.*/
+static unsigned HuffmanTree_makeFromFrequencies(HuffmanTree* tree, const unsigned* frequencies,
+    size_t mincodes, size_t numcodes, unsigned maxbitlen)
+{
+    unsigned error = 0;
+    unsigned* lengths;
+
+    /*trim zeroes. The bound is tested first so that frequencies[numcodes - 1]
+    is never read with numcodes == 0*/
+    while (numcodes > mincodes && !frequencies[numcodes - 1]) --numcodes;
+    if (numcodes == 0) return 80; /*error: a tree of 0 symbols is not supposed to be made*/
+
+    tree->maxbitlen = maxbitlen;
+    tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+
+    /*go through a temporary: on failure realloc leaves the old block allocated,
+    and assigning its NULL result to tree->lengths directly would leak that block*/
+    lengths = (unsigned*)lodepng_realloc(tree->lengths, numcodes * sizeof(unsigned));
+    if (!lengths)
+    {
+        lodepng_free(tree->lengths);
+        tree->lengths = NULL;
+        return 83; /*alloc fail*/
+    }
+    tree->lengths = lengths;
+    /*initialize all lengths to 0*/
+    memset(tree->lengths, 0, numcodes * sizeof(unsigned));
+
+    error = lodepng_huffman_code_lengths(tree->lengths, frequencies, numcodes, maxbitlen);
+    if (!error) error = HuffmanTree_makeFromLengths2(tree);
+    return error;
+}
+
+static unsigned HuffmanTree_getCode(const HuffmanTree* tree, unsigned index)
+{
+    return tree->tree1d[index]; /*entry of the tree1d table for symbol index; no bounds check is done*/
+}
+
+static unsigned HuffmanTree_getLength(const HuffmanTree* tree, unsigned index)
+{
+    return tree->lengths[index]; /*entry of the lengths table for symbol index; no bounds check is done*/
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*build the literal/length tree that the deflate specification prescribes for blocks with a fixed tree*/
+static unsigned generateFixedLitLenTree(HuffmanTree* tree)
+{
+    unsigned symbol, result;
+    unsigned* lengths = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+    if (lengths == 0) return 83; /*alloc fail*/
+    /*288 possible codes: 0-255=literals, 256=endcode, 257-285=lengthcodes, 286-287=unused*/
+    for (symbol = 0; symbol <= 287; ++symbol)
+    {
+        if (symbol <= 143) lengths[symbol] = 8;
+        else if (symbol <= 255) lengths[symbol] = 9;
+        else if (symbol <= 279) lengths[symbol] = 7;
+        else lengths[symbol] = 8;
+    }
+    result = HuffmanTree_makeFromLengths(tree, lengths, NUM_DEFLATE_CODE_SYMBOLS, 15);
+    lodepng_free(lengths); /*the temporary length table is released on success and on failure*/
+    return result;
+}
+
+/*build the distance tree that the deflate specification prescribes for blocks with a fixed tree*/
+static unsigned generateFixedDistanceTree(HuffmanTree* tree)
+{
+    unsigned symbol = 0, result;
+    unsigned* lengths = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+    if (lengths == 0) return 83; /*alloc fail*/
+
+    /*there are 32 distance codes, but 30-31 are unused; every one of them gets a length of 5 bits*/
+    while (symbol != NUM_DISTANCE_SYMBOLS) lengths[symbol++] = 5;
+    result = HuffmanTree_makeFromLengths(tree, lengths, NUM_DISTANCE_SYMBOLS, 15);
+
+    lodepng_free(lengths); /*the temporary length table is released on success and on failure*/
+    return result;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Decodes one symbol by walking codetree one input bit at a time, advancing *bp past every bit it consumes.
+Returns the symbol, or (unsigned)(-1) on error. inbitlength is the length of the complete buffer, in bits.
+*/
+static unsigned huffmanDecodeSymbol(const unsigned char* in, size_t* bp,
+    const HuffmanTree* codetree, size_t inbitlength)
+{
+    const unsigned symbolcount = codetree->numcodes;
+    unsigned node = 0;
+    while (*bp < inbitlength)
+    {
+        /*
+        follow the branch selected by the next input bit; the bit is fetched with the READBIT macro
+        directly inside the index expression
+        */
+        unsigned next = codetree->tree2d[(node << 1) + READBIT(*bp, in)];
+        ++(*bp);
+        if (next < symbolcount) return next; /*the symbol is decoded, return it*/
+        node = next - symbolcount; /*symbol not yet decoded, instead move tree position*/
+        if (node >= symbolcount) break; /*error: it appeared outside the codetree*/
+    }
+    return (unsigned)(-1); /*error: end of input reached without a symbol, or a jump outside the tree*/
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Inflator (Decompressor)                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*fill tree_ll and tree_d with the trees of a deflated block with fixed tree, as specified in the deflate specification*/
+static void getTreeInflateFixed(HuffmanTree* tree_ll, HuffmanTree* tree_d)
+{
+    /*TODO: check for out of memory errors; the error codes of both helpers are currently discarded*/
+    (void)generateFixedLitLenTree(tree_ll);
+    (void)generateFixedDistanceTree(tree_d);
+}
+
+/*get the trees of a deflated block with dynamic tree into tree_ll and tree_d, reading from bit position *bp; the tree itself is also Huffman compressed with a known tree. Returns 0 or an error code.*/
+static unsigned getTreeInflateDynamic(HuffmanTree* tree_ll, HuffmanTree* tree_d,
+    const unsigned char* in, size_t* bp, size_t inlength)
+{
+    /*make sure that length values that aren't filled in will be 0, or a wrong tree will be generated*/
+    unsigned error = 0;
+    unsigned n, HLIT, HDIST, HCLEN, i;
+    size_t inbitlength = inlength * 8;
+
+    /*see comments in deflateDynamic for explanation of the context and these variables, it is analogous*/
+    unsigned* bitlen_ll = 0; /*lit,len code lengths*/
+    unsigned* bitlen_d = 0; /*dist code lengths*/
+    /*code length code lengths ("clcl"), the bit lengths of the huffman tree used to compress bitlen_ll and bitlen_d*/
+    unsigned* bitlen_cl = 0;
+    HuffmanTree tree_cl; /*the code tree for code length codes (the huffman tree for compressed huffman trees)*/
+
+    if ((*bp) + 14 > (inlength << 3)) return 49; /*error: the bit pointer is or will go past the memory*/
+
+    /*number of literal/length codes + 257. Unlike the spec, the value 257 is added to it here already*/
+    HLIT = readBitsFromStream(bp, in, 5) + 257;
+    /*number of distance codes. Unlike the spec, the value 1 is added to it here already*/
+    HDIST = readBitsFromStream(bp, in, 5) + 1;
+    /*number of code length codes. Unlike the spec, the value 4 is added to it here already*/
+    HCLEN = readBitsFromStream(bp, in, 4) + 4;
+
+    if ((*bp) + HCLEN * 3 > (inlength << 3)) return 50; /*error: the bit pointer is or will go past the memory*/
+
+    HuffmanTree_init(&tree_cl);
+
+    while (!error)
+    {
+        /*read the code length codes out of 3 * (amount of code length codes) bits*/
+
+        bitlen_cl = (unsigned*)lodepng_malloc(NUM_CODE_LENGTH_CODES * sizeof(unsigned));
+        if (!bitlen_cl) ERROR_BREAK(83 /*alloc fail*/);
+
+        for (i = 0; i != NUM_CODE_LENGTH_CODES; ++i)
+        {
+            if (i < HCLEN) bitlen_cl[CLCL_ORDER[i]] = readBitsFromStream(bp, in, 3);
+            else bitlen_cl[CLCL_ORDER[i]] = 0; /*if not, it must stay 0*/
+        }
+
+        error = HuffmanTree_makeFromLengths(&tree_cl, bitlen_cl, NUM_CODE_LENGTH_CODES, 7);
+        if (error) break;
+
+        /*now we can use this tree to read the lengths for the tree that this function will return*/
+        bitlen_ll = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+        bitlen_d = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+        if (!bitlen_ll || !bitlen_d) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != NUM_DEFLATE_CODE_SYMBOLS; ++i) bitlen_ll[i] = 0;
+        for (i = 0; i != NUM_DISTANCE_SYMBOLS; ++i) bitlen_d[i] = 0;
+
+        /*i is the current symbol we're reading in the part that contains the code lengths of lit/len and dist codes*/
+        i = 0;
+        while (i < HLIT + HDIST)
+        {
+            unsigned code = huffmanDecodeSymbol(in, bp, &tree_cl, inbitlength);
+            if (code <= 15) /*a length code*/
+            {
+                if (i < HLIT) bitlen_ll[i] = code;
+                else bitlen_d[i - HLIT] = code;
+                ++i;
+            }
+            else if (code == 16) /*repeat previous*/
+            {
+                unsigned replength = 3; /*read in the 2 bits that indicate repeat length (3-6)*/
+                unsigned value; /*set value to the previous code*/
+
+                if (i == 0) ERROR_BREAK(54); /*can't repeat previous if i is 0*/
+
+                if ((*bp + 2) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 2);
+
+                if (i < HLIT + 1) value = bitlen_ll[i - 1];
+                else value = bitlen_d[i - HLIT - 1];
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(13); /*error: i is larger than the amount of codes*/
+                    if (i < HLIT) bitlen_ll[i] = value;
+                    else bitlen_d[i - HLIT] = value;
+                    ++i;
+                }
+            }
+            else if (code == 17) /*repeat "0" 3-10 times*/
+            {
+                unsigned replength = 3; /*read in the bits that indicate repeat length*/
+                if ((*bp + 3) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 3);
+
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(14); /*error: i is larger than the amount of codes*/
+
+                    if (i < HLIT) bitlen_ll[i] = 0;
+                    else bitlen_d[i - HLIT] = 0;
+                    ++i;
+                }
+            }
+            else if (code == 18) /*repeat "0" 11-138 times*/
+            {
+                unsigned replength = 11; /*read in the bits that indicate repeat length*/
+                if ((*bp + 7) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+                replength += readBitsFromStream(bp, in, 7);
+
+                /*repeat this value in the next lengths*/
+                for (n = 0; n < replength; ++n)
+                {
+                    if (i >= HLIT + HDIST) ERROR_BREAK(15); /*error: i is larger than the amount of codes*/
+
+                    if (i < HLIT) bitlen_ll[i] = 0;
+                    else bitlen_d[i - HLIT] = 0;
+                    ++i;
+                }
+            }
+            else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+            {
+                if (code == (unsigned)(-1))
+                {
+                    /*return error code 10 or 11 (10=no endcode, 11=wrong jump outside of tree). NOTE(review):
+                    huffmanDecodeSymbol already fails when *bp == inbitlength, a case that is reported as 11 here*/
+                    error = (*bp) > inbitlength ? 10 : 11;
+                }
+                else error = 16; /*unexisting code, this can never happen*/
+                break;
+            }
+        }
+        if (error) break;
+
+        if (bitlen_ll[256] == 0) ERROR_BREAK(64); /*the length of the end code 256 must be larger than 0*/
+
+        /*now we've finally got HLIT and HDIST, so generate the code trees, and the function is done*/
+        error = HuffmanTree_makeFromLengths(tree_ll, bitlen_ll, NUM_DEFLATE_CODE_SYMBOLS, 15);
+        if (error) break;
+        error = HuffmanTree_makeFromLengths(tree_d, bitlen_d, NUM_DISTANCE_SYMBOLS, 15);
+
+        break; /*end of error-while*/
+    }
+
+    lodepng_free(bitlen_cl);
+    lodepng_free(bitlen_ll);
+    lodepng_free(bitlen_d);
+    HuffmanTree_cleanup(&tree_cl);
+
+    return error;
+}
+
+/*inflate a block with dynamic or fixed Huffman tree (btype 2 or 1), writing the decoded bytes to out at *pos and advancing *bp and *pos; returns 0 or an error code*/
+static unsigned inflateHuffmanBlock(ucvector* out, const unsigned char* in, size_t* bp,
+    size_t* pos, size_t inlength, unsigned btype)
+{
+    unsigned error = 0;
+    HuffmanTree tree_ll; /*the huffman tree for literal and length codes*/
+    HuffmanTree tree_d; /*the huffman tree for distance codes*/
+    size_t inbitlength = inlength * 8;
+
+    HuffmanTree_init(&tree_ll);
+    HuffmanTree_init(&tree_d);
+
+    if (btype == 1) getTreeInflateFixed(&tree_ll, &tree_d); /*NOTE(review): returns void, so a failure in there is not seen here*/
+    else if (btype == 2) error = getTreeInflateDynamic(&tree_ll, &tree_d, in, bp, inlength);
+
+    while (!error) /*decode all symbols until end reached, breaks at end code*/
+    {
+        /*code_ll is literal, length or end code*/
+        unsigned code_ll = huffmanDecodeSymbol(in, bp, &tree_ll, inbitlength);
+        if (code_ll <= 255) /*literal symbol*/
+        {
+            /*ucvector_push_back would do the same, but for some reason the two lines below run 10% faster*/
+            if (!ucvector_resize(out, (*pos) + 1)) ERROR_BREAK(83 /*alloc fail*/);
+            out->data[*pos] = (unsigned char)code_ll;
+            ++(*pos);
+        }
+        else if (code_ll >= FIRST_LENGTH_CODE_INDEX && code_ll <= LAST_LENGTH_CODE_INDEX) /*length code*/
+        {
+            unsigned code_d, distance;
+            unsigned numextrabits_l, numextrabits_d; /*extra bits for length and distance*/
+            size_t start, forward, backward, length;
+
+            /*part 1: get length base*/
+            length = LENGTHBASE[code_ll - FIRST_LENGTH_CODE_INDEX];
+
+            /*part 2: get extra bits and add the value of that to length*/
+            numextrabits_l = LENGTHEXTRA[code_ll - FIRST_LENGTH_CODE_INDEX];
+            if ((*bp + numextrabits_l) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+            length += readBitsFromStream(bp, in, numextrabits_l);
+
+            /*part 3: get distance code*/
+            code_d = huffmanDecodeSymbol(in, bp, &tree_d, inbitlength);
+            if (code_d > 29)
+            {
+                if (code_d == (unsigned)(-1)) /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+                {
+                    /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+                    (10=no endcode, 11=wrong jump outside of tree)*/
+                    error = (*bp) > inlength * 8 ? 10 : 11;
+                }
+                else error = 18; /*error: invalid distance code (30-31 are never used)*/
+                break;
+            }
+            distance = DISTANCEBASE[code_d];
+
+            /*part 4: get extra bits from distance*/
+            numextrabits_d = DISTANCEEXTRA[code_d];
+            if ((*bp + numextrabits_d) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+            distance += readBitsFromStream(bp, in, numextrabits_d);
+
+            /*part 5: fill in all the out[n] values based on the length and dist*/
+            start = (*pos);
+            if (distance > start) ERROR_BREAK(52); /*too long backward distance*/
+            backward = start - distance;
+
+            if (!ucvector_resize(out, (*pos) + length)) ERROR_BREAK(83 /*alloc fail*/);
+            if (distance < length) { /*source and destination overlap: copy byte by byte so that copied bytes get repeated*/
+                for (forward = 0; forward < length; ++forward)
+                {
+                    out->data[(*pos)++] = out->data[backward++];
+                }
+            }
+            else { /*the two ranges do not overlap, so a single memcpy is enough*/
+                memcpy(out->data + *pos, out->data + backward, length);
+                *pos += length;
+            }
+        }
+        else if (code_ll == 256)
+        {
+            break; /*end code, break the loop*/
+        }
+        else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+        {
+            /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+            (10=no endcode, 11=wrong jump outside of tree)*/
+            error = ((*bp) > inlength * 8) ? 10 : 11;
+            break;
+        }
+    }
+
+    HuffmanTree_cleanup(&tree_ll);
+    HuffmanTree_cleanup(&tree_d);
+
+    return error;
+}
+
+static unsigned inflateNoCompression(ucvector* out, const unsigned char* in, size_t* bp, size_t* pos, size_t inlength)
+{
+    /*Copies the data of one non compressed block from in to out at *pos, advancing *bp and *pos.
+    Returns 0, or 52 (header does not fit), 21 (NLEN mismatch), 83 (alloc fail) or 23 (data past the input).*/
+    size_t bytepos;
+    unsigned len, nlen;
+
+    /*go to first boundary of byte; *bp keeps this rounded value when an error is returned below*/
+    *bp = (*bp + 7) & ~(size_t)7;
+    bytepos = *bp >> 3;
+
+    /*read LEN (2 bytes) and NLEN (2 bytes), each with its low byte first*/
+    if (bytepos + 4 >= inlength) return 52; /*error, bit pointer will jump past memory*/
+    len = in[bytepos] + 256u * in[bytepos + 1];
+    nlen = in[bytepos + 2] + 256u * in[bytepos + 3];
+    bytepos += 4;
+    if (len + nlen != 65535) return 21; /*error: NLEN is not one's complement of LEN*/
+
+    /*out is grown before the input range is checked, so it stays resized when error 23 is returned*/
+    if (!ucvector_resize(out, (*pos) + len)) return 83; /*alloc fail*/
+    if (bytepos + len > inlength) return 23; /*error: reading outside of in buffer*/
+
+    /*read the literal data: the len bytes are now stored in the out buffer*/
+    for (; len != 0; --len) out->data[(*pos)++] = in[bytepos++];
+    *bp = bytepos * 8;
+    return 0;
+}
+
+static unsigned lodepng_inflatev(ucvector* out,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    /*Inflates the deflate blocks of in one after another into out. Returns 0 or the first error code.*/
+    size_t bitpos = 0; /*bit pointer in "in": current byte is bitpos >> 3, current bit is bitpos & 0x7 (lsb first)*/
+    size_t outpos = 0; /*byte position in the out buffer*/
+    unsigned last = 0; /*BFINAL bit of the block that was read most recently*/
+    unsigned error = 0;
+
+    (void)settings;
+
+    do
+    {
+        unsigned type;
+        /*block header: 1 bit BFINAL followed by the 2 bits of BTYPE, low bit first*/
+        if (bitpos + 2 >= insize * 8) return 52; /*error, bit pointer will jump past memory*/
+        last = readBitFromStream(&bitpos, in);
+        type = readBitFromStream(&bitpos, in);
+        type += 2u * readBitFromStream(&bitpos, in);
+
+        if (type == 3) return 20; /*error: invalid BTYPE*/
+        if (type == 0) error = inflateNoCompression(out, in, &bitpos, &outpos, insize); /*no compression*/
+        else error = inflateHuffmanBlock(out, in, &bitpos, &outpos, insize, type); /*compression, BTYPE 01 or 10*/
+        /*an error ends the loop at once; otherwise it ends after the block that had BFINAL set*/
+    } while (!error && !last);
+
+    return error;
+}
+
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    ucvector buffer; /*wraps the caller's buffer; its data and size are handed back below, also after an error*/
+    unsigned result;
+    ucvector_init_buffer(&buffer, *out, *outsize);
+    result = lodepng_inflatev(&buffer, in, insize, settings);
+    *outsize = buffer.size;
+    *out = buffer.data;
+    return result;
+}
+
+static unsigned inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings)
+{
+    /*a custom_inflate callback in the settings takes the place of lodepng_inflate;
+    both are called with the same arguments and their result is returned unchanged*/
+    if (!settings->custom_inflate)
+    {
+        return lodepng_inflate(out, outsize, in, insize, settings);
+    }
+
+    return settings->custom_inflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflator (Compressor)                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static const size_t MAX_SUPPORTED_DEFLATE_LENGTH = 258; /*upper bound on the match lengths that encodeLZ77 searches for*/
+
+/*passes code on to addBitsToStreamReversed for the compressed stream; bitlen is the size in bits of the code*/
+static void addHuffmanSymbol(size_t* bp, ucvector* compressed, unsigned code, unsigned bitlen)
+{
+    addBitsToStreamReversed(bp, compressed, code, bitlen);
+}
+
+/*search the index in the array that has the largest value smaller than or equal to the given value,
+given array must be sorted (if even the first element is larger than the value, index 0 is returned)*/
+static size_t searchCodeIndex(const unsigned* array, size_t array_size, size_t value)
+{
+    /*binary search; index 0 is never probed, [first, limit) is the range that may hold the first element >= value*/
+    size_t first = 1;
+    size_t limit = array_size;
+
+    while (first < limit) {
+        const size_t probe = (first + limit - 1) >> 1;
+        if (array[probe] < value) first = probe + 1;
+        else limit = probe;
+    }
+    if (first >= array_size || array[first] > value) --first;
+    return first;
+}
+
+static void addLengthDistance(uivector* values, size_t length, size_t distance)
+{
+    /*appends one length/distance pair to values as four entries; the values are those used by deflate:
+    0-255: literal bytes
+    256: end
+    257-285: length/distance pair (length code, followed by extra length bits, distance code, extra distance bits)
+    286-287: invalid*/
+    const unsigned lcode = (unsigned)searchCodeIndex(LENGTHBASE, 29, length);
+    const unsigned dcode = (unsigned)searchCodeIndex(DISTANCEBASE, 30, distance);
+    const unsigned lextra = (unsigned)(length - LENGTHBASE[lcode]); /*what is left above the base of the code*/
+    const unsigned dextra = (unsigned)(distance - DISTANCEBASE[dcode]);
+
+    /*the results of the four push_back calls are not checked*/
+    uivector_push_back(values, lcode + FIRST_LENGTH_CODE_INDEX);
+    uivector_push_back(values, lextra);
+    uivector_push_back(values, dcode);
+    uivector_push_back(values, dextra);
+}
+
+/*3 bytes of data get encoded into two bytes (see getHash). The hash cannot use more than 3
+bytes as input because 3 is the minimum match length for deflate*/
+static const unsigned HASH_NUM_VALUES = 65536; /*number of entries that hash_init allocates for Hash.head*/
+static const unsigned HASH_BIT_MASK = 65535; /*HASH_NUM_VALUES - 1, but C90 does not like that as initializer*/
+
+typedef struct Hash
+{
+    int* head; /*hash value to head circular pos - can be outdated if went around window; -1 when unused*/
+    /*circular pos to prev circular pos; an entry equal to its own index indicates uninitialized*/
+    unsigned short* chain;
+    int* val; /*circular pos to hash value; -1 when unused*/
+
+    /*TODO: do this not only for zeros but for any repeated byte. However for PNG
+    it's always going to be the zeros that dominate, so not important for PNG*/
+    int* headz; /*similar to head, but for chainz*/
+    unsigned short* chainz; /*those with same amount of zeros*/
+    unsigned short* zeros; /*length of zeros streak, used as a second hash chain*/
+} Hash;
+
+static unsigned hash_init(Hash* hash, unsigned windowsize)
+{
+    /*Allocates the six tables of hash and marks all their entries as unused. Returns 0, or 83 on alloc fail.*/
+    unsigned i;
+    hash->head = (int*)lodepng_malloc(sizeof(int) * HASH_NUM_VALUES);
+    hash->val = (int*)lodepng_malloc(sizeof(int) * windowsize);
+    hash->chain = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+    hash->zeros = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+    hash->headz = (int*)lodepng_malloc(sizeof(int) * (MAX_SUPPORTED_DEFLATE_LENGTH + 1));
+    hash->chainz = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+    if (!hash->head || !hash->chain || !hash->val || !hash->headz || !hash->chainz || !hash->zeros)
+    {
+        /*alloc fail: free what was allocated and clear the pointers, so nothing leaks and a later hash_cleanup is harmless*/
+        lodepng_free(hash->head); lodepng_free(hash->val); lodepng_free(hash->chain);
+        lodepng_free(hash->zeros); lodepng_free(hash->headz); lodepng_free(hash->chainz);
+        hash->head = hash->val = hash->headz = 0; hash->chain = hash->zeros = hash->chainz = 0;
+        return 83;
+    }
+    /*initialize hash table*/
+    for (i = 0; i != HASH_NUM_VALUES; ++i) hash->head[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->val[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->chain[i] = i; /*same value as index indicates uninitialized*/
+    for (i = 0; i <= MAX_SUPPORTED_DEFLATE_LENGTH; ++i) hash->headz[i] = -1;
+    for (i = 0; i != windowsize; ++i) hash->chainz[i] = i; /*same value as index indicates uninitialized*/
+    return 0;
+}
+
+static void hash_cleanup(Hash* hash)
+{
+    /*free the six tables in the order in which struct Hash declares them; the pointers are not reset*/
+    lodepng_free(hash->head);
+    lodepng_free(hash->chain);
+    lodepng_free(hash->val);
+    lodepng_free(hash->headz);
+    lodepng_free(hash->chainz);
+    lodepng_free(hash->zeros);
+}
+
+
+
+static unsigned getHash(const unsigned char* data, size_t size, size_t pos)
+{
+    /*Hashes the 3 bytes at data[pos], or the 1 or 2 bytes left at the end, into a value masked by HASH_BIT_MASK.
+    A simple shift and xor hash is used. Since the data of PNGs is dominated
+    by zeroes due to the filters, a better hash does not have a significant
+    effect on speed in traversing the chain, and causes more time spend on
+    calculating the hash.*/
+    unsigned hash = 0;
+    size_t k;
+
+    if (pos + 2 < size)
+    {
+        hash = (unsigned)(data[pos + 0] << 0u) ^ (unsigned)(data[pos + 1] << 4u) ^ (unsigned)(data[pos + 2] << 8u);
+        return hash & HASH_BIT_MASK;
+    }
+    if (pos >= size) return 0; /*nothing left to hash*/
+
+    /*fewer than 3 bytes remain: only those are folded in, each shifted by a whole number of bytes*/
+    for (k = pos; k != size; ++k) hash ^= (unsigned)(data[k] << ((k - pos) * 8u));
+    return hash & HASH_BIT_MASK;
+}
+
+static unsigned countZeros(const unsigned char* data, size_t size, size_t pos)
+{
+    /*counts the zero bytes that start at data[pos], looking at no more than MAX_SUPPORTED_DEFLATE_LENGTH
+    bytes and not beyond data[size - 1]*/
+    size_t stop = pos + MAX_SUPPORTED_DEFLATE_LENGTH;
+    size_t cur = pos;
+    if (stop > size) stop = size;
+    while (cur != stop && data[cur] == 0) ++cur;
+    return (unsigned)(cur - pos); /*max value is MAX_SUPPORTED_DEFLATE_LENGTH, so it fits in an unsigned*/
+}
+
+/*makes circular position wpos (wpos = pos & (windowsize - 1)) the newest entry for hashval and for numzeros*/
+static void updateHashChain(Hash* hash, size_t wpos, unsigned hashval, unsigned short numzeros)
+{
+    const int prevhead = hash->head[hashval], prevheadz = hash->headz[numzeros];
+    hash->val[wpos] = (int)hashval;
+    hash->zeros[wpos] = numzeros;
+    if (prevhead != -1) hash->chain[wpos] = prevhead; /*without a previous head the chain entry stays as it was*/
+    if (prevheadz != -1) hash->chainz[wpos] = prevheadz;
+    hash->head[hashval] = wpos;
+    hash->headz[numzeros] = wpos;
+}
+
+/*
+LZ77-encode the data from in[inpos] up to in[insize - 1] and append the result to out. Return value is error code.
+The input are raw bytes, the output is in the form of unsigned integers with codes representing for example
+literal bytes, or length/distance pairs (four values per pair, see addLengthDistance).
+It uses a hash table technique to let it encode faster. When doing LZ77 encoding, a sliding window (of
+windowsize, which must be a power of two of at most 32768) is used, and all past bytes in that window can be
+used as the "dictionary". A brute force search through all possible distances would be slow, and this hash
+technique is one out of several ways to speed this up.
+*/
+static unsigned encodeLZ77(uivector* out, Hash* hash,
+    const unsigned char* in, size_t inpos, size_t insize, unsigned windowsize,
+    unsigned minmatch, unsigned nicematch, unsigned lazymatching)
+{
+    size_t pos;
+    unsigned i, error = 0;
+    /*for large window lengths, assume the user wants no compression loss. Otherwise, max hash chain length speedup.*/
+    unsigned maxchainlength = windowsize >= 8192 ? windowsize : windowsize / 8;
+    unsigned maxlazymatch = windowsize >= 8192 ? MAX_SUPPORTED_DEFLATE_LENGTH : 64;
+
+    unsigned usezeros = 1; /*not sure if setting it to false for windowsize < 8192 is better or worse*/
+    unsigned numzeros = 0;
+
+    unsigned offset; /*the offset represents the distance in LZ77 terminology*/
+    unsigned length;
+    unsigned lazy = 0;
+    unsigned lazylength = 0, lazyoffset = 0;
+    unsigned hashval;
+    unsigned current_offset, current_length;
+    unsigned prev_offset;
+    const unsigned char *lastptr, *foreptr, *backptr;
+    unsigned hashpos;
+
+    if (windowsize == 0 || windowsize > 32768) return 60; /*error: windowsize smaller/larger than allowed*/
+    if ((windowsize & (windowsize - 1)) != 0) return 90; /*error: must be power of two*/
+
+    if (nicematch > MAX_SUPPORTED_DEFLATE_LENGTH) nicematch = MAX_SUPPORTED_DEFLATE_LENGTH;
+
+    for (pos = inpos; pos < insize; ++pos)
+    {
+        size_t wpos = pos & (windowsize - 1); /*position for in 'circular' hash buffers*/
+        unsigned chainlength = 0;
+
+        hashval = getHash(in, insize, pos);
+
+        if (usezeros && hashval == 0)
+        {
+            if (numzeros == 0) numzeros = countZeros(in, insize, pos);
+            else if (pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+        }
+        else
+        {
+            numzeros = 0;
+        }
+
+        updateHashChain(hash, wpos, hashval, numzeros);
+
+        /*the length and offset found for the current position*/
+        length = 0;
+        offset = 0;
+
+        hashpos = hash->chain[wpos];
+
+        lastptr = &in[insize < pos + MAX_SUPPORTED_DEFLATE_LENGTH ? insize : pos + MAX_SUPPORTED_DEFLATE_LENGTH];
+
+        /*search for the longest string*/
+        prev_offset = 0;
+        for (;;)
+        {
+            if (chainlength++ >= maxchainlength) break;
+            current_offset = hashpos <= wpos ? wpos - hashpos : wpos - hashpos + windowsize;
+
+            if (current_offset < prev_offset) break; /*stop when went completely around the circular buffer*/
+            prev_offset = current_offset;
+            if (current_offset > 0)
+            {
+                /*test the next characters*/
+                foreptr = &in[pos];
+                backptr = &in[pos - current_offset];
+
+                /*common case in PNGs is lots of zeros. Quickly skip over them as a speedup*/
+                if (numzeros >= 3)
+                {
+                    unsigned skip = hash->zeros[hashpos];
+                    if (skip > numzeros) skip = numzeros;
+                    backptr += skip;
+                    foreptr += skip;
+                }
+
+                while (foreptr != lastptr && *backptr == *foreptr) /*maximum supported length by deflate is max length*/
+                {
+                    ++backptr;
+                    ++foreptr;
+                }
+                current_length = (unsigned)(foreptr - &in[pos]);
+
+                if (current_length > length)
+                {
+                    length = current_length; /*the longest length*/
+                    offset = current_offset; /*the offset that is related to this longest length*/
+                    /*jump out once a length of max length is found (speed gain). This also jumps
+                    out if length is MAX_SUPPORTED_DEFLATE_LENGTH*/
+                    if (current_length >= nicematch) break;
+                }
+            }
+
+            if (hashpos == hash->chain[hashpos]) break;
+
+            if (numzeros >= 3 && length > numzeros)
+            {
+                hashpos = hash->chainz[hashpos];
+                if (hash->zeros[hashpos] != numzeros) break;
+            }
+            else
+            {
+                hashpos = hash->chain[hashpos];
+                /*outdated hash value, happens if particular value was not encountered in whole last window*/
+                if (hash->val[hashpos] != (int)hashval) break;
+            }
+        }
+
+        if (lazymatching)
+        {
+            if (!lazy && length >= 3 && length <= maxlazymatch && length < MAX_SUPPORTED_DEFLATE_LENGTH)
+            {
+                lazy = 1;
+                lazylength = length;
+                lazyoffset = offset;
+                continue; /*try the next byte*/
+            }
+            if (lazy)
+            {
+                lazy = 0;
+                if (pos == 0) ERROR_BREAK(81);
+                if (length > lazylength + 1)
+                {
+                    /*push the previous character as literal*/
+                    if (!uivector_push_back(out, in[pos - 1])) ERROR_BREAK(83 /*alloc fail*/);
+                }
+                else
+                {
+                    length = lazylength;
+                    offset = lazyoffset;
+                    hash->head[hashval] = -1; /*the same hashchain update will be done, this ensures no wrong alteration*/
+                    hash->headz[numzeros] = -1; /*idem*/
+                    --pos;
+                }
+            }
+        }
+        if (length >= 3 && offset > windowsize) ERROR_BREAK(86 /*too big (or overflown negative) offset*/);
+
+        /*encode it as length/distance pair or literal value*/
+        if (length < 3) /*only lengths of 3 or higher are supported as length/distance pair*/
+        {
+            if (!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        else if (length < minmatch || (length == 3 && offset > 4096))
+        {
+            /*compensate for the fact that longer offsets have more extra bits, a
+            length of only 3 may be not worth it then*/
+            if (!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        else
+        {
+            addLengthDistance(out, length, offset);
+            for (i = 1; i < length; ++i)
+            {
+                ++pos;
+                wpos = pos & (windowsize - 1);
+                hashval = getHash(in, insize, pos);
+                if (usezeros && hashval == 0)
+                {
+                    if (numzeros == 0) numzeros = countZeros(in, insize, pos);
+                    else if (pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+                }
+                else
+                {
+                    numzeros = 0;
+                }
+                updateHashChain(hash, wpos, hashval, numzeros);
+            }
+        }
+    } /*end of the loop through each character of input*/
+
+    return error;
+}
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+static unsigned deflateNoCompression(ucvector* out, const unsigned char* data, size_t datasize)
+{
+    /*Appends data to out as non compressed deflate blocks of at most 65535 bytes each; always returns 0.
+    Block layout: 1 bit BFINAL,2 bits BTYPE,(5 bits): it jumps to start of next byte,
+    2 bytes LEN, 2 bytes NLEN, LEN bytes literal DATA*/
+    size_t i, j, numdeflateblocks = (datasize + 65534) / 65535;
+    size_t datapos = 0; /*size_t like datasize, so the read position cannot wrap around on very large inputs*/
+    for (i = 0; i != numdeflateblocks; ++i)
+    {
+        unsigned BFINAL, BTYPE, LEN, NLEN;
+        unsigned char firstbyte;
+
+        BFINAL = (i == numdeflateblocks - 1);
+        BTYPE = 0;
+
+        firstbyte = (unsigned char)(BFINAL + ((BTYPE & 1) << 1) + ((BTYPE & 2) << 1));
+        ucvector_push_back(out, firstbyte);
+
+        LEN = 65535;
+        if (datasize - datapos < 65535) LEN = (unsigned)(datasize - datapos); /*the last block may be shorter*/
+        NLEN = 65535 - LEN; /*one's complement of LEN within 16 bits*/
+
+        ucvector_push_back(out, (unsigned char)(LEN & 255));
+        ucvector_push_back(out, (unsigned char)(LEN >> 8));
+        ucvector_push_back(out, (unsigned char)(NLEN & 255));
+        ucvector_push_back(out, (unsigned char)(NLEN >> 8));
+
+        /*Decompressed data*/
+        for (j = 0; j < 65535 && datapos < datasize; ++j)
+        {
+            ucvector_push_back(out, data[datapos++]);
+        }
+    }
+
+    return 0;
+}
+
+/*
+Writes an LZ77 symbol stream (lit, len and dist codes) to the compressed stream as huffman codes.
+tree_ll: the tree for lit and len codes.
+tree_d: the tree for distance codes.
+*/
+static void writeLZ77data(size_t* bp, ucvector* out, const uivector* lz77_encoded,
+    const HuffmanTree* tree_ll, const HuffmanTree* tree_d)
+{
+    size_t pos = 0;
+    while (pos != lz77_encoded->size)
+    {
+        unsigned symbol = lz77_encoded->data[pos++];
+
+        addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_ll, symbol), HuffmanTree_getLength(tree_ll, symbol));
+        if (symbol <= 256) continue; /*a literal or the end code occupies a single entry*/
+
+        {
+            /*a length code is followed by three entries: length extra bits, distance code, distance extra bits*/
+            unsigned len_extra_value = lz77_encoded->data[pos];
+            unsigned dist_symbol = lz77_encoded->data[pos + 1];
+            unsigned dist_extra_value = lz77_encoded->data[pos + 2];
+            unsigned len_extra_count = LENGTHEXTRA[symbol - FIRST_LENGTH_CODE_INDEX];
+            unsigned dist_extra_count = DISTANCEEXTRA[dist_symbol];
+
+            addBitsToStream(bp, out, len_extra_value, len_extra_count);
+            addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_d, dist_symbol),
+                HuffmanTree_getLength(tree_d, dist_symbol));
+            addBitsToStream(bp, out, dist_extra_value, dist_extra_count);
+            pos += 3;
+        }
+    }
+}
+
+/*Deflate for a block of type "dynamic", that is, with huffman trees built from this block's own symbol frequencies. Returns 0 or an error code.*/
+static unsigned deflateDynamic(ucvector* out, size_t* bp, Hash* hash,
+    const unsigned char* data, size_t datapos, size_t dataend,
+    const LodePNGCompressSettings* settings, unsigned final)
+{
+    unsigned error = 0;
+
+    /*
+    A block is compressed as follows: The PNG data is lz77 encoded, resulting in
+    literal bytes and length/distance pairs. This is then huffman compressed with
+    two huffman trees. One huffman tree is used for the lit and len values ("ll"),
+    another huffman tree is used for the dist values ("d"). These two trees are
+    stored using their code lengths, and to compress even more these code lengths
+    are also run-length encoded and huffman compressed. This gives a huffman tree
+    of code lengths "cl". The code lengths used to describe this third tree are
+    the code length code lengths ("clcl").
+    */
+
+    /*The lz77 encoded data, represented with integers since there will also be length and distance codes in it*/
+    uivector lz77_encoded;
+    HuffmanTree tree_ll; /*tree for lit,len values*/
+    HuffmanTree tree_d; /*tree for distance codes*/
+    HuffmanTree tree_cl; /*tree for encoding the code lengths representing tree_ll and tree_d*/
+    uivector frequencies_ll; /*frequency of lit,len codes*/
+    uivector frequencies_d; /*frequency of dist codes*/
+    uivector frequencies_cl; /*frequency of code length codes*/
+    uivector bitlen_lld; /*lit,len,dist code lengths (in bits), literally (without repeat codes).*/
+    uivector bitlen_lld_e; /*bitlen_lld encoded with repeat codes (this is a rudimentary run length compression)*/
+                           /*bitlen_cl is the code length code lengths ("clcl"). The bit lengths of codes to represent tree_cl
+                           (these are written as is in the file, it would be crazy to compress these using yet another huffman
+                           tree that needs to be represented by yet another set of code lengths)*/
+    uivector bitlen_cl;
+    size_t datasize = dataend - datapos;
+
+    /*
+    Due to the huffman compression of huffman tree representations ("two levels"), there are some analogies:
+    bitlen_lld is to tree_cl what data is to tree_ll and tree_d.
+    bitlen_lld_e is to bitlen_lld what lz77_encoded is to data.
+    bitlen_cl is to bitlen_lld_e what bitlen_lld is to lz77_encoded.
+    */
+
+    unsigned BFINAL = final;
+    size_t numcodes_ll, numcodes_d, i;
+    unsigned HLIT, HDIST, HCLEN;
+
+    uivector_init(&lz77_encoded);
+    HuffmanTree_init(&tree_ll);
+    HuffmanTree_init(&tree_d);
+    HuffmanTree_init(&tree_cl);
+    uivector_init(&frequencies_ll);
+    uivector_init(&frequencies_d);
+    uivector_init(&frequencies_cl);
+    uivector_init(&bitlen_lld);
+    uivector_init(&bitlen_lld_e);
+    uivector_init(&bitlen_cl);
+
+    /*This while loop never loops due to a break at the end, it is here to
+    allow breaking out of it to the cleanup phase on error conditions.*/
+    while (!error)
+    {
+        if (settings->use_lz77)
+        {
+            error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+                settings->minmatch, settings->nicematch, settings->lazymatching);
+            if (error) break;
+        }
+        else
+        {
+            if (!uivector_resize(&lz77_encoded, datasize)) ERROR_BREAK(83 /*alloc fail*/);
+            for (i = datapos; i < dataend; ++i) lz77_encoded.data[i - datapos] = data[i]; /*no LZ77, but still will be Huffman compressed*/
+        }
+
+        if (!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83 /*alloc fail*/);
+        if (!uivector_resizev(&frequencies_d, 30, 0)) ERROR_BREAK(83 /*alloc fail*/);
+
+        /*Count the frequencies of lit, len and dist codes*/
+        for (i = 0; i != lz77_encoded.size; ++i)
+        {
+            unsigned symbol = lz77_encoded.data[i];
+            ++frequencies_ll.data[symbol];
+            if (symbol > 256)
+            {
+                unsigned dist = lz77_encoded.data[i + 2];
+                ++frequencies_d.data[dist];
+                i += 3;
+            }
+        }
+        frequencies_ll.data[256] = 1; /*there will be exactly 1 end code, at the end of the block*/
+
+                                      /*Make both huffman trees, one for the lit and len codes, one for the dist codes*/
+        error = HuffmanTree_makeFromFrequencies(&tree_ll, frequencies_ll.data, 257, frequencies_ll.size, 15);
+        if (error) break;
+        /*2, not 1, is chosen for mincodes: some buggy PNG decoders require at least 2 symbols in the dist tree*/
+        error = HuffmanTree_makeFromFrequencies(&tree_d, frequencies_d.data, 2, frequencies_d.size, 15);
+        if (error) break;
+
+        numcodes_ll = tree_ll.numcodes; if (numcodes_ll > 286) numcodes_ll = 286;
+        numcodes_d = tree_d.numcodes; if (numcodes_d > 30) numcodes_d = 30;
+        /*store the code lengths of both generated trees in bitlen_lld*/
+        for (i = 0; i != numcodes_ll; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_ll, (unsigned)i));
+        for (i = 0; i != numcodes_d; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_d, (unsigned)i));
+
+        /*run-length compress bitlen_lld into bitlen_lld_e by using repeat codes 16 (copy length 3-6 times),
+        17 (3-10 zeroes), 18 (11-138 zeroes)*/
+        for (i = 0; i != (unsigned)bitlen_lld.size; ++i)
+        {
+            unsigned j = 0; /*amount of repetitions*/
+            while (i + j + 1 < (unsigned)bitlen_lld.size && bitlen_lld.data[i + j + 1] == bitlen_lld.data[i]) ++j;
+
+            if (bitlen_lld.data[i] == 0 && j >= 2) /*repeat code for zeroes*/
+            {
+                ++j; /*include the first zero*/
+                if (j <= 10) /*repeat code 17 supports max 10 zeroes*/
+                {
+                    uivector_push_back(&bitlen_lld_e, 17);
+                    uivector_push_back(&bitlen_lld_e, j - 3);
+                }
+                else /*repeat code 18 supports max 138 zeroes*/
+                {
+                    if (j > 138) j = 138;
+                    uivector_push_back(&bitlen_lld_e, 18);
+                    uivector_push_back(&bitlen_lld_e, j - 11);
+                }
+                i += (j - 1);
+            }
+            else if (j >= 3) /*repeat code for value other than zero*/
+            {
+                size_t k;
+                unsigned num = j / 6, rest = j % 6;
+                uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+                for (k = 0; k < num; ++k)
+                {
+                    uivector_push_back(&bitlen_lld_e, 16);
+                    uivector_push_back(&bitlen_lld_e, 6 - 3);
+                }
+                if (rest >= 3)
+                {
+                    uivector_push_back(&bitlen_lld_e, 16);
+                    uivector_push_back(&bitlen_lld_e, rest - 3);
+                }
+                else j -= rest;
+                i += j;
+            }
+            else /*too short to benefit from repeat code*/
+            {
+                uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+            }
+        }
+
+        /*generate tree_cl, the huffmantree of huffmantrees*/
+
+        if (!uivector_resizev(&frequencies_cl, NUM_CODE_LENGTH_CODES, 0)) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != bitlen_lld_e.size; ++i)
+        {
+            ++frequencies_cl.data[bitlen_lld_e.data[i]];
+            /*after a repeat code come the bits that specify the number of repetitions,
+            those don't need to be in the frequencies_cl calculation*/
+            if (bitlen_lld_e.data[i] >= 16) ++i;
+        }
+
+        error = HuffmanTree_makeFromFrequencies(&tree_cl, frequencies_cl.data,
+            frequencies_cl.size, frequencies_cl.size, 7);
+        if (error) break;
+
+        if (!uivector_resize(&bitlen_cl, tree_cl.numcodes)) ERROR_BREAK(83 /*alloc fail*/);
+        for (i = 0; i != tree_cl.numcodes; ++i)
+        {
+            /*lengths of code length tree is in the order as specified by deflate*/
+            bitlen_cl.data[i] = HuffmanTree_getLength(&tree_cl, CLCL_ORDER[i]);
+        }
+        while (bitlen_cl.data[bitlen_cl.size - 1] == 0 && bitlen_cl.size > 4)
+        {
+            /*remove zeros at the end, but minimum size must be 4*/
+            if (!uivector_resize(&bitlen_cl, bitlen_cl.size - 1)) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        if (error) break;
+
+        /*
+        Write everything into the output
+
+        After the BFINAL and BTYPE, the dynamic block consists out of the following:
+        - 5 bits HLIT, 5 bits HDIST, 4 bits HCLEN
+        - (HCLEN+4)*3 bits code lengths of code length alphabet
+        - HLIT + 257 code lengths of lit/length alphabet (encoded using the code length
+        alphabet, + possible repetition codes 16, 17, 18)
+        - HDIST + 1 code lengths of distance alphabet (encoded using the code length
+        alphabet, + possible repetition codes 16, 17, 18)
+        - compressed data
+        - 256 (end code)
+        */
+
+        /*Write block type*/
+        addBitToStream(bp, out, BFINAL);
+        addBitToStream(bp, out, 0); /*first bit of BTYPE "dynamic"*/
+        addBitToStream(bp, out, 1); /*second bit of BTYPE "dynamic"*/
+
+                                    /*write the HLIT, HDIST and HCLEN values*/
+        HLIT = (unsigned)(numcodes_ll - 257);
+        HDIST = (unsigned)(numcodes_d - 1);
+        HCLEN = (unsigned)bitlen_cl.size - 4;
+        /*trim zeroes for HCLEN. HLIT and HDIST were already trimmed at tree creation*/
+        while (!bitlen_cl.data[HCLEN + 4 - 1] && HCLEN > 0) --HCLEN;
+        addBitsToStream(bp, out, HLIT, 5);
+        addBitsToStream(bp, out, HDIST, 5);
+        addBitsToStream(bp, out, HCLEN, 4);
+
+        /*write the code lengths of the code length alphabet*/
+        for (i = 0; i != HCLEN + 4; ++i) addBitsToStream(bp, out, bitlen_cl.data[i], 3);
+
+        /*write the lengths of the lit/len AND the dist alphabet*/
+        for (i = 0; i != bitlen_lld_e.size; ++i)
+        {
+            addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_cl, bitlen_lld_e.data[i]),
+                HuffmanTree_getLength(&tree_cl, bitlen_lld_e.data[i]));
+            /*extra bits of repeat codes*/
+            if (bitlen_lld_e.data[i] == 16) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 2);
+            else if (bitlen_lld_e.data[i] == 17) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 3);
+            else if (bitlen_lld_e.data[i] == 18) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 7);
+        }
+
+        /*write the compressed data symbols*/
+        writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+        /*error: the length of the end code 256 must be larger than 0*/
+        if (HuffmanTree_getLength(&tree_ll, 256) == 0) ERROR_BREAK(64);
+
+        /*write the end code*/
+        addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+
+        break; /*end of error-while*/
+    }
+
+    /*cleanup*/
+    uivector_cleanup(&lz77_encoded);
+    HuffmanTree_cleanup(&tree_ll);
+    HuffmanTree_cleanup(&tree_d);
+    HuffmanTree_cleanup(&tree_cl);
+    uivector_cleanup(&frequencies_ll);
+    uivector_cleanup(&frequencies_d);
+    uivector_cleanup(&frequencies_cl);
+    uivector_cleanup(&bitlen_lld_e);
+    uivector_cleanup(&bitlen_lld);
+    uivector_cleanup(&bitlen_cl);
+
+    return error;
+}
+
+static unsigned deflateFixed(ucvector* out, size_t* bp, Hash* hash,
+    const unsigned char* data,
+    size_t datapos, size_t dataend,
+    const LodePNGCompressSettings* settings, unsigned final)
+{
+    /*Writes data[datapos..dataend) as one block coded with the trees from generateFixedLitLenTree and
+    generateFixedDistanceTree. Returns 0, or the error reported by encodeLZ77.*/
+    HuffmanTree litlen_tree; /*literal values and length codes*/
+    HuffmanTree dist_tree; /*distance codes*/
+    unsigned last_block_bit = final;
+    unsigned error = 0;
+
+    HuffmanTree_init(&litlen_tree);
+    HuffmanTree_init(&dist_tree);
+    generateFixedLitLenTree(&litlen_tree);
+    generateFixedDistanceTree(&dist_tree);
+
+    /*block header: BFINAL, then the first and second bit of BTYPE*/
+    addBitToStream(bp, out, last_block_bit);
+    addBitToStream(bp, out, 1);
+    addBitToStream(bp, out, 0);
+
+    if (!settings->use_lz77)
+    {
+        /*no LZ77, but every byte is still Huffman compressed as a literal*/
+        size_t idx = datapos;
+        while (idx < dataend)
+        {
+            unsigned literal = data[idx++];
+            addHuffmanSymbol(bp, out, HuffmanTree_getCode(&litlen_tree, literal), HuffmanTree_getLength(&litlen_tree, literal));
+        }
+    }
+    else
+    {
+        uivector symbols;
+        uivector_init(&symbols);
+        error = encodeLZ77(&symbols, hash, data, datapos, dataend, settings->windowsize,
+            settings->minmatch, settings->nicematch, settings->lazymatching);
+        if (!error) writeLZ77data(bp, out, &symbols, &litlen_tree, &dist_tree);
+        uivector_cleanup(&symbols);
+    }
+    /*the END code is only added when nothing failed*/
+    if (!error) addHuffmanSymbol(bp, out, HuffmanTree_getCode(&litlen_tree, 256), HuffmanTree_getLength(&litlen_tree, 256));
+    HuffmanTree_cleanup(&litlen_tree);
+    HuffmanTree_cleanup(&dist_tree);
+    return error;
+}
+
+static unsigned lodepng_deflatev(ucvector* out, const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings)
+{
+    /*Deflates in[0..insize) into out as one or more blocks of type settings->btype. Returns 0 on success,
+    61 for a btype above 2, otherwise the error of hash_init or of the block encoder.*/
+    unsigned error = 0;
+    size_t i, blocksize, numdeflateblocks;
+    size_t bp = 0; /*the bit pointer*/
+    Hash hash;
+
+    if (settings->btype > 2) return 61;
+    else if (settings->btype == 0) return deflateNoCompression(out, in, insize);
+    else if (settings->btype == 1) blocksize = insize ? insize : 1; /*never 0: blocksize is a divisor below*/
+    else /*if(settings->btype == 2)*/
+    {
+        /*on PNGs, deflate blocks of 65-262k seem to give most dense encoding*/
+        blocksize = insize / 8 + 8;
+        if (blocksize < 65536) blocksize = 65536;
+        if (blocksize > 262144) blocksize = 262144;
+    }
+
+    numdeflateblocks = (insize + blocksize - 1) / blocksize;
+    if (numdeflateblocks == 0) numdeflateblocks = 1; /*empty input: still write one (empty) final block*/
+    error = hash_init(&hash, settings->windowsize);
+    if (error) return error;
+
+    for (i = 0; i != numdeflateblocks && !error; ++i)
+    {
+        unsigned final = (i == numdeflateblocks - 1);
+        size_t start = i * blocksize;
+        size_t end = start + blocksize;
+        if (end > insize) end = insize;
+
+        if (settings->btype == 1) error = deflateFixed(out, &bp, &hash, in, start, end, settings, final);
+        else if (settings->btype == 2) error = deflateDynamic(out, &bp, &hash, in, start, end, settings, final);
+    }
+
+    hash_cleanup(&hash);
+    return error;
+}
+
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings)
+{
+    ucvector buffer; /*vector initialised from the caller's *out / *outsize pair*/
+    unsigned result;
+    ucvector_init_buffer(&buffer, *out, *outsize);
+    result = lodepng_deflatev(&buffer, in, insize, settings);
+    *outsize = buffer.size; /*written back even when result is an error*/
+    *out = buffer.data;
+    return result;
+}
+
+static unsigned deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings)
+{
+    /*without a custom deflate function in the settings, use the built-in one*/
+    if (!settings->custom_deflate)
+    {
+        return lodepng_deflate(out, outsize, in, insize, settings);
+    }
+
+    /*a custom deflate function was supplied: delegate to it with the same arguments*/
+    return settings->custom_deflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Adler32                                                                  */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static unsigned update_adler32(unsigned adler, const unsigned char* data, unsigned len)
+{
+    /*Folds len bytes of data into the running Adler-32 value adler and returns the updated value.
+    low holds the lower 16 bits (the byte sum), high the upper 16 bits (the sum of the low values).*/
+    unsigned low = adler & 0xffff;
+    unsigned high = (adler >> 16) & 0xffff;
+    unsigned done = 0;
+    while (done != len)
+    {
+        /*at least 5550 sums can be done before the sums overflow, so the modulo is taken once
+        per run of at most 5550 bytes, saving a lot of modulo divisions*/
+        unsigned stop = (len - done > 5550) ? done + 5550 : len;
+        for (; done != stop; ++done)
+        {
+            low += data[done];
+            high += low;
+        }
+        low %= 65521;
+        high %= 65521;
+    }
+    return (high << 16) | low;
+}
+
+/*Adler-32 of the bytes data[0..len-1]: update_adler32 started from the value 1.*/
+static unsigned adler32(const unsigned char* data, unsigned len)
+{
+    return update_adler32(1u, data, len);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Zlib                                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    /*Checks the 2-byte zlib header, inflates the data after it and, unless settings->ignore_adler32 is set,
+    compares the Adler-32 of the output with the last 4 input bytes. Returns 0 or an error code.*/
+    unsigned error = 0;
+    unsigned CM, CINFO, FDICT;
+
+    if (insize < 2) return 53; /*error, size of zlib data too small*/
+    /*read information from zlib header*/
+    if ((in[0] * 256 + in[1]) % 31 != 0)
+    {
+        /*error: 256 * in[0] + in[1] must be a multiple of 31, the FCHECK value is supposed to be made that way*/
+        return 24;
+    }
+
+    CM = in[0] & 15;
+    CINFO = (in[0] >> 4) & 15;
+    /*FCHECK = in[1] & 31;*/ /*FCHECK is already tested above*/
+    FDICT = (in[1] >> 5) & 1;
+    /*FLEVEL = (in[1] >> 6) & 3;*/ /*FLEVEL is not used here*/
+
+    if (CM != 8 || CINFO > 7)
+    {
+        /*error: only compression method 8: inflate with sliding window of 32k is supported by the PNG spec*/
+        return 25;
+    }
+    if (FDICT != 0)
+    {
+        /*error: the specification of PNG says about the zlib stream:
+        "The additional flags shall not specify a preset dictionary."*/
+        return 26;
+    }
+
+    error = inflate(out, outsize, in + 2, insize - 2, settings);
+    if (error) return error;
+    if (!settings->ignore_adler32)
+    {
+        unsigned checksum = adler32(*out, (unsigned)(*outsize));
+        if (insize < 4) return 53; /*too small to end in a 4-byte checksum: in[insize - 4] would be out of bounds*/
+        if (checksum != lodepng_read32bitInt(&in[insize - 4])) return 58; /*error, adler checksum not correct, data must be corrupted*/
+    }
+    return 0; /*no error*/
+}
+
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    /*without a custom zlib function in the settings, use the built-in decoder*/
+    if (!settings->custom_zlib)
+    {
+        return lodepng_zlib_decompress(out, outsize, in, insize, settings);
+    }
+
+    /*a custom zlib function was supplied: delegate to it with the same arguments*/
+    return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    /*Produces a zlib stream (2 header bytes, deflate data, 4-byte Adler-32 of in). Initially, *out must be NULL
+    and outsize 0; if you just give some random *out that points to a non allocated buffer, this'll crash.*/
+    ucvector outv;
+    size_t i;
+    unsigned error;
+    unsigned char* deflatedata = 0;
+    size_t deflatesize = 0;
+
+    /*zlib data: 1 byte CMF (CM+CINFO), 1 byte FLG, deflate data, 4 byte ADLER32 checksum of the Decompressed data*/
+    unsigned CMF = 120; /*0b01111000: CM 8, CINFO 7. With CINFO 7, any window size up to 32768 can be used.*/
+    unsigned FLEVEL = 0;
+    unsigned FDICT = 0;
+    unsigned CMFFLG = 256 * CMF + FDICT * 32 + FLEVEL * 64;
+    unsigned FCHECK = 31 - CMFFLG % 31;
+    CMFFLG += FCHECK;
+
+    /*ucvector-controlled version of the output buffer, for dynamic array*/
+    ucvector_init_buffer(&outv, *out, *outsize);
+
+    ucvector_push_back(&outv, (unsigned char)(CMFFLG >> 8));
+    ucvector_push_back(&outv, (unsigned char)(CMFFLG & 255));
+
+    error = deflate(&deflatedata, &deflatesize, in, insize, settings);
+    if (!error)
+    {
+        unsigned ADLER32 = adler32(in, (unsigned)insize);
+        for (i = 0; i != deflatesize; ++i) ucvector_push_back(&outv, deflatedata[i]);
+        lodepng_add32bitInt(&outv, ADLER32);
+    }
+    /*freed on every path: lodepng_deflate hands its buffer back even when it reports an error*/
+    lodepng_free(deflatedata);
+
+    *out = outv.data;
+    *outsize = outv.size;
+
+    return error;
+}
+
+/* compress with the custom zlib function from the settings when there is one, else with the built-in one */
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    if (!settings->custom_zlib)
+    {
+        /*no custom function was set*/
+        return lodepng_zlib_compress(out, outsize, in, insize, settings);
+    }
+
+    /*delegate to the custom function with the same arguments*/
+    return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#else /*no LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGDecompressSettings* settings)
+{
+    /*built without LODEPNG_COMPILE_ZLIB: only a custom zlib function can do the work, 87 if there is none*/
+    return settings->custom_zlib ? settings->custom_zlib(out, outsize, in, insize, settings) : 87;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+    size_t insize, const LodePNGCompressSettings* settings)
+{
+    /*built without LODEPNG_COMPILE_ZLIB: only a custom zlib function can do the work, 87 if there is none*/
+    return settings->custom_zlib ? settings->custom_zlib(out, outsize, in, insize, settings) : 87;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*this is a good tradeoff between speed and compression ratio*/
+#define DEFAULT_WINDOWSIZE 2048
+
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings)
+{
+    /*no custom compression functions and no custom context by default*/
+    settings->custom_context = 0;
+    settings->custom_deflate = 0;
+    settings->custom_zlib = 0;
+    /*btype 2: compress with dynamic huffman tree (not in the mathematical sense, just not the predefined one)*/
+    settings->btype = 2;
+    settings->use_lz77 = 1;
+    settings->lazymatching = 1;
+    settings->minmatch = 3;
+    settings->nicematch = 128;
+    settings->windowsize = DEFAULT_WINDOWSIZE;
+}
+
+/*Constant default settings. NOTE(review): the initializer order presumably follows the field order of LodePNGCompressSettings - confirm against the header.*/
+const LodePNGCompressSettings lodepng_default_compress_settings = { 2, 1, DEFAULT_WINDOWSIZE, 3, 128, 1, 0, 0, 0 };
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings)
+{
+    /*defaults: no custom functions or context, and ignore_adler32 off so the Adler-32 check is performed*/
+    settings->custom_context = 0;
+    settings->custom_inflate = 0;
+    settings->custom_zlib = 0;
+    settings->ignore_adler32 = 0;
+}
+
+/*Constant default settings: every field is 0, the same values lodepng_decompress_settings_init assigns.*/
+const LodePNGDecompressSettings lodepng_default_decompress_settings = { 0, 0, 0, 0 };
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of Zlib related code. Begin of PNG related code.                 // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / CRC32                                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+
+#ifndef LODEPNG_NO_COMPILE_CRC
+/* 256-entry lookup table that lodepng_crc32 indexes with one byte value. CRC polynomial: 0xedb88320 */
+static unsigned lodepng_crc32_table[256] = {
+    0u, 1996959894u, 3993919788u, 2567524794u,  124634137u, 1886057615u, 3915621685u, 2657392035u,
+    249268274u, 2044508324u, 3772115230u, 2547177864u,  162941995u, 2125561021u, 3887607047u, 2428444049u,
+    498536548u, 1789927666u, 4089016648u, 2227061214u,  450548861u, 1843258603u, 4107580753u, 2211677639u,
+    325883990u, 1684777152u, 4251122042u, 2321926636u,  335633487u, 1661365465u, 4195302755u, 2366115317u,
+    997073096u, 1281953886u, 3579855332u, 2724688242u, 1006888145u, 1258607687u, 3524101629u, 2768942443u,
+    901097722u, 1119000684u, 3686517206u, 2898065728u,  853044451u, 1172266101u, 3705015759u, 2882616665u,
+    651767980u, 1373503546u, 3369554304u, 3218104598u,  565507253u, 1454621731u, 3485111705u, 3099436303u,
+    671266974u, 1594198024u, 3322730930u, 2970347812u,  795835527u, 1483230225u, 3244367275u, 3060149565u,
+    1994146192u,   31158534u, 2563907772u, 4023717930u, 1907459465u,  112637215u, 2680153253u, 3904427059u,
+    2013776290u,  251722036u, 2517215374u, 3775830040u, 2137656763u,  141376813u, 2439277719u, 3865271297u,
+    1802195444u,  476864866u, 2238001368u, 4066508878u, 1812370925u,  453092731u, 2181625025u, 4111451223u,
+    1706088902u,  314042704u, 2344532202u, 4240017532u, 1658658271u,  366619977u, 2362670323u, 4224994405u,
+    1303535960u,  984961486u, 2747007092u, 3569037538u, 1256170817u, 1037604311u, 2765210733u, 3554079995u,
+    1131014506u,  879679996u, 2909243462u, 3663771856u, 1141124467u,  855842277u, 2852801631u, 3708648649u,
+    1342533948u,  654459306u, 3188396048u, 3373015174u, 1466479909u,  544179635u, 3110523913u, 3462522015u,
+    1591671054u,  702138776u, 2966460450u, 3352799412u, 1504918807u,  783551873u, 3082640443u, 3233442989u,
+    3988292384u, 2596254646u,   62317068u, 1957810842u, 3939845945u, 2647816111u,   81470997u, 1943803523u,
+    3814918930u, 2489596804u,  225274430u, 2053790376u, 3826175755u, 2466906013u,  167816743u, 2097651377u,
+    4027552580u, 2265490386u,  503444072u, 1762050814u, 4150417245u, 2154129355u,  426522225u, 1852507879u,
+    4275313526u, 2312317920u,  282753626u, 1742555852u, 4189708143u, 2394877945u,  397917763u, 1622183637u,
+    3604390888u, 2714866558u,  953729732u, 1340076626u, 3518719985u, 2797360999u, 1068828381u, 1219638859u,
+    3624741850u, 2936675148u,  906185462u, 1090812512u, 3747672003u, 2825379669u,  829329135u, 1181335161u,
+    3412177804u, 3160834842u,  628085408u, 1382605366u, 3423369109u, 3138078467u,  570562233u, 1426400815u,
+    3317316542u, 2998733608u,  733239954u, 1555261956u, 3268935591u, 3050360625u,  752459403u, 1541320221u,
+    2607071920u, 3965973030u, 1969922972u,   40735498u, 2617837225u, 3943577151u, 1913087877u,   83908371u,
+    2512341634u, 3803740692u, 2075208622u,  213261112u, 2463272603u, 3855990285u, 2094854071u,  198958881u,
+    2262029012u, 4057260610u, 1759359992u,  534414190u, 2176718541u, 4139329115u, 1873836001u,  414664567u,
+    2282248934u, 4279200368u, 1711684554u,  285281116u, 2405801727u, 4167216745u, 1634467795u,  376229701u,
+    2685067896u, 3608007406u, 1308918612u,  956543938u, 2808555105u, 3495958263u, 1231636301u, 1047427035u,
+    2932959818u, 3654703836u, 1088359270u,  936918000u, 2847714899u, 3736837829u, 1202900863u,  817233897u,
+    3183342108u, 3401237130u, 1404277552u,  615818150u, 3134207493u, 3453421203u, 1423857449u,  601450431u,
+    3009837614u, 3294710456u, 1567103746u,  711928724u, 3020668471u, 3272380065u, 1510334235u,  755167117u
+};
+
+/*Returns the CRC of the bytes data[0..length-1], computed one byte at a time via lodepng_crc32_table.*/
+unsigned lodepng_crc32(const unsigned char* data, size_t length)
+{
+    unsigned crc = 0xffffffffu; /*start value; the result is inverted again on return*/
+    size_t pos = 0;
+    while (pos != length)
+    {
+        crc = (crc >> 8) ^ lodepng_crc32_table[(crc ^ data[pos++]) & 0xff];
+    }
+    return crc ^ 0xffffffffu;
+}
+#else /* !LODEPNG_NO_COMPILE_CRC */
+unsigned lodepng_crc32(const unsigned char* data, size_t length);
+#endif /* !LODEPNG_NO_COMPILE_CRC */
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Reading and writing single bits and bytes from/to stream for LodePNG   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static unsigned char readBitFromReversedStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+    /*reads the bit at *bitpointer (bit 0 is the most significant bit of byte 0) and advances the pointer*/
+    size_t at = (*bitpointer)++;
+    return (unsigned char)((bitstream[at >> 3] >> (7 - (at & 0x7))) & 1);
+}
+
+static unsigned readBitsFromReversedStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+    unsigned value = 0; /*built from nbits single-bit reads; the bit read first ends up most significant*/
+    size_t remaining = nbits;
+    while (remaining != 0)
+    {
+        value = (value << 1) | (unsigned)readBitFromReversedStream(bitpointer, bitstream);
+        --remaining;
+    }
+    return value;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+static void setBitOfReversedStream0(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+    /*ORs bit into the position at *bitpointer, so the current bit there must be 0 for this to work,
+    then advances the pointer. An earlier position maps to a more significant bit of its byte.*/
+    size_t at = (*bitpointer)++;
+    if (bit != 0)
+    {
+        bitstream[at >> 3] |= (bit << (7 - (at & 0x7)));
+    }
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+static void setBitOfReversedStream(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+    size_t at = (*bitpointer)++; /*position to write; the current bit there may be 0 or 1*/
+    int mask = 1 << (7 - (at & 0x7));
+    if (bit != 0) bitstream[at >> 3] |= mask;
+    else bitstream[at >> 3] &= (unsigned char)(~mask);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG chunks                                                             / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Returns the value of the chunk's length field: the 32-bit integer that lodepng_read32bitInt
+decodes from the first 4 bytes of the chunk.*/
+unsigned lodepng_chunk_length(const unsigned char* chunk)
+{
+    const unsigned char* length_field = chunk;
+    return lodepng_read32bitInt(length_field);
+}
+
+/*Copies the 4 type characters stored in bytes 4..7 of the chunk into `type` and
+null-terminates it, so `type` must have room for 5 chars.*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk)
+{
+    const unsigned char* name = chunk + 4;
+    type[0] = (char)name[0];
+    type[1] = (char)name[1];
+    type[2] = (char)name[2];
+    type[3] = (char)name[3];
+    type[4] = 0; /*null termination char*/
+}
+
+/*Returns 1 if `type` is exactly 4 characters long and equals the chunk's type field
+(bytes 4..7 of the chunk), 0 otherwise.*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type)
+{
+    unsigned i;
+    if (strlen(type) != 4) return 0;
+    for (i = 0; i != 4; ++i)
+    {
+        if (chunk[4 + i] != type[i]) return 0;
+    }
+    return 1;
+}
+
+/*Returns 1 if bit 5 (value 32) of the first type character (chunk byte 4) is set, else 0.*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk)
+{
+    const unsigned char first_letter = chunk[4];
+    return (unsigned char)((first_letter >> 5) & 1);
+}
+
+/*Returns 1 if bit 5 (value 32) of the third type character (chunk byte 6) is set, else 0.*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk)
+{
+    const unsigned char third_letter = chunk[6];
+    return (unsigned char)((third_letter >> 5) & 1);
+}
+
+/*Returns 1 if bit 5 (value 32) of the fourth type character (chunk byte 7) is set, else 0.*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk)
+{
+    const unsigned char fourth_letter = chunk[7];
+    return (unsigned char)((fourth_letter >> 5) & 1);
+}
+
+/*Returns a pointer to the chunk's data, which starts 8 bytes into the chunk
+(after the 4 length bytes and the 4 type bytes).*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk)
+{
+    return chunk + 8;
+}
+
+/*Const variant of lodepng_chunk_data: returns a pointer to the byte 8 positions into the chunk,
+which is where the data follows the 4 length bytes and the 4 type bytes.*/
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk)
+{
+    return chunk + 8;
+}
+
+/*Compares the CRC stored after the chunk's data with one computed by lodepng_crc32.
+Returns 0 when they match and 1 when they differ.*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk)
+{
+    const unsigned data_length = lodepng_chunk_length(chunk);
+    const unsigned stored = lodepng_read32bitInt(&chunk[data_length + 8]);
+    /*the CRC is taken of the data and the 4 chunk type letters, not the length*/
+    const unsigned computed = lodepng_crc32(&chunk[4], data_length + 4);
+    return (stored != computed) ? 1u : 0u;
+}
+
+/*Computes the CRC over the 4 type letters plus the data of the chunk and stores it in the
+4 bytes that directly follow the data. The length field must already be filled in.*/
+void lodepng_chunk_generate_crc(unsigned char* chunk)
+{
+    const unsigned data_length = lodepng_chunk_length(chunk);
+    unsigned char* crc_field = chunk + 8 + data_length;
+    const unsigned crc = lodepng_crc32(chunk + 4, data_length + 4);
+    lodepng_set32bitInt(crc_field, crc);
+}
+
+/*Returns a pointer to the byte just past this chunk, i.e. 12 bytes (length, type and CRC
+fields) plus the chunk's data length further on. No bounds checking is done here.*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk)
+{
+    const unsigned step = lodepng_chunk_length(chunk) + 12;
+    return chunk + step;
+}
+
+/*Const variant of lodepng_chunk_next: returns a pointer to the byte just past this chunk,
+12 bytes plus the chunk's data length further on. No bounds checking is done here.*/
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk)
+{
+    const unsigned step = lodepng_chunk_length(chunk) + 12;
+    return chunk + step;
+}
+
+/*Appends a copy of the complete chunk (12 bytes of length/type/CRC fields plus its data) to the
+buffer *out, growing it with lodepng_realloc and updating *outlength.
+Returns 0 on success, 77 if the sizes overflow, 83 if the reallocation fails; on error *out and
+*outlength are left unchanged.*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk)
+{
+    size_t i;
+    unsigned char *chunk_start, *new_buffer;
+    /*computed in size_t: adding 12 in unsigned arithmetic could wrap around for a huge length
+    field and make the chunk look tiny*/
+    size_t total_chunk_length = (size_t)lodepng_chunk_length(chunk) + 12;
+    size_t new_length = (*outlength) + total_chunk_length;
+    if (total_chunk_length < 12) return 77; /*integer overflow happened*/
+    if (new_length < total_chunk_length || new_length < (*outlength)) return 77; /*integer overflow happened*/
+
+    new_buffer = (unsigned char*)lodepng_realloc(*out, new_length);
+    if (!new_buffer) return 83; /*alloc fail*/
+    (*out) = new_buffer;
+    (*outlength) = new_length;
+    chunk_start = &(*out)[new_length - total_chunk_length];
+
+    for (i = 0; i != total_chunk_length; ++i) chunk_start[i] = chunk[i];
+
+    return 0;
+}
+
+/*Appends a newly built chunk to the buffer *out: the 4 length bytes, the 4 characters of `type`,
+`length` bytes copied from `data`, and the CRC written by lodepng_chunk_generate_crc.
+*out is grown with lodepng_realloc and *outlength is updated.
+Returns 0 on success, 77 if the sizes overflow, 83 if the reallocation fails; on error *out and
+*outlength are left unchanged.*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+    const char* type, const unsigned char* data)
+{
+    size_t i;
+    unsigned char *chunk, *new_buffer;
+    /*size of the whole chunk, computed in size_t so that "+ 12" cannot wrap around in unsigned
+    arithmetic and hide an overflow from the checks below*/
+    size_t total_chunk_length = (size_t)length + 12;
+    size_t new_length = (*outlength) + total_chunk_length;
+    if (total_chunk_length < 12) return 77; /*integer overflow happened*/
+    if (new_length < total_chunk_length || new_length < (*outlength)) return 77; /*integer overflow happened*/
+    new_buffer = (unsigned char*)lodepng_realloc(*out, new_length);
+    if (!new_buffer) return 83; /*alloc fail*/
+    (*out) = new_buffer;
+    (*outlength) = new_length;
+    chunk = &(*out)[new_length - total_chunk_length];
+
+    /*1: length*/
+    lodepng_set32bitInt(chunk, length);
+
+    /*2: chunk name (4 letters)*/
+    chunk[4] = (unsigned char)type[0];
+    chunk[5] = (unsigned char)type[1];
+    chunk[6] = (unsigned char)type[2];
+    chunk[7] = (unsigned char)type[3];
+
+    /*3: the data*/
+    for (i = 0; i != length; ++i) chunk[8 + i] = data[i];
+
+    /*4: CRC (of the chunkname characters and the data)*/
+    lodepng_chunk_generate_crc(chunk);
+
+    return 0;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Color types and such                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Checks whether bit depth `bd` is allowed for the given color type.
+Returns a LodePNG error code: 0 if the combination is allowed, 37 if the bit depth is not
+allowed for this color type, 31 if the color type itself is not one of 0, 2, 3, 4 or 6.*/
+static unsigned checkColorValidity(LodePNGColorType colortype, unsigned bd) /*bd = bitdepth*/
+{
+    const int sub_byte = (bd == 1 || bd == 2 || bd == 4);
+    const int whole_bytes = (bd == 8 || bd == 16);
+    int allowed;
+    switch (colortype)
+    {
+    case 0: allowed = sub_byte || whole_bytes; break; /*grey*/
+    case 3: allowed = sub_byte || bd == 8; break; /*palette*/
+    case 2: /*RGB*/
+    case 4: /*grey + alpha*/
+    case 6: allowed = whole_bytes; break; /*RGBA*/
+    default: return 31;
+    }
+    return allowed ? 0 : 37;
+}
+
+/*Returns the number of channels (samples per pixel) of the given color type, or 0 if the
+value is not one of the handled color types.*/
+static unsigned getNumColorChannels(LodePNGColorType colortype)
+{
+    unsigned channels = 0; /*stays 0 for an unexisting color type*/
+    if (colortype == 0 || colortype == 3) channels = 1; /*grey, palette*/
+    else if (colortype == 4) channels = 2; /*grey + alpha*/
+    else if (colortype == 2) channels = 3; /*RGB*/
+    else if (colortype == 6) channels = 4; /*RGBA*/
+    return channels;
+}
+
+/*Returns the bits per pixel for a color type and bit depth: the number of channels of the
+color type multiplied by the bits per channel.*/
+static unsigned lodepng_get_bpp_lct(LodePNGColorType colortype, unsigned bitdepth)
+{
+    const unsigned channels = getNumColorChannels(colortype);
+    return channels * bitdepth;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Puts a color mode into its default state: 8-bit RGBA, no color key, no palette.
+Does not free anything, so it is meant for a struct that owns no palette yet.*/
+void lodepng_color_mode_init(LodePNGColorMode* info)
+{
+    info->colortype = LCT_RGBA;
+    info->bitdepth = 8;
+
+    info->palette = 0;
+    info->palettesize = 0;
+
+    info->key_defined = 0;
+    info->key_r = 0;
+    info->key_g = 0;
+    info->key_b = 0;
+}
+
+/*Releases what the color mode owns. The palette is the only thing cleaned up here,
+which is delegated to lodepng_palette_clear.*/
+void lodepng_color_mode_cleanup(LodePNGColorMode* info)
+{
+    lodepng_palette_clear(info);
+}
+
+/*Makes dest a deep copy of source. dest is cleaned up first, so it must have been initialized.
+All plain fields are copied by struct assignment; if source has a palette, dest gets its own
+1024-byte buffer holding a copy of the source->palettesize entries (4 bytes each).
+Returns 0 on success or 83 if the palette allocation fails, in which case dest is left with
+no palette and a palette size of 0.*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source)
+{
+    size_t i;
+    lodepng_color_mode_cleanup(dest);
+    *dest = *source;
+    if (source->palette)
+    {
+        dest->palette = (unsigned char*)lodepng_malloc(1024);
+        if (!dest->palette && source->palettesize)
+        {
+            /*keep dest consistent: without a buffer it must not claim to have entries, or
+            later palette reads through dest would dereference a null pointer*/
+            dest->palettesize = 0;
+            return 83; /*alloc fail*/
+        }
+        for (i = 0; i != source->palettesize * 4; ++i) dest->palette[i] = source->palette[i];
+    }
+    return 0;
+}
+
+/*Returns 1 if both color modes are equal, 0 otherwise. Compared are: color type, bit depth,
+whether a color key is defined (and, if so, its r, g and b values), the palette size and every
+palette byte. Note that palettes of different sizes always compare as different, also when one
+of the two sizes is 0.*/
+static int lodepng_color_mode_equal(const LodePNGColorMode* a, const LodePNGColorMode* b)
+{
+    size_t i, num_bytes;
+
+    if (a->colortype != b->colortype || a->bitdepth != b->bitdepth) return 0;
+
+    if (a->key_defined != b->key_defined) return 0;
+    if (a->key_defined && (a->key_r != b->key_r || a->key_g != b->key_g || a->key_b != b->key_b)) return 0;
+
+    if (a->palettesize != b->palettesize) return 0;
+    num_bytes = a->palettesize * 4; /*4 bytes per palette entry*/
+    for (i = 0; i < num_bytes; ++i)
+    {
+        if (a->palette[i] != b->palette[i]) return 0;
+    }
+
+    return 1;
+}
+
+/*Frees the palette buffer, if there is one, and leaves the color mode with a null palette
+pointer and a palette size of 0.*/
+void lodepng_palette_clear(LodePNGColorMode* info)
+{
+    unsigned char* old_palette = info->palette;
+    if (old_palette != 0) lodepng_free(old_palette);
+    info->palettesize = 0;
+    info->palette = 0;
+}
+
+/*Appends one RGBA color to the palette of the color mode and increments its palette size.
+The palette buffer is allocated on first use with room for 256 colors of 4 bytes each and is
+never grown, so at most 256 colors can be stored.
+Returns 0 on success, 83 if the allocation fails and 38 if the palette already holds 256 colors
+(38 is presumably the existing "palette too big" code - TODO confirm against lodepng_error_text).*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    unsigned char* entry;
+    /*the buffer holds exactly 256 entries; refuse to write past its end*/
+    if (info->palettesize >= 256) return 38;
+    if (!info->palette) /*allocate palette if empty*/
+    {
+        /*room for 256 colors with 4 bytes each*/
+        unsigned char* data = (unsigned char*)lodepng_realloc(info->palette, 1024);
+        if (!data) return 83; /*alloc fail*/
+        info->palette = data;
+    }
+    entry = &info->palette[4 * info->palettesize];
+    entry[0] = r;
+    entry[1] = g;
+    entry[2] = b;
+    entry[3] = a;
+    ++info->palettesize;
+    return 0;
+}
+
+/*Returns the number of bits per pixel of the color mode, derived from its color type and
+bit depth by lodepng_get_bpp_lct.*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info)
+{
+    const unsigned bits_per_pixel = lodepng_get_bpp_lct(info->colortype, info->bitdepth);
+    return bits_per_pixel;
+}
+
+/*Returns the number of channels of the color mode's color type, as reported by
+getNumColorChannels (0 for an unknown color type).*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info)
+{
+    const unsigned channels = getNumColorChannels(info->colortype);
+    return channels;
+}
+
+/*Returns 1 if the color type is LCT_GREY or LCT_GREY_ALPHA, 0 otherwise.*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info)
+{
+    if (info->colortype == LCT_GREY) return 1;
+    if (info->colortype == LCT_GREY_ALPHA) return 1;
+    return 0;
+}
+
+/*Returns 1 if the color type value has bit 2 (value 4) set, which is the case for
+color types 4 and 6, and 0 otherwise.*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info)
+{
+    if (info->colortype & 4) return 1; /*4 or 6*/
+    return 0;
+}
+
+/*Returns 1 if the color type is LCT_PALETTE, 0 otherwise.*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info)
+{
+    if (info->colortype == LCT_PALETTE) return 1;
+    return 0;
+}
+
+/*Returns 1 if at least one of the palettesize palette entries has an alpha byte (the 4th byte
+of the entry) other than 255, and 0 otherwise, including when the palette is empty.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info)
+{
+    size_t remaining = info->palettesize;
+    while (remaining != 0)
+    {
+        --remaining;
+        if (info->palette[remaining * 4 + 3] != 255) return 1;
+    }
+    return 0;
+}
+
+/*Returns 1 if pixels in this color mode can be non-opaque: a color key is defined, the color
+type has an alpha channel, or the palette contains an entry with alpha below 255. Else 0.*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info)
+{
+    if (info->key_defined) return 1;
+    if (lodepng_is_alpha_type(info)) return 1;
+    if (lodepng_has_palette_alpha(info)) return 1;
+    return 0;
+}
+
+/*Returns the size in bytes of a raw image of w x h pixels in the given color mode, with the
+pixels packed without padding between scanlines; the total number of bits is rounded up to
+whole bytes.*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+    size_t bpp = lodepng_get_bpp(color);
+    /*widen before multiplying: w * h evaluated in unsigned arithmetic would wrap around
+    before the result is stored in the size_t*/
+    size_t n = (size_t)w * (size_t)h;
+    /*n * bpp / 8 rounded up, split in two terms to keep the intermediate values small*/
+    return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+/*Returns the size in bytes of a raw image of w x h pixels with the given color type and bit
+depth, with the pixels packed without padding between scanlines; the total number of bits is
+rounded up to whole bytes.*/
+size_t lodepng_get_raw_size_lct(unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+    size_t bpp = lodepng_get_bpp_lct(colortype, bitdepth);
+    /*widen before multiplying: w * h evaluated in unsigned arithmetic would wrap around
+    before the result is stored in the size_t*/
+    size_t n = (size_t)w * (size_t)h;
+    /*n * bpp / 8 rounded up, split in two terms to keep the intermediate values small*/
+    return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_DECODER
+/*Returns the size in bytes of h scanlines of w pixels when every scanline is rounded up to a
+whole number of bytes. In an idat chunk, each scanline is a multiple of 8 bits, unlike the
+lodepng output buffer.*/
+static size_t lodepng_get_raw_size_idat(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+    /*will not overflow for any color type if roughly w * h < 268435455*/
+    const size_t bpp = lodepng_get_bpp(color);
+    const size_t full_groups = (w / 8) * bpp; /*bytes used by complete groups of 8 pixels*/
+    const size_t tail = ((w & 7) * bpp + 7) / 8; /*bytes used by the remaining pixels, rounded up*/
+    return h * (full_groups + tail);
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*Marks all 3 unknown-chunk buffers of info as empty: null data pointer and size 0.
+Nothing is freed here.*/
+static void LodePNGUnknownChunks_init(LodePNGInfo* info)
+{
+    unsigned slot;
+    for (slot = 0; slot < 3; ++slot)
+    {
+        info->unknown_chunks_data[slot] = 0;
+        info->unknown_chunks_size[slot] = 0;
+    }
+}
+
+/*Frees the 3 unknown-chunk buffers of info. The pointers and sizes themselves are not reset.*/
+static void LodePNGUnknownChunks_cleanup(LodePNGInfo* info)
+{
+    unsigned slot = 0;
+    while (slot != 3)
+    {
+        lodepng_free(info->unknown_chunks_data[slot]);
+        ++slot;
+    }
+}
+
+/*Replaces the 3 unknown-chunk buffers of dest with freshly allocated copies of those of src.
+The old buffers of dest are freed first. Returns 0 on success or 83 if an allocation fails;
+in that case the buffers copied so far stay in dest and the remaining ones are empty, so dest
+can still be cleaned up safely.*/
+static unsigned LodePNGUnknownChunks_copy(LodePNGInfo* dest, const LodePNGInfo* src)
+{
+    unsigned i;
+
+    LodePNGUnknownChunks_cleanup(dest);
+    /*forget the buffers that were just freed, so that an early return below cannot leave
+    dangling pointers (and a later double free) behind in dest*/
+    LodePNGUnknownChunks_init(dest);
+
+    for (i = 0; i != 3; ++i)
+    {
+        size_t j;
+        size_t size = src->unknown_chunks_size[i];
+        dest->unknown_chunks_data[i] = (unsigned char*)lodepng_malloc(size);
+        if (!dest->unknown_chunks_data[i] && size) return 83; /*alloc fail*/
+        /*only record the size once there is a buffer to go with it*/
+        dest->unknown_chunks_size[i] = size;
+        for (j = 0; j < size; ++j)
+        {
+            dest->unknown_chunks_data[i][j] = src->unknown_chunks_data[i][j];
+        }
+    }
+
+    return 0;
+}
+
+/******************************************************************************/
+
+/*Puts the text-chunk fields of info in the empty state: no key and string arrays and a
+count of 0. Nothing is freed here.*/
+static void LodePNGText_init(LodePNGInfo* info)
+{
+    info->text_keys = NULL;
+    info->text_strings = NULL;
+    info->text_num = 0;
+}
+
+/*Frees every text key and string (via string_cleanup) and then the two arrays holding them.
+The count and the array pointers in info are not reset.*/
+static void LodePNGText_cleanup(LodePNGInfo* info)
+{
+    size_t entry = 0;
+    while (entry != info->text_num)
+    {
+        string_cleanup(&info->text_keys[entry]);
+        string_cleanup(&info->text_strings[entry]);
+        ++entry;
+    }
+    lodepng_free(info->text_keys);
+    lodepng_free(info->text_strings);
+}
+
+/*Fills dest with copies of all text entries of source. The text fields of dest are simply
+overwritten with the empty state first (nothing is freed), then every key/string pair is
+added with lodepng_add_text. Returns 0 on success or the first error code of lodepng_add_text.*/
+static unsigned LodePNGText_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    size_t entry;
+    dest->text_num = 0;
+    dest->text_keys = 0;
+    dest->text_strings = 0;
+    for (entry = 0; entry < source->text_num; ++entry)
+    {
+        CERROR_TRY_RETURN(lodepng_add_text(dest, source->text_keys[entry], source->text_strings[entry]));
+    }
+    return 0;
+}
+
+/*Removes all text entries from info: frees every key, every string and the arrays holding
+them, and leaves info with zero text entries so that it can be reused or cleaned up again.*/
+void lodepng_clear_text(LodePNGInfo* info)
+{
+    LodePNGText_cleanup(info);
+    /*reset the count and the pointers: cleanup only frees, and a later add or cleanup on the
+    stale values would touch freed memory*/
+    LodePNGText_init(info);
+}
+
+/*Appends one text entry (a key and its string) to info. Both arrays are grown by one element
+with lodepng_realloc and the new slots are filled through string_init and string_set.
+Returns 0 on success or 83 if growing either array fails; in that case the number of entries
+is unchanged and info still owns valid arrays.*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str)
+{
+    char** new_keys = (char**)(lodepng_realloc(info->text_keys, sizeof(char*) * (info->text_num + 1)));
+    char** new_strings = (char**)(lodepng_realloc(info->text_strings, sizeof(char*) * (info->text_num + 1)));
+
+    /*A successful realloc may have moved (and released) the old array, so store each new
+    pointer right away. A failed realloc leaves the old array valid, so keep that one. Freeing
+    the new arrays on failure, as was done before, left info pointing at released memory.*/
+    if (new_keys) info->text_keys = new_keys;
+    if (new_strings) info->text_strings = new_strings;
+
+    if (!new_keys || !new_strings) return 83; /*alloc fail*/
+
+    ++info->text_num;
+
+    string_init(&info->text_keys[info->text_num - 1]);
+    string_set(&info->text_keys[info->text_num - 1], key);
+
+    string_init(&info->text_strings[info->text_num - 1]);
+    string_set(&info->text_strings[info->text_num - 1], str);
+
+    return 0;
+}
+
+/******************************************************************************/
+
+/*Puts the international-text fields of info in the empty state: the four arrays (keys,
+language tags, translated keys, strings) are null and the count is 0. Nothing is freed here.*/
+static void LodePNGIText_init(LodePNGInfo* info)
+{
+    info->itext_keys = NULL;
+    info->itext_langtags = NULL;
+    info->itext_transkeys = NULL;
+    info->itext_strings = NULL;
+    info->itext_num = 0;
+}
+
+/*Frees the key, language tag, translated key and string of every international-text entry
+(via string_cleanup) and then the four arrays holding them. The count and the array pointers
+in info are not reset.*/
+static void LodePNGIText_cleanup(LodePNGInfo* info)
+{
+    size_t entry = 0;
+    while (entry != info->itext_num)
+    {
+        string_cleanup(&info->itext_keys[entry]);
+        string_cleanup(&info->itext_langtags[entry]);
+        string_cleanup(&info->itext_transkeys[entry]);
+        string_cleanup(&info->itext_strings[entry]);
+        ++entry;
+    }
+    lodepng_free(info->itext_keys);
+    lodepng_free(info->itext_langtags);
+    lodepng_free(info->itext_transkeys);
+    lodepng_free(info->itext_strings);
+}
+
+/*Fills dest with copies of all international-text entries of source. The itext fields of dest
+are simply overwritten with the empty state first (nothing is freed), then every entry is
+added with lodepng_add_itext. Returns 0 on success or the first error code of lodepng_add_itext.*/
+static unsigned LodePNGIText_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    size_t entry;
+    dest->itext_num = 0;
+    dest->itext_keys = 0;
+    dest->itext_langtags = 0;
+    dest->itext_transkeys = 0;
+    dest->itext_strings = 0;
+    for (entry = 0; entry < source->itext_num; ++entry)
+    {
+        CERROR_TRY_RETURN(lodepng_add_itext(dest, source->itext_keys[entry], source->itext_langtags[entry],
+            source->itext_transkeys[entry], source->itext_strings[entry]));
+    }
+    return 0;
+}
+
+/*Removes all international-text entries from info: frees every entry and the four arrays
+holding them, and leaves info with zero entries so that it can be reused or cleaned up again.*/
+void lodepng_clear_itext(LodePNGInfo* info)
+{
+    LodePNGIText_cleanup(info);
+    /*reset the count and the pointers: cleanup only frees, and a later add or cleanup on the
+    stale values would touch freed memory*/
+    LodePNGIText_init(info);
+}
+
+/*Appends one international-text entry (key, language tag, translated key and string) to info.
+The four arrays are grown by one element with lodepng_realloc and the new slots are filled
+through string_init and string_set.
+Returns 0 on success or 83 if growing any array fails; in that case the number of entries is
+unchanged and info still owns valid arrays.*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+    const char* transkey, const char* str)
+{
+    char** new_keys = (char**)(lodepng_realloc(info->itext_keys, sizeof(char*) * (info->itext_num + 1)));
+    char** new_langtags = (char**)(lodepng_realloc(info->itext_langtags, sizeof(char*) * (info->itext_num + 1)));
+    char** new_transkeys = (char**)(lodepng_realloc(info->itext_transkeys, sizeof(char*) * (info->itext_num + 1)));
+    char** new_strings = (char**)(lodepng_realloc(info->itext_strings, sizeof(char*) * (info->itext_num + 1)));
+
+    /*A successful realloc may have moved (and released) the old array, so store each new
+    pointer right away. A failed realloc leaves the old array valid, so keep that one. Freeing
+    the new arrays on failure, as was done before, left info pointing at released memory.*/
+    if (new_keys) info->itext_keys = new_keys;
+    if (new_langtags) info->itext_langtags = new_langtags;
+    if (new_transkeys) info->itext_transkeys = new_transkeys;
+    if (new_strings) info->itext_strings = new_strings;
+
+    if (!new_keys || !new_langtags || !new_transkeys || !new_strings) return 83; /*alloc fail*/
+
+    ++info->itext_num;
+
+    string_init(&info->itext_keys[info->itext_num - 1]);
+    string_set(&info->itext_keys[info->itext_num - 1], key);
+
+    string_init(&info->itext_langtags[info->itext_num - 1]);
+    string_set(&info->itext_langtags[info->itext_num - 1], langtag);
+
+    string_init(&info->itext_transkeys[info->itext_num - 1]);
+    string_set(&info->itext_transkeys[info->itext_num - 1], transkey);
+
+    string_init(&info->itext_strings[info->itext_num - 1]);
+    string_set(&info->itext_strings[info->itext_num - 1], str);
+
+    return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Puts a LodePNGInfo in its default state: default color mode, interlace, compression and
+filter method 0 and, when ancillary chunks are compiled in, no background color, no time,
+no phys data, no texts and no unknown chunks. Nothing is freed here.*/
+void lodepng_info_init(LodePNGInfo* info)
+{
+    lodepng_color_mode_init(&info->color);
+
+    info->compression_method = 0;
+    info->filter_method = 0;
+    info->interlace_method = 0;
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    info->background_defined = 0;
+    info->background_r = 0;
+    info->background_g = 0;
+    info->background_b = 0;
+
+    LodePNGText_init(info);
+    LodePNGIText_init(info);
+
+    info->time_defined = 0;
+    info->phys_defined = 0;
+
+    LodePNGUnknownChunks_init(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+/*Frees everything a LodePNGInfo owns: the palette of its color mode and, when ancillary chunks
+are compiled in, the texts, the international texts and the unknown-chunk buffers.*/
+void lodepng_info_cleanup(LodePNGInfo* info)
+{
+    LodePNGColorMode* color = &info->color;
+    lodepng_color_mode_cleanup(color);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    LodePNGText_cleanup(info);
+    LodePNGIText_cleanup(info);
+    LodePNGUnknownChunks_cleanup(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+/*Makes dest a deep copy of source. dest is cleaned up first, so it must have been initialized.
+Plain fields are copied by struct assignment; the palette and, when ancillary chunks are
+compiled in, the texts, international texts and unknown chunks are duplicated.
+Returns 0 on success or the error code of the copy step that failed. Also after a failure,
+dest owns only memory of its own and can safely be passed to lodepng_info_cleanup.*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+    lodepng_info_cleanup(dest);
+    *dest = *source;
+
+    /*The struct assignment made dest share every heap pointer with source. Detach all of them
+    before the first step that can fail: otherwise an early return would leave dest aliasing
+    source's buffers, and cleaning up both structs would free them twice.*/
+    lodepng_color_mode_init(&dest->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    LodePNGText_init(dest);
+    LodePNGIText_init(dest);
+    LodePNGUnknownChunks_init(dest);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+    CERROR_TRY_RETURN(lodepng_color_mode_copy(&dest->color, &source->color));
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    CERROR_TRY_RETURN(LodePNGText_copy(dest, source));
+    CERROR_TRY_RETURN(LodePNGIText_copy(dest, source));
+    CERROR_TRY_RETURN(LodePNGUnknownChunks_copy(dest, source));
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    return 0;
+}
+
+/*Exchanges the contents of two LodePNGInfo structs by plain struct assignment. Owned pointers
+simply change owner; nothing is allocated, copied deeply or freed.*/
+void lodepng_info_swap(LodePNGInfo* a, LodePNGInfo* b)
+{
+    const LodePNGInfo saved_b = *b;
+    *b = *a;
+    *a = saved_b;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Stores a group of 1, 2 or 4 bits in a packed byte array, the first group of each byte in its
+most significant bits.
+index: bitgroup index, bits: bitgroup size(1, 2 or 4), in: bitgroup value, out: octet array to add bits to.
+The first group of a byte overwrites the whole byte, the following groups are OR-ed in, so the
+groups of one byte have to be written in increasing index order.*/
+static void addColorBits(unsigned char* out, size_t index, unsigned bits, unsigned in)
+{
+    const unsigned last = (bits == 1) ? 7u : (bits == 2) ? 3u : 1u; /*8 / bits - 1*/
+    /*position of the group inside its byte, e.g. with 4 bits it is 0 for first half or 1 for second half*/
+    const unsigned slot = (unsigned)(index & last);
+    const unsigned shift = bits * (last - slot);
+    /*filter out any other bits of the input value, then move the group into place*/
+    const unsigned char packed = (unsigned char)((in & ((1u << bits) - 1u)) << shift);
+    unsigned char* target = &out[index * bits / 8];
+    if (slot == 0) *target = packed;
+    else *target |= packed;
+}
+
+/*Declared up front so that the struct below can hold pointers to its own type.*/
+typedef struct ColorTree ColorTree;
+
+/*
+One node of a color tree
+This is the data structure used to count the number of unique colors and to get a palette
+index for a color. It's like an octree, but because the alpha channel is used too, each
+node has 16 instead of 8 children.
+The child to follow at each level is selected by one bit of each of r, g, b and a, so a
+complete color is found after descending 8 levels (see color_tree_get and color_tree_add).
+*/
+struct ColorTree
+{
+    ColorTree* children[16]; /*up to 16 pointers to ColorTree of next level; 0 where there is no child*/
+    int index; /*the payload. Only has a meaningful value if this is in the last level; -1 after color_tree_init*/
+};
+
+/*Initializes a color tree node as an empty one: no children and index -1 (no payload).*/
+static void color_tree_init(ColorTree* tree)
+{
+    ColorTree** child = tree->children;
+    ColorTree** const end = tree->children + 16;
+    while (child != end) *child++ = 0;
+    tree->index = -1;
+}
+
+/*Recursively frees all descendants of the node. The node itself is not freed (it is owned by
+the caller) and its child pointers are not reset.*/
+static void color_tree_cleanup(ColorTree* tree)
+{
+    int slot;
+    for (slot = 0; slot < 16; ++slot)
+    {
+        ColorTree* child = tree->children[slot];
+        if (!child) continue;
+        color_tree_cleanup(child);
+        lodepng_free(child);
+    }
+}
+
+/*Looks up an RGBA color in the tree by descending 8 levels; at each level the child is chosen
+by the current bit of r, g, b and a, starting with the least significant bit.
+returns -1 if color not present, its index otherwise*/
+static int color_tree_get(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    int bit;
+    for (bit = 0; bit != 8; ++bit)
+    {
+        const int child = (((r >> bit) & 1) << 3) | (((g >> bit) & 1) << 2)
+            | (((b >> bit) & 1) << 1) | ((a >> bit) & 1);
+        tree = tree->children[child];
+        if (!tree) return -1;
+    }
+    return tree->index;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Returns 1 if the RGBA color is present in the tree (color_tree_get finds a non-negative
+index for it), 0 otherwise.*/
+static int color_tree_has(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    const int index = color_tree_get(tree, r, g, b, a);
+    return index >= 0;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*Adds an RGBA color with the given palette index to the tree, creating the nodes that are
+missing along its 8-level path.
+color is not allowed to already exist.
+Index should be >= 0 (it's signed to be compatible with using -1 for "doesn't exist")
+If a node cannot be allocated, the function returns early without storing the index; the
+nodes created so far stay in the tree and the color is then simply not found by color_tree_get.*/
+static void color_tree_add(ColorTree* tree,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a, unsigned index)
+{
+    int bit;
+    for (bit = 0; bit < 8; ++bit)
+    {
+        int i = 8 * ((r >> bit) & 1) + 4 * ((g >> bit) & 1) + 2 * ((b >> bit) & 1) + 1 * ((a >> bit) & 1);
+        if (!tree->children[i])
+        {
+            ColorTree* node = (ColorTree*)lodepng_malloc(sizeof(ColorTree));
+            if (!node) return; /*alloc fail: do not initialize or descend into a null node*/
+            color_tree_init(node);
+            tree->children[i] = node;
+        }
+        tree = tree->children[i];
+    }
+    tree->index = (int)index;
+}
+
+/*put a pixel, given its RGBA color, into image of any color type
+out: raw image buffer to write into, i: index of the pixel in that buffer,
+mode: color type and bit depth of the buffer, tree: used to find the palette index of the
+color, only read when the color type is LCT_PALETTE.
+Returns 0 when there is no error, or 82 if the color type is LCT_PALETTE and the color is
+not found in the tree.*/
+static unsigned rgba8ToPixel(unsigned char* out, size_t i,
+    const LodePNGColorMode* mode, ColorTree* tree /*for palette*/,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+    if (mode->colortype == LCT_GREY)
+    {
+        /*only the red channel is used as grey value; g and b are ignored*/
+        unsigned char grey = r; /*((unsigned short)r + g + b) / 3*/;
+        if (mode->bitdepth == 8) out[i] = grey;
+        /*16 bit: the 8-bit value is written to both bytes of the sample*/
+        else if (mode->bitdepth == 16) out[i * 2 + 0] = out[i * 2 + 1] = grey;
+        else
+        {
+            /*take the most significant bits of grey*/
+            grey = (grey >> (8 - mode->bitdepth)) & ((1 << mode->bitdepth) - 1);
+            addColorBits(out, i, mode->bitdepth, grey);
+        }
+    }
+    else if (mode->colortype == LCT_RGB)
+    {
+        if (mode->bitdepth == 8)
+        {
+            out[i * 3 + 0] = r;
+            out[i * 3 + 1] = g;
+            out[i * 3 + 2] = b;
+        }
+        else
+        {
+            /*any other bit depth is written as 16 bit: each 8-bit value fills both bytes of its sample*/
+            out[i * 6 + 0] = out[i * 6 + 1] = r;
+            out[i * 6 + 2] = out[i * 6 + 3] = g;
+            out[i * 6 + 4] = out[i * 6 + 5] = b;
+        }
+    }
+    else if (mode->colortype == LCT_PALETTE)
+    {
+        int index = color_tree_get(tree, r, g, b, a);
+        if (index < 0) return 82; /*color not in palette*/
+        if (mode->bitdepth == 8) out[i] = index;
+        else addColorBits(out, i, mode->bitdepth, (unsigned)index);
+    }
+    else if (mode->colortype == LCT_GREY_ALPHA)
+    {
+        /*only the red channel is used as grey value; g and b are ignored*/
+        unsigned char grey = r; /*((unsigned short)r + g + b) / 3*/;
+        if (mode->bitdepth == 8)
+        {
+            out[i * 2 + 0] = grey;
+            out[i * 2 + 1] = a;
+        }
+        else if (mode->bitdepth == 16)
+        {
+            out[i * 4 + 0] = out[i * 4 + 1] = grey;
+            out[i * 4 + 2] = out[i * 4 + 3] = a;
+        }
+        /*for any other bit depth nothing is written*/
+    }
+    else if (mode->colortype == LCT_RGBA)
+    {
+        if (mode->bitdepth == 8)
+        {
+            out[i * 4 + 0] = r;
+            out[i * 4 + 1] = g;
+            out[i * 4 + 2] = b;
+            out[i * 4 + 3] = a;
+        }
+        else
+        {
+            /*any other bit depth is written as 16 bit: each 8-bit value fills both bytes of its sample*/
+            out[i * 8 + 0] = out[i * 8 + 1] = r;
+            out[i * 8 + 2] = out[i * 8 + 3] = g;
+            out[i * 8 + 4] = out[i * 8 + 5] = b;
+            out[i * 8 + 6] = out[i * 8 + 7] = a;
+        }
+    }
+
+    return 0; /*no error*/
+}
+
+/*put a pixel, given its RGBA16 color, into image of any color 16-bitdepth type
+Every sample is stored as 2 bytes, high byte first. For the grey types only r is used as the
+grey value. Nothing is written for the palette type or an unknown color type; the bit depth
+of mode is not looked at.*/
+static void rgba16ToPixel(unsigned char* out, size_t i,
+    const LodePNGColorMode* mode,
+    unsigned short r, unsigned short g, unsigned short b, unsigned short a)
+{
+    unsigned short samples[4];
+    unsigned count, k;
+    unsigned char* pixel;
+
+    /*collect the samples of this color type in storage order*/
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+        samples[0] = r; /*((unsigned)r + g + b) / 3*/
+        count = 1;
+        break;
+    case LCT_RGB:
+        samples[0] = r; samples[1] = g; samples[2] = b;
+        count = 3;
+        break;
+    case LCT_GREY_ALPHA:
+        samples[0] = r; /*((unsigned)r + g + b) / 3*/
+        samples[1] = a;
+        count = 2;
+        break;
+    case LCT_RGBA:
+        samples[0] = r; samples[1] = g; samples[2] = b; samples[3] = a;
+        count = 4;
+        break;
+    default:
+        return;
+    }
+
+    pixel = out + i * (2 * count);
+    for (k = 0; k != count; ++k)
+    {
+        pixel[2 * k + 0] = (unsigned char)((samples[k] >> 8) & 255);
+        pixel[2 * k + 1] = (unsigned char)(samples[k] & 255);
+    }
+}
+
+/*Get RGBA8 color of pixel with index i (y * width + x) from the raw image with given color type.
+r, g, b, a: receive the color; in: the raw image; mode: color type and bit depth of `in`.
+For 16-bit input only the first byte of each sample is returned. For the color types without
+alpha channel, alpha is 0 if the pixel equals the color key of mode (when defined) and 255
+otherwise. Nothing is written for a color type that is not handled below.*/
+static void getPixelColorRGBA8(unsigned char* r, unsigned char* g,
+    unsigned char* b, unsigned char* a,
+    const unsigned char* in, size_t i,
+    const LodePNGColorMode* mode)
+{
+    if (mode->colortype == LCT_GREY)
+    {
+        if (mode->bitdepth == 8)
+        {
+            *r = *g = *b = in[i];
+            if (mode->key_defined && *r == mode->key_r) *a = 0;
+            else *a = 255;
+        }
+        else if (mode->bitdepth == 16)
+        {
+            *r = *g = *b = in[i * 2 + 0];
+            /*the color key is compared against the full 16-bit sample value*/
+            if (mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r) *a = 0;
+            else *a = 255;
+        }
+        else
+        {
+            unsigned highest = ((1U << mode->bitdepth) - 1U); /*highest possible value for this bit depth*/
+            size_t j = i * mode->bitdepth;
+            unsigned value = readBitsFromReversedStream(&j, in, mode->bitdepth);
+            /*scale the value from the range 0..highest to the range 0..255*/
+            *r = *g = *b = (value * 255) / highest;
+            if (mode->key_defined && value == mode->key_r) *a = 0;
+            else *a = 255;
+        }
+    }
+    else if (mode->colortype == LCT_RGB)
+    {
+        if (mode->bitdepth == 8)
+        {
+            *r = in[i * 3 + 0]; *g = in[i * 3 + 1]; *b = in[i * 3 + 2];
+            if (mode->key_defined && *r == mode->key_r && *g == mode->key_g && *b == mode->key_b) *a = 0;
+            else *a = 255;
+        }
+        else
+        {
+            /*any other bit depth is read as 16 bit: 2 bytes per sample, of which the first is returned*/
+            *r = in[i * 6 + 0];
+            *g = in[i * 6 + 2];
+            *b = in[i * 6 + 4];
+            if (mode->key_defined && 256U * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+                && 256U * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+                && 256U * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b) *a = 0;
+            else *a = 255;
+        }
+    }
+    else if (mode->colortype == LCT_PALETTE)
+    {
+        unsigned index;
+        if (mode->bitdepth == 8) index = in[i];
+        else
+        {
+            /*palette indices of less than 8 bits are packed; j is the bit position of pixel i*/
+            size_t j = i * mode->bitdepth;
+            index = readBitsFromReversedStream(&j, in, mode->bitdepth);
+        }
+
+        if (index >= mode->palettesize)
+        {
+            /*This is an error according to the PNG spec, but common PNG decoders make it black instead.
+            Done here too, slightly faster due to no error handling needed.*/
+            *r = *g = *b = 0;
+            *a = 255;
+        }
+        else
+        {
+            *r = mode->palette[index * 4 + 0];
+            *g = mode->palette[index * 4 + 1];
+            *b = mode->palette[index * 4 + 2];
+            *a = mode->palette[index * 4 + 3];
+        }
+    }
+    else if (mode->colortype == LCT_GREY_ALPHA)
+    {
+        if (mode->bitdepth == 8)
+        {
+            *r = *g = *b = in[i * 2 + 0];
+            *a = in[i * 2 + 1];
+        }
+        else
+        {
+            /*any other bit depth is read as 16 bit: the first byte of each sample is returned*/
+            *r = *g = *b = in[i * 4 + 0];
+            *a = in[i * 4 + 2];
+        }
+    }
+    else if (mode->colortype == LCT_RGBA)
+    {
+        if (mode->bitdepth == 8)
+        {
+            *r = in[i * 4 + 0];
+            *g = in[i * 4 + 1];
+            *b = in[i * 4 + 2];
+            *a = in[i * 4 + 3];
+        }
+        else
+        {
+            /*any other bit depth is read as 16 bit: the first byte of each sample is returned*/
+            *r = in[i * 8 + 0];
+            *g = in[i * 8 + 2];
+            *b = in[i * 8 + 4];
+            *a = in[i * 8 + 6];
+        }
+    }
+}
+
+/*Similar to getPixelColorRGBA8, but with all the for loops inside of the color
+mode test cases, optimized to convert the colors much faster, when converting
+to RGBA or RGB with 8 bit per channel. buffer must be RGBA or RGB output with
+enough memory, if has_alpha is true the output is RGBA. mode has the color mode
+of the input buffer.
+numpixels is the number of pixels to convert; the output uses 4 bytes per pixel if
+has_alpha is true and 3 bytes per pixel otherwise.*/
+static void getPixelColorsRGBA8(unsigned char* buffer, size_t numpixels,
+    unsigned has_alpha, const unsigned char* in,
+    const LodePNGColorMode* mode)
+{
+    /*output bytes per pixel; buffer is advanced by this amount after every pixel*/
+    unsigned num_channels = has_alpha ? 4 : 3;
+    size_t i;
+    if (mode->colortype == LCT_GREY)
+    {
+        if (mode->bitdepth == 8)
+        {
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                buffer[0] = buffer[1] = buffer[2] = in[i];
+                /*alpha is 0 only for pixels that equal the color key, if one is defined*/
+                if (has_alpha) buffer[3] = mode->key_defined && in[i] == mode->key_r ? 0 : 255;
+            }
+        }
+        else if (mode->bitdepth == 16)
+        {
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                /*only the first byte of the 16-bit sample is kept; the key is compared with the full value*/
+                buffer[0] = buffer[1] = buffer[2] = in[i * 2];
+                if (has_alpha) buffer[3] = mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r ? 0 : 255;
+            }
+        }
+        else
+        {
+            unsigned highest = ((1U << mode->bitdepth) - 1U); /*highest possible value for this bit depth*/
+            /*bit position in the input; advanced by readBitsFromReversedStream*/
+            size_t j = 0;
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                unsigned value = readBitsFromReversedStream(&j, in, mode->bitdepth);
+                /*scale the value from the range 0..highest to the range 0..255*/
+                buffer[0] = buffer[1] = buffer[2] = (value * 255) / highest;
+                if (has_alpha) buffer[3] = mode->key_defined && value == mode->key_r ? 0 : 255;
+            }
+        }
+    }
+    else if (mode->colortype == LCT_RGB)
+    {
+        if (mode->bitdepth == 8)
+        {
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                buffer[0] = in[i * 3 + 0];
+                buffer[1] = in[i * 3 + 1];
+                buffer[2] = in[i * 3 + 2];
+                if (has_alpha) buffer[3] = mode->key_defined && buffer[0] == mode->key_r
+                    && buffer[1] == mode->key_g && buffer[2] == mode->key_b ? 0 : 255;
+            }
+        }
+        else
+        {
+            /*any other bit depth is read as 16 bit: 2 bytes per sample, of which the first is kept*/
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                buffer[0] = in[i * 6 + 0];
+                buffer[1] = in[i * 6 + 2];
+                buffer[2] = in[i * 6 + 4];
+                if (has_alpha) buffer[3] = mode->key_defined
+                    && 256U * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+                    && 256U * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+                    && 256U * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b ? 0 : 255;
+            }
+        }
+    }
+    else if (mode->colortype == LCT_PALETTE)
+    {
+        unsigned index;
+        /*bit position in the input, only used for bit depths other than 8*/
+        size_t j = 0;
+        for (i = 0; i != numpixels; ++i, buffer += num_channels)
+        {
+            if (mode->bitdepth == 8) index = in[i];
+            else index = readBitsFromReversedStream(&j, in, mode->bitdepth);
+
+            if (index >= mode->palettesize)
+            {
+                /*This is an error according to the PNG spec, but most PNG decoders make it black instead.
+                Done here too, slightly faster due to no error handling needed.*/
+                buffer[0] = buffer[1] = buffer[2] = 0;
+                if (has_alpha) buffer[3] = 255;
+            }
+            else
+            {
+                buffer[0] = mode->palette[index * 4 + 0];
+                buffer[1] = mode->palette[index * 4 + 1];
+                buffer[2] = mode->palette[index * 4 + 2];
+                if (has_alpha) buffer[3] = mode->palette[index * 4 + 3];
+            }
+        }
+    }
+    else if (mode->colortype == LCT_GREY_ALPHA)
+    {
+        if (mode->bitdepth == 8)
+        {
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                buffer[0] = buffer[1] = buffer[2] = in[i * 2 + 0];
+                if (has_alpha) buffer[3] = in[i * 2 + 1];
+            }
+        }
+        else
+        {
+            /*any other bit depth is read as 16 bit: the first byte of each sample is kept*/
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                buffer[0] = buffer[1] = buffer[2] = in[i * 4 + 0];
+                if (has_alpha) buffer[3] = in[i * 4 + 2];
+            }
+        }
+    }
+    else if (mode->colortype == LCT_RGBA)
+    {
+        if (mode->bitdepth == 8)
+        {
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                buffer[0] = in[i * 4 + 0];
+                buffer[1] = in[i * 4 + 1];
+                buffer[2] = in[i * 4 + 2];
+                if (has_alpha) buffer[3] = in[i * 4 + 3];
+            }
+        }
+        else
+        {
+            /*any other bit depth is read as 16 bit: the first byte of each sample is kept*/
+            for (i = 0; i != numpixels; ++i, buffer += num_channels)
+            {
+                buffer[0] = in[i * 8 + 0];
+                buffer[1] = in[i * 8 + 2];
+                buffer[2] = in[i * 8 + 4];
+                if (has_alpha) buffer[3] = in[i * 8 + 6];
+            }
+        }
+    }
+}
+
+/*Reads pixel number i (y * width + x) from a raw image whose color mode is itself 16-bit and
+returns it as 16-bit R, G, B and A. Color types not handled below leave the four outputs untouched.*/
+static void getPixelColorRGBA16(unsigned short* r, unsigned short* g, unsigned short* b, unsigned short* a,
+    const unsigned char* in, size_t i, const LodePNGColorMode* mode)
+{
+    const unsigned char* px; /*first byte of pixel i; each sample is two bytes, high byte first*/
+    unsigned c0, c1, c2; /*decoded samples, kept as unsigned for the color key comparison*/
+    switch (mode->colortype)
+    {
+    case LCT_GREY:
+        px = in + i * 2;
+        c0 = 256u * px[0] + px[1];
+        *r = *g = *b = c0;
+        *a = (mode->key_defined && c0 == mode->key_r) ? 0 : 65535; /*the key value is fully transparent*/
+        break;
+    case LCT_RGB:
+        px = in + i * 6;
+        *r = c0 = 256u * px[0] + px[1];
+        *g = c1 = 256u * px[2] + px[3];
+        *b = c2 = 256u * px[4] + px[5];
+        *a = (mode->key_defined && c0 == mode->key_r && c1 == mode->key_g && c2 == mode->key_b) ? 0 : 65535;
+        break;
+    case LCT_GREY_ALPHA:
+        px = in + i * 4;
+        *r = *g = *b = 256u * px[0] + px[1];
+        *a = 256u * px[2] + px[3];
+        break;
+    case LCT_RGBA:
+        px = in + i * 8;
+        *r = 256u * px[0] + px[1]; *g = 256u * px[2] + px[3];
+        *b = 256u * px[4] + px[5]; *a = 256u * px[6] + px[7];
+        break;
+    default: break; /*e.g. LCT_PALETTE: nothing is written*/
+    }
+}
+
+/*Converts the raw w x h image `in` from color mode mode_in to color mode mode_out and writes it to `out`.
+out is written without any size check, so the caller must have allocated enough room for the converted image.
+Returns 0 on success, or the error reported by rgba8ToPixel for the first pixel it cannot store.*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+    const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+    unsigned w, unsigned h)
+{
+    size_t i;
+    ColorTree tree;
+    size_t numpixels = (size_t)w * (size_t)h; /*widen first: w * h in unsigned wraps around for huge images*/
+    unsigned error = 0;
+
+    if (lodepng_color_mode_equal(mode_out, mode_in)) /*nothing to convert: plain byte copy*/
+    {
+        size_t numbytes = lodepng_get_raw_size(w, h, mode_in);
+        for (i = 0; i != numbytes; ++i) out[i] = in[i];
+        return 0;
+    }
+
+    if (mode_out->colortype == LCT_PALETTE) /*fill the color tree that is handed to rgba8ToPixel below*/
+    {
+        size_t palettesize = mode_out->palettesize;
+        const unsigned char* palette = mode_out->palette;
+        size_t palsize = 1u << mode_out->bitdepth; /*most entries that an index of this bit depth can address*/
+        /*if the user specified output palette but did not give the values, assume
+        they want the values of the input color type (assuming that one is palette).
+        Note that we never create a new palette ourselves.*/
+        if (palettesize == 0)
+        {
+            palettesize = mode_in->palettesize;
+            palette = mode_in->palette;
+        }
+        if (palettesize < palsize) palsize = palettesize;
+        color_tree_init(&tree);
+        for (i = 0; i != palsize; ++i)
+        {
+            const unsigned char* p = &palette[i * 4]; /*palette entries are 4 bytes: R, G, B, A*/
+            color_tree_add(&tree, p[0], p[1], p[2], p[3], i);
+        }
+    }
+
+    if (mode_in->bitdepth == 16 && mode_out->bitdepth == 16) /*keep the full 16 bits per sample*/
+    {
+        for (i = 0; i != numpixels; ++i)
+        {
+            unsigned short r = 0, g = 0, b = 0, a = 0;
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+            rgba16ToPixel(out, i, mode_out, r, g, b, a);
+        }
+    }
+    else if (mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGBA)
+    {
+        getPixelColorsRGBA8(out, numpixels, 1, in, mode_in); /*whole image at once, with alpha*/
+    }
+    else if (mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGB)
+    {
+        getPixelColorsRGBA8(out, numpixels, 0, in, mode_in); /*whole image at once, without alpha*/
+    }
+    else /*every other combination goes pixel by pixel through 8-bit RGBA*/
+    {
+        unsigned char r = 0, g = 0, b = 0, a = 0;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode_in);
+            error = rgba8ToPixel(out, i, mode_out, &tree, r, g, b, a);
+            if (error) break;
+        }
+    }
+
+    if (mode_out->colortype == LCT_PALETTE) color_tree_cleanup(&tree); /*the tree only exists for palette output*/
+
+    return error;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*Puts a color profile in its start state: no color, key or alpha seen, 0 colors counted, 1 bit per sample*/
+void lodepng_color_profile_init(LodePNGColorProfile* profile)
+{
+    profile->bits = 1;
+    profile->numcolors = 0;
+    profile->colored = profile->alpha = 0;
+    profile->key = 0;
+    profile->key_b = profile->key_g = profile->key_r = 0;
+}
+
+/*function used for debug purposes with C++*/
+/*void printColorProfile(LodePNGColorProfile* p)
+{
+std::cout << "colored: " << (int)p->colored << ", ";
+std::cout << "key: " << (int)p->key << ", ";
+std::cout << "key_r: " << (int)p->key_r << ", ";
+std::cout << "key_g: " << (int)p->key_g << ", ";
+std::cout << "key_b: " << (int)p->key_b << ", ";
+std::cout << "alpha: " << (int)p->alpha << ", ";
+std::cout << "numcolors: " << (int)p->numcolors << ", ";
+std::cout << "bits: " << (int)p->bits << std::endl;
+}*/
+
+/*Returns the smallest PNG bit depth (1, 2, 4 or 8) that can represent the given 8-bit value exactly*/
+static unsigned getValueRequiredBits(unsigned char value)
+{
+    /*upscaled 1-bit values are 0 and 255, 2-bit ones are multiples of 85, 4-bit ones multiples of 17*/
+    if (value % 255 == 0) return 1;
+    if (value % 85 == 0) return 2;
+    return value % 17 == 0 ? 4 : 8;
+}
+
+/*Scans the raw w x h image `in` (color mode `mode`) and records in `profile` whether it uses color, a color key,
+alpha or 16-bit samples, plus its first distinct colors. profile must already have been inited by the caller.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+    const unsigned char* in, unsigned w, unsigned h,
+    const LodePNGColorMode* mode)
+{
+    unsigned error = 0;
+    size_t i;
+    ColorTree tree;
+    size_t numpixels = (size_t)w * (size_t)h; /*widen first: w * h in unsigned wraps around for huge images*/
+
+    unsigned colored_done = lodepng_is_greyscale_type(mode) ? 1 : 0;
+    unsigned alpha_done = lodepng_can_have_alpha(mode) ? 0 : 1;
+    unsigned numcolors_done = 0;
+    unsigned bpp = lodepng_get_bpp(mode);
+    unsigned bits_done = bpp == 1 ? 1 : 0;
+    unsigned maxnumcolors = 257; /*counting stops once this many distinct colors were seen*/
+    unsigned sixteen = 0;
+    if (bpp <= 8) maxnumcolors = bpp == 1 ? 2 : (bpp == 2 ? 4 : (bpp == 4 ? 16 : 256));
+
+    color_tree_init(&tree);
+
+    /*Check if the 16-bit input is truly 16-bit*/
+    if (mode->bitdepth == 16)
+    {
+        unsigned short r, g, b, a;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+            if ((r & 255) != ((r >> 8) & 255) || (g & 255) != ((g >> 8) & 255) ||
+                (b & 255) != ((b >> 8) & 255) || (a & 255) != ((a >> 8) & 255)) /*first and second byte differ*/
+            {
+                sixteen = 1;
+                break;
+            }
+        }
+    }
+
+    if (sixteen)
+    {
+        unsigned short r = 0, g = 0, b = 0, a = 0;
+        profile->bits = 16;
+        bits_done = numcolors_done = 1; /*counting colors no longer useful, palette doesn't support 16-bit*/
+
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+
+            if (!colored_done && (r != g || r != b))
+            {
+                profile->colored = 1;
+                colored_done = 1;
+            }
+
+            if (!alpha_done)
+            {
+                unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+                if (a != 65535 && (a != 0 || (profile->key && !matchkey)))
+                {
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                }
+                else if (a == 0 && !profile->alpha && !profile->key)
+                {
+                    profile->key = 1;
+                    profile->key_r = r;
+                    profile->key_g = g;
+                    profile->key_b = b;
+                }
+                else if (a == 65535 && profile->key && matchkey)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                }
+            }
+            if (alpha_done && numcolors_done && colored_done && bits_done) break;
+        }
+
+        if (profile->key && !profile->alpha) /*recheck: pixels seen before the key was chosen were not compared to it*/
+        {
+            for (i = 0; i != numpixels; ++i)
+            {
+                getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+                if (a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                }
+            }
+        }
+    }
+    else /* < 16-bit */
+    {
+        unsigned char r = 0, g = 0, b = 0, a = 0;
+        for (i = 0; i != numpixels; ++i)
+        {
+            getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+
+            if (!bits_done && profile->bits < 8)
+            {
+                /*only r is checked, < 8 bits is only relevant for greyscale*/
+                unsigned bits = getValueRequiredBits(r);
+                if (bits > profile->bits) profile->bits = bits;
+            }
+            bits_done = (profile->bits >= bpp);
+
+            if (!colored_done && (r != g || r != b))
+            {
+                profile->colored = 1;
+                colored_done = 1;
+                if (profile->bits < 8) profile->bits = 8; /*PNG has no colored modes with less than 8-bit per channel*/
+            }
+
+            if (!alpha_done)
+            {
+                unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+                if (a != 255 && (a != 0 || (profile->key && !matchkey)))
+                {
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                }
+                else if (a == 0 && !profile->alpha && !profile->key)
+                {
+                    profile->key = 1;
+                    profile->key_r = r;
+                    profile->key_g = g;
+                    profile->key_b = b;
+                }
+                else if (a == 255 && profile->key && matchkey)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                }
+            }
+
+            if (!numcolors_done)
+            {
+                if (!color_tree_has(&tree, r, g, b, a))
+                {
+                    color_tree_add(&tree, r, g, b, a, profile->numcolors);
+                    if (profile->numcolors < 256) /*only the first 256 colors are stored, 4 bytes (R, G, B, A) each*/
+                    {
+                        unsigned char* p = profile->palette;
+                        unsigned n = profile->numcolors;
+                        p[n * 4 + 0] = r;
+                        p[n * 4 + 1] = g;
+                        p[n * 4 + 2] = b;
+                        p[n * 4 + 3] = a;
+                    }
+                    ++profile->numcolors;
+                    numcolors_done = profile->numcolors >= maxnumcolors;
+                }
+            }
+
+            if (alpha_done && numcolors_done && colored_done && bits_done) break;
+        }
+
+        if (profile->key && !profile->alpha) /*recheck: pixels seen before the key was chosen were not compared to it*/
+        {
+            for (i = 0; i != numpixels; ++i)
+            {
+                getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+                if (a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+                {
+                    /* Color key cannot be used if an opaque pixel also has that RGB color. */
+                    profile->alpha = 1;
+                    profile->key = 0;
+                    alpha_done = 1;
+                    if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+                }
+            }
+        }
+
+        /*make the profile's key always 16-bit for consistency - repeat each byte twice*/
+        profile->key_r += (profile->key_r << 8);
+        profile->key_g += (profile->key_g << 8);
+        profile->key_b += (profile->key_b << 8);
+    }
+
+    color_tree_cleanup(&tree);
+    return error; /*error is never changed above, so this is always 0*/
+}
+
+/*Automatically chooses the color type that gives the smallest amount of bits in the output image,
+e.g. grey if there are only greyscale pixels, palette if there are at most 256 colors, ...
+mode_out should contain the user chosen color model, but it is overwritten with the newly chosen,
+potentially smaller one. image is the raw w x h image in color mode mode_in.
+Returns 0 on success, otherwise the error of lodepng_get_color_profile or lodepng_palette_add.*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in)
+{
+    LodePNGColorProfile prof;
+    unsigned error = 0;
+    unsigned i, n, palettebits, palette_ok;
+
+    lodepng_color_profile_init(&prof);
+    error = lodepng_get_color_profile(&prof, image, w, h, mode_in);
+    if (error) return error;
+    mode_out->key_defined = 0;
+
+    if (prof.key && (size_t)w * (size_t)h <= 16) /*widened: w * h in unsigned can wrap to a small value*/
+    {
+        prof.alpha = 1; /*too few pixels to justify tRNS chunk overhead*/
+        prof.key = 0;
+        if (prof.bits < 8) prof.bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+    }
+    n = prof.numcolors;
+    palettebits = n <= 2 ? 1 : (n <= 4 ? 2 : (n <= 16 ? 4 : 8));
+    palette_ok = n <= 256 && prof.bits <= 8;
+    if ((size_t)w * (size_t)h < n * 2) palette_ok = 0; /*don't add palette overhead if image has only a few pixels*/
+    if (!prof.colored && prof.bits <= palettebits) palette_ok = 0; /*grey is less overhead*/
+
+    if (palette_ok)
+    {
+        unsigned char* p = prof.palette;
+        lodepng_palette_clear(mode_out); /*remove potential earlier palette*/
+        for (i = 0; i != prof.numcolors; ++i)
+        {
+            error = lodepng_palette_add(mode_out, p[i * 4 + 0], p[i * 4 + 1], p[i * 4 + 2], p[i * 4 + 3]);
+            if (error) break;
+        }
+
+        mode_out->colortype = LCT_PALETTE;
+        mode_out->bitdepth = palettebits;
+
+        if (mode_in->colortype == LCT_PALETTE && mode_in->palettesize >= mode_out->palettesize
+            && mode_in->bitdepth == mode_out->bitdepth)
+        {
+            /*If input should have same palette colors, keep original to preserve its order and prevent conversion*/
+            lodepng_color_mode_cleanup(mode_out);
+            lodepng_color_mode_copy(mode_out, mode_in);
+        }
+    }
+    else /*8-bit or 16-bit per channel*/
+    {
+        mode_out->bitdepth = prof.bits;
+        mode_out->colortype = prof.alpha ? (prof.colored ? LCT_RGBA : LCT_GREY_ALPHA)
+            : (prof.colored ? LCT_RGB : LCT_GREY);
+
+        if (prof.key)
+        {
+            unsigned mask = (1u << mode_out->bitdepth) - 1u; /*profile always uses 16-bit, mask converts it*/
+            mode_out->key_r = prof.key_r & mask;
+            mode_out->key_g = prof.key_g & mask;
+            mode_out->key_b = prof.key_b & mask;
+            mode_out->key_defined = 1;
+        }
+    }
+
+    return error;
+}
+
+#endif /* #ifdef LODEPNG_COMPILE_ENCODER */
+
+/*
+Paeth predictor, used by PNG filter type 4.
+a, b and c are the left, upper and upper-left bytes as paired up by unfilterScanline. They are passed as
+short, although they come from unsigned chars, so that the differences below can be negative.
+*/
+static unsigned char paethPredictor(short a, short b, short c)
+{
+    /*distance of the estimate a + b - c to each of the three candidates*/
+    const short dist_a = abs(b - c);
+    const short dist_b = abs(a - c);
+    const short dist_c = abs(a + b - c - c);
+
+    if (dist_c < dist_a && dist_c < dist_b) return (unsigned char)c;
+    return (unsigned char)(dist_b < dist_a ? b : a);
+}
+
+/*shared values used by multiple Adam7 related functions*/
+
+/*pass i holds the pixels at x = ADAM7_IX[i] + k * ADAM7_DX[i], y = ADAM7_IY[i] + l * ADAM7_DY[i] (k, l >= 0)*/
+static const unsigned ADAM7_IX[7] = { 0, 4, 0, 2, 0, 1, 0 }; /*x start values*/
+static const unsigned ADAM7_IY[7] = { 0, 0, 4, 0, 2, 0, 1 }; /*y start values*/
+static const unsigned ADAM7_DX[7] = { 8, 8, 4, 4, 2, 2, 1 }; /*x delta values*/
+static const unsigned ADAM7_DY[7] = { 8, 8, 8, 4, 4, 2, 2 }; /*y delta values*/
+
+/*
+Computes the dimensions and the buffer offsets of the 7 Adam7 reduced images (passes) of a w x h image.
+passw, passh: output, width and height in pixels of each pass; both are 0 for a pass without pixels
+filter_passstart: output, byte offset of each pass when every scanline is preceded by a filter type byte
+padded_passstart: output, byte offset of each pass without filter bytes but with scanlines padded to bytes
+passstart: output, byte offset of each pass without padding between scanlines, only at the end of the pass
+The three offset arrays have 8 values: the 8th one is the offset of the byte after the end of the last pass.
+w, h: width and height of the non-interlaced image
+bpp: bits per pixel; "padded" is only relevant if bpp is less than 8
+*/
+static void Adam7_getpassvalues(unsigned passw[7], unsigned passh[7], size_t filter_passstart[8],
+    size_t padded_passstart[8], size_t passstart[8], unsigned w, unsigned h, unsigned bpp)
+{
+    unsigned pass;
+
+    /*the offsets are running sums, so the first pass starts at 0 in all three layouts*/
+    filter_passstart[0] = padded_passstart[0] = passstart[0] = 0;
+    for (pass = 0; pass != 7; ++pass)
+    {
+        /*width and height in pixels of this pass*/
+        unsigned pw = (w + ADAM7_DX[pass] - ADAM7_IX[pass] - 1) / ADAM7_DX[pass];
+        unsigned ph = (h + ADAM7_DY[pass] - ADAM7_IY[pass] - 1) / ADAM7_DY[pass];
+        unsigned linebytes;
+
+        if (pw == 0 || ph == 0) pw = ph = 0; /*a pass without pixels has no size in either direction*/
+        passw[pass] = pw;
+        passh[pass] = ph;
+
+        /*bits padded if needed to fill full byte at end of each scanline*/
+        linebytes = (pw * bpp + 7) / 8;
+
+        /*one filter type byte per scanline; an empty pass adds 0 bytes because its ph is 0*/
+        filter_passstart[pass + 1] = filter_passstart[pass] + ph * (1 + linebytes);
+
+        /*same scanlines without the filter type bytes*/
+        padded_passstart[pass + 1] = padded_passstart[pass] + ph * linebytes;
+
+        /*only padded at end of reduced image*/
+        passstart[pass + 1] = passstart[pass] + (ph * pw * bpp + 7) / 8;
+    }
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Decoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Reads the PNG signature and the IHDR chunk from the first 33 bytes of `in`.
+The image size goes to *w and *h; the bit depth, color type and the compression, filter and
+interlace methods go to state->info_png, which is cleaned up and re-inited first.
+return value is error*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h, LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    static const unsigned char png_signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+    LodePNGInfo* info = &state->info_png;
+    const unsigned char* ihdr; /*start of the first chunk: 4 length bytes, 4 type bytes, data, 4 CRC bytes*/
+    unsigned k;
+
+    /*error: the given data is empty*/
+    if (in == 0 || insize == 0) CERROR_RETURN_ERROR(state->error, 48);
+    /*error: the data length is smaller than the length of a PNG header*/
+    if (insize < 33) CERROR_RETURN_ERROR(state->error, 27);
+
+    /*when decoding a new PNG image, make sure all parameters created after previous decoding are reset*/
+    lodepng_info_cleanup(info);
+    lodepng_info_init(info);
+
+    for (k = 0; k != 8; ++k)
+    {
+        /*error: the first 8 bytes are not the correct PNG signature*/
+        if (in[k] != png_signature[k]) CERROR_RETURN_ERROR(state->error, 28);
+    }
+
+    /*the first chunk follows directly after the 8 signature bytes*/
+    ihdr = in + 8;
+
+    /*error: header size must be 13 bytes*/
+    if (lodepng_chunk_length(ihdr) != 13) CERROR_RETURN_ERROR(state->error, 94);
+    /*error: it doesn't start with a IHDR chunk!*/
+    if (!lodepng_chunk_type_equals(ihdr, "IHDR")) CERROR_RETURN_ERROR(state->error, 29);
+
+    /*read the values given in the header; the chunk data starts 8 bytes into the chunk*/
+    *w = lodepng_read32bitInt(ihdr + 8);
+    *h = lodepng_read32bitInt(ihdr + 12);
+    info->color.bitdepth = ihdr[16];
+    info->color.colortype = (LodePNGColorType)ihdr[17];
+    info->compression_method = ihdr[18];
+    info->filter_method = ihdr[19];
+    info->interlace_method = ihdr[20];
+
+    /*error: a width or height of 0 is not accepted*/
+    if (*w == 0 || *h == 0) CERROR_RETURN_ERROR(state->error, 93);
+
+    if (!state->decoder.ignore_crc)
+    {
+        /*the checksum is taken over the 4 type bytes and the 13 data bytes; the stored CRC follows them*/
+        const unsigned expected = lodepng_read32bitInt(ihdr + 21);
+        const unsigned actual = lodepng_crc32(ihdr + 4, 17);
+        if (expected != actual) CERROR_RETURN_ERROR(state->error, 57); /*invalid CRC*/
+    }
+
+    /*error: only compression method 0 is allowed in the specification*/
+    if (info->compression_method != 0) CERROR_RETURN_ERROR(state->error, 32);
+    /*error: only filter method 0 is allowed in the specification*/
+    if (info->filter_method != 0) CERROR_RETURN_ERROR(state->error, 33);
+    /*error: only interlace methods 0 and 1 exist in the specification*/
+    if (info->interlace_method > 1) CERROR_RETURN_ERROR(state->error, 34);
+
+    /*the result of the color type / bit depth check is the final error value*/
+    state->error = checkColorValidity(info->color.colortype, info->color.bitdepth);
+    return state->error;
+}
+
+static unsigned unfilterScanline(unsigned char* recon, const unsigned char* scanline, const unsigned char* precon,
+    size_t bytewidth, unsigned char filterType, size_t length)
+{
+    /*
+    For PNG filter method 0: reverses the filter of one scanline of `length` bytes, working byte per byte.
+    bytewidth is the distance in bytes to the previous pixel (bytewidth = 1 when the pixels are smaller than 1 byte)
+    precon is the previous unfiltered scanline (0 if there is none), recon the result, scanline the current one
+    the incoming scanlines do NOT include the filtertype byte, that one is given in the parameter filterType instead
+    recon and scanline MAY be the same memory address! precon must be disjoint.
+    Returns 0, or error 36 when filterType is not one of the five types handled below.
+    */
+
+    size_t i;
+    switch (filterType)
+    {
+    case 0: /*unfiltered: plain copy*/
+        for (i = 0; i != length; ++i) recon[i] = scanline[i];
+        break;
+    case 1: /*add the already reconstructed byte bytewidth positions to the left; the first pixel has none*/
+        for (i = 0; i != bytewidth; ++i) recon[i] = scanline[i];
+        for (i = bytewidth; i < length; ++i) recon[i] = scanline[i] + recon[i - bytewidth];
+        break;
+    case 2: /*add the byte at the same position in the previous scanline*/
+        if (precon)
+        {
+            for (i = 0; i != length; ++i) recon[i] = scanline[i] + precon[i];
+        }
+        else
+        {
+            for (i = 0; i != length; ++i) recon[i] = scanline[i]; /*no previous scanline: nothing to add*/
+        }
+        break;
+    case 3: /*add the rounded down average of the left byte and the byte of the previous scanline*/
+        if (precon)
+        {
+            for (i = 0; i != bytewidth; ++i) recon[i] = scanline[i] + (precon[i] >> 1);
+            for (i = bytewidth; i < length; ++i) recon[i] = scanline[i] + ((recon[i - bytewidth] + precon[i]) >> 1);
+        }
+        else
+        {
+            for (i = 0; i != bytewidth; ++i) recon[i] = scanline[i];
+            for (i = bytewidth; i < length; ++i) recon[i] = scanline[i] + (recon[i - bytewidth] >> 1);
+        }
+        break;
+    case 4: /*add the Paeth prediction made from the left, upper and upper-left bytes*/
+        if (precon)
+        {
+            for (i = 0; i != bytewidth; ++i)
+            {
+                recon[i] = (scanline[i] + precon[i]); /*paethPredictor(0, precon[i], 0) is always precon[i]*/
+            }
+            for (i = bytewidth; i < length; ++i)
+            {
+                recon[i] = (scanline[i] + paethPredictor(recon[i - bytewidth], precon[i], precon[i - bytewidth]));
+            }
+        }
+        else
+        {
+            for (i = 0; i != bytewidth; ++i)
+            {
+                recon[i] = scanline[i];
+            }
+            for (i = bytewidth; i < length; ++i)
+            {
+                /*paethPredictor(recon[i - bytewidth], 0, 0) is always recon[i - bytewidth]*/
+                recon[i] = (scanline[i] + recon[i - bytewidth]);
+            }
+        }
+        break;
+    default: return 36; /*error: unexisting filter type given*/
+    }
+    return 0;
+}
+
+static unsigned unfilter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    /*
+    For PNG filter method 0.
+    Unfilters one image of w x h pixels (the whole image without interlacing, one reduced image with Adam7).
+    in: h scanlines, each one a filter type byte followed by the filtered bytes of that line
+    out: receives the h unfiltered scanlines without filter type bytes, must have enough bytes allocated
+    in and out are allowed to be the same memory address
+    bpp: bits per pixel. An error from unfilterScanline is handed to CERROR_TRY_RETURN, otherwise 0 is returned.
+    */
+
+    /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+    const size_t bytewidth = (bpp + 7) / 8;
+    const size_t linebytes = (w * bpp + 7) / 8;
+    unsigned char* above = 0; /*previous unfiltered line; there is none for the first line*/
+    unsigned row;
+
+    for (row = 0; row != h; ++row)
+    {
+        unsigned char* dst = out + linebytes * row;
+        const unsigned char* src = in + (1 + linebytes) * row; /*the extra filterbyte added to each row*/
+        const unsigned char filterType = src[0];
+
+        CERROR_TRY_RETURN(unfilterScanline(dst, src + 1, above, bytewidth, filterType, linebytes));
+
+        above = dst;
+    }
+
+    return 0;
+}
+
+/*
+Rearranges the pixels of an Adam7 interlaced image into a regular, non-interlaced image of w x h pixels.
+in: the seven reduced images one after the other, with no padding bits between scanlines, but between
+reduced images so that each reduced image starts at a byte.
+out: receives the re-ordered pixels, w * h * bpp bits in total.
+in is possibly bigger than out due to the padding bits between reduced images.
+out must be big enough AND must be 0 everywhere if bpp < 8 in the current implementation,
+because setBitOfReversedStream0 is used to write the bits (see the note at that call).
+bpp: bits per pixel
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_deinterlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned pass, x, y, b;
+
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    for (pass = 0; pass != 7; ++pass)
+    {
+        /*size of this reduced image, and where its pixels sit in the full image*/
+        const unsigned pw = passw[pass], ph = passh[pass];
+        const unsigned x0 = ADAM7_IX[pass], dx = ADAM7_DX[pass];
+        const unsigned y0 = ADAM7_IY[pass], dy = ADAM7_DY[pass];
+
+        for (y = 0; y < ph; ++y)
+        {
+            for (x = 0; x < pw; ++x)
+            {
+                if (bpp >= 8)
+                {
+                    /*pixels are whole bytes: copy bytewidth bytes from the pass to the full image*/
+                    const size_t bytewidth = bpp / 8;
+                    const size_t pixelinstart = passstart[pass] + (y * pw + x) * bytewidth;
+                    const size_t pixeloutstart = ((y0 + y * dy) * w + x0 + x * dx) * bytewidth;
+                    for (b = 0; b < bytewidth; ++b)
+                    {
+                        out[pixeloutstart + b] = in[pixelinstart + b];
+                    }
+                }
+                else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/
+                {
+                    const unsigned ilinebits = bpp * pw;
+                    const unsigned olinebits = bpp * w;
+                    /*bit pointers (for in and out buffer)*/
+                    size_t ibp = (8 * passstart[pass]) + (y * ilinebits + x * bpp);
+                    size_t obp = (y0 + y * dy) * olinebits + (x0 + x * dx) * bpp;
+
+                    for (b = 0; b < bpp; ++b)
+                    {
+                        const unsigned char bit = readBitFromReversedStream(&ibp, in);
+                        /*note that this function assumes the out buffer is completely 0, use setBitOfReversedStream otherwise*/
+                        setBitOfReversedStream0(&obp, out, bit);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void removePaddingBits(unsigned char* out, const unsigned char* in,
+    size_t olinebits, size_t ilinebits, unsigned h)
+{
+    /*
+    Copies h scanlines of olinebits bits each from in to out, skipping the (ilinebits - olinebits) padding
+    bits that follow every input scanline. Needed after unfiltering when scanlines are not a multiple of
+    8 bits, before the Adam7 code, the color convert code and the output to the user work with the image.
+    in and out are allowed to be the same buffer, in may also be higher but still overlapping; in must
+    have >= ilinebits*h bits, out must have >= olinebits*h bits, olinebits must be <= ilinebits
+    also used to move bits after earlier such operations happened, e.g. in a sequence of reduced images from Adam7
+    only useful if (ilinebits - olinebits) is a value in the range 1..7
+    */
+    const size_t padding = ilinebits - olinebits;
+    size_t ibp = 0, obp = 0; /*input and output bit pointers*/
+    size_t col;
+    unsigned row;
+
+    for (row = 0; row != h; ++row, ibp += padding)
+    {
+        for (col = 0; col != olinebits; ++col)
+        {
+            const unsigned char bit = readBitFromReversedStream(&ibp, in);
+            setBitOfReversedStream(&obp, out, bit);
+        }
+    }
+}
+
+/*Converts the decompressed IDAT data `in` (filter type bytes, possible padding bits, possibly Adam7
+interlaced) into the pure 2D image `out` with the PNG's colortype. out must be big enough to contain
+the full image. NOTE: the in buffer will be overwritten with intermediate data! return value is error*/
+static unsigned postProcessScanlines(unsigned char* out, unsigned char* in,
+    unsigned w, unsigned h, const LodePNGInfo* info_png)
+{
+    /*
+    Steps:
+    *) if no Adam7: 1) unfilter 2) remove padding bits (= possible extra bits per scanline if bpp < 8)
+    *) if adam7: 1) 7x unfilter 2) 7x remove padding bits 3) Adam7_deinterlace
+    */
+    const unsigned bpp = lodepng_get_bpp(&info_png->color);
+    if (bpp == 0) return 31; /*error: invalid colortype*/
+
+    if (info_png->interlace_method != 0) /*Adam7; the non-interlaced case is handled below*/
+    {
+        unsigned passw[7], passh[7];
+        size_t filter_passstart[8], padded_passstart[8], passstart[8];
+        unsigned pass;
+
+        Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+        for (pass = 0; pass != 7; ++pass)
+        {
+            unsigned char* padded = &in[padded_passstart[pass]];
+            CERROR_TRY_RETURN(unfilter(padded, &in[filter_passstart[pass]], passw[pass], passh[pass], bpp));
+            /*TODO: possible efficiency improvement: if in this reduced image the bits fit nicely in 1 scanline,
+            move bytes instead of bits or move not at all*/
+            if (bpp < 8)
+            {
+                /*remove padding bits in scanlines; after this there still may be padding
+                bits between the different reduced images: each reduced image still starts nicely at a byte*/
+                removePaddingBits(&in[passstart[pass]], padded, passw[pass] * bpp, ((passw[pass] * bpp + 7) / 8) * 8, passh[pass]);
+            }
+        }
+        Adam7_deinterlace(out, in, w, h, bpp);
+    }
+    else
+    {
+        const unsigned linebits = w * bpp;
+        const unsigned paddedbits = ((linebits + 7) / 8) * 8; /*line size rounded up to whole bytes*/
+        if (bpp < 8 && linebits != paddedbits)
+        {
+            CERROR_TRY_RETURN(unfilter(in, in, w, h, bpp)); /*in place, the padding is removed on the way to out*/
+            removePaddingBits(out, in, linebits, paddedbits, h);
+        }
+        /*we can immediately filter into the out buffer, no other steps needed*/
+        else CERROR_TRY_RETURN(unfilter(out, in, w, h, bpp));
+    }
+
+    return 0;
+}
+
+/*
+palette chunk (PLTE)
+Replaces color->palette with the RGB triplets found in data, stored as RGBA with alpha 255.
+data/chunkLength: the chunk payload; the number of entries is chunkLength / 3
+return value is error: 0 = ok, 38 = more than 256 entries, 83 = allocation failed
+*/
+static unsigned readChunk_PLTE(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+    size_t pos = 0, i;
+    size_t numentries = chunkLength / 3;
+
+    /*validate the size before touching the existing palette or allocating, so a bogus
+    chunk length cannot trigger a huge allocation*/
+    if (numentries > 256) return 38; /*error: palette too big*/
+
+    if (color->palette) lodepng_free(color->palette);
+    color->palettesize = numentries;
+    color->palette = (unsigned char*)lodepng_malloc(4 * color->palettesize);
+    if (!color->palette && color->palettesize)
+    {
+        color->palettesize = 0;
+        return 83; /*alloc fail*/
+    }
+
+    for (i = 0; i != color->palettesize; ++i)
+    {
+        color->palette[4 * i + 0] = data[pos++]; /*R*/
+        color->palette[4 * i + 1] = data[pos++]; /*G*/
+        color->palette[4 * i + 2] = data[pos++]; /*B*/
+        color->palette[4 * i + 3] = 255; /*alpha*/
+    }
+
+    return 0; /* OK */
+}
+
+/*
+transparency chunk (tRNS)
+For palette images the payload holds one alpha byte per palette entry; for greyscale and RGB
+images it holds a 16-bit big endian color key per channel.
+return value is error: 0 = ok, 38/30/41 = wrong chunk size for the color type, 42 = color type has no tRNS
+*/
+static unsigned readChunk_tRNS(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+    size_t n;
+
+    switch (color->colortype)
+    {
+    case LCT_PALETTE:
+        /*error: more alpha values given than there are palette entries*/
+        if (chunkLength > color->palettesize) return 38;
+        for (n = 0; n < chunkLength; ++n) color->palette[4 * n + 3] = data[n];
+        return 0; /* OK */
+
+    case LCT_GREY:
+        /*error: this chunk must be 2 bytes for greyscale image*/
+        if (chunkLength != 2) return 30;
+        color->key_defined = 1;
+        color->key_r = color->key_g = color->key_b = 256u * data[0] + data[1];
+        return 0; /* OK */
+
+    case LCT_RGB:
+        /*error: this chunk must be 6 bytes for RGB image*/
+        if (chunkLength != 6) return 41;
+        color->key_defined = 1;
+        color->key_r = 256u * data[0] + data[1];
+        color->key_g = 256u * data[2] + data[3];
+        color->key_b = 256u * data[4] + data[5];
+        return 0; /* OK */
+
+    default:
+        return 42; /*error: tRNS chunk not allowed for other color models*/
+    }
+}
+
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+background color chunk (bKGD)
+Stores the suggested background color in info: a palette index for indexed images,
+otherwise 16-bit big endian channel values. Other color types are left untouched.
+return value is error: 0 = ok, 43/44/45 = wrong chunk size for the color type
+*/
+static unsigned readChunk_bKGD(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    switch (info->color.colortype)
+    {
+    case LCT_PALETTE:
+        /*error: this chunk must be 1 byte for indexed color image*/
+        if (chunkLength != 1) return 43;
+        info->background_defined = 1;
+        info->background_r = info->background_g = info->background_b = data[0];
+        break;
+
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+        /*error: this chunk must be 2 bytes for greyscale image*/
+        if (chunkLength != 2) return 44;
+        info->background_defined = 1;
+        info->background_r = info->background_g = info->background_b = 256u * data[0] + data[1];
+        break;
+
+    case LCT_RGB:
+    case LCT_RGBA:
+        /*error: this chunk must be 6 bytes for RGB image*/
+        if (chunkLength != 6) return 45;
+        info->background_defined = 1;
+        info->background_r = 256u * data[0] + data[1];
+        info->background_g = 256u * data[2] + data[3];
+        info->background_b = 256u * data[4] + data[5];
+        break;
+
+    default:
+        break;
+    }
+
+    return 0; /* OK */
+}
+
+/*
+text chunk (tEXt)
+The payload is a keyword, a 0 byte, then the text running to the end of the chunk.
+Both are copied into freshly allocated C strings and handed to lodepng_add_text.
+return value is error: 0 = ok, 89 = keyword length not in 1..79, 83 = allocation failed,
+or whatever lodepng_add_text returns
+*/
+static unsigned readChunk_tEXt(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    char *keyword = 0, *text = 0;
+    unsigned keylen = 0, textstart = 0, textlen = 0;
+    unsigned n;
+
+    /*the keyword ends at the first 0 byte, or at the end of the chunk*/
+    while (keylen < chunkLength && data[keylen] != 0) ++keylen;
+    /*even though it's not allowed by the standard, no error is thrown if
+    there's no null termination char, if the text is empty*/
+    if (keylen < 1 || keylen > 79) error = 89; /*keyword too short or long*/
+
+    if (!error)
+    {
+        keyword = (char*)lodepng_malloc(keylen + 1);
+        if (!keyword) error = 83; /*alloc fail*/
+    }
+
+    if (!error)
+    {
+        keyword[keylen] = 0;
+        for (n = 0; n != keylen; ++n) keyword[n] = (char)data[n];
+
+        textstart = keylen + 1; /*skip keyword null terminator*/
+        textlen = chunkLength < textstart ? 0 : chunkLength - textstart;
+
+        text = (char*)lodepng_malloc(textlen + 1);
+        if (!text) error = 83; /*alloc fail*/
+    }
+
+    if (!error)
+    {
+        text[textlen] = 0;
+        for (n = 0; n != textlen; ++n) text[n] = (char)data[textstart + n];
+
+        error = lodepng_add_text(info, keyword, text);
+    }
+
+    lodepng_free(keyword);
+    lodepng_free(text);
+
+    return error;
+}
+
+/*
+compressed text chunk (zTXt)
+The payload is a keyword, a 0 byte, a compression method byte (must be 0), then the
+zlib-compressed text. The text is decompressed, 0-terminated and handed to lodepng_add_text.
+return value is error: 0 = ok, 75 = chunk too short / no terminator, 89 = keyword length not in 1..79,
+72 = unsupported compression method, 83 = allocation failed, or an error from zlib_decompress
+or lodepng_add_text
+*/
+static unsigned readChunk_zTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+    const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    unsigned i;
+
+    unsigned length, string2_begin;
+    char *key = 0;
+    ucvector decoded;
+
+    ucvector_init(&decoded);
+
+    while (!error) /*not really a while loop, only used to break on error*/
+    {
+        for (length = 0; length < chunkLength && data[length] != 0; ++length);
+        if (length + 2 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+        if (length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+        key = (char*)lodepng_malloc(length + 1);
+        if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        key[length] = 0;
+        for (i = 0; i != length; ++i) key[i] = (char)data[i];
+
+        if (data[length + 1] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+        string2_begin = length + 2;
+        if (string2_begin > chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+
+        length = chunkLength - string2_begin;
+        /*will fail if zlib error, e.g. if length is too small*/
+        error = zlib_decompress(&decoded.data, &decoded.size,
+            (unsigned char*)(&data[string2_begin]),
+            length, zlibsettings);
+        if (error) break;
+
+        /*append the terminating 0. This must not fail silently: decoded.data is used
+        as a C string below, so an unterminated buffer would be read out of bounds*/
+        if (!ucvector_resize(&decoded, decoded.size + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+        decoded.data[decoded.size - 1] = 0;
+
+        error = lodepng_add_text(info, key, (char*)decoded.data);
+
+        break;
+    }
+
+    lodepng_free(key);
+    ucvector_cleanup(&decoded);
+
+    return error;
+}
+
+/*
+international text chunk (iTXt)
+Payload layout as parsed here: keyword, 0, compression flag, compression method (must be 0),
+language tag, 0, translated keyword, 0, text (zlib-compressed when the flag is nonzero).
+The four strings are copied, 0-terminated and handed to lodepng_add_itext.
+return value is error: 0 = ok, 30 = chunk shorter than 5 bytes, 75 = chunk too short for its keyword,
+89 = keyword length not in 1..79, 72 = unsupported compression method, 83 = allocation failed,
+or an error from zlib_decompress or lodepng_add_itext
+*/
+static unsigned readChunk_iTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+    const unsigned char* data, size_t chunkLength)
+{
+    unsigned error = 0;
+    unsigned i;
+
+    unsigned length, begin, compressed;
+    char *key = 0, *langtag = 0, *transkey = 0;
+    ucvector decoded;
+    ucvector_init(&decoded);
+
+    while (!error) /*not really a while loop, only used to break on error*/
+    {
+        /*Quick check if the chunk length isn't too small. Even without check
+        it'd still fail with other error checks below if it's too short. This just gives a different error code.*/
+        if (chunkLength < 5) CERROR_BREAK(error, 30); /*iTXt chunk too short*/
+
+        /*read the key*/
+        for (length = 0; length < chunkLength && data[length] != 0; ++length);
+        if (length + 3 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination char, corrupt?*/
+        if (length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+        key = (char*)lodepng_malloc(length + 1);
+        if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        key[length] = 0;
+        for (i = 0; i != length; ++i) key[i] = (char)data[i];
+
+        /*read the compression method*/
+        compressed = data[length + 1];
+        if (data[length + 2] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+        /*even though it's not allowed by the standard, no error is thrown if
+        there's no null termination char, if the text is empty for the next 3 texts*/
+
+        /*read the langtag*/
+        begin = length + 3;
+        length = 0;
+        for (i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+        langtag = (char*)lodepng_malloc(length + 1);
+        if (!langtag) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        langtag[length] = 0;
+        for (i = 0; i != length; ++i) langtag[i] = (char)data[begin + i];
+
+        /*read the transkey*/
+        begin += length + 1;
+        length = 0;
+        for (i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+        transkey = (char*)lodepng_malloc(length + 1);
+        if (!transkey) CERROR_BREAK(error, 83); /*alloc fail*/
+
+        transkey[length] = 0;
+        for (i = 0; i != length; ++i) transkey[i] = (char)data[begin + i];
+
+        /*read the actual text*/
+        begin += length + 1;
+
+        /*begin may lie past the end when terminators are missing; treat the text as empty then*/
+        length = chunkLength < begin ? 0 : chunkLength - begin;
+
+        if (compressed)
+        {
+            /*will fail if zlib error, e.g. if length is too small*/
+            error = zlib_decompress(&decoded.data, &decoded.size,
+                (unsigned char*)(&data[begin]),
+                length, zlibsettings);
+            if (error) break;
+            if (decoded.allocsize < decoded.size) decoded.allocsize = decoded.size;
+
+            /*append the terminating 0. This must not fail silently: decoded.data is used
+            as a C string below, so an unterminated buffer would be read out of bounds*/
+            if (!ucvector_resize(&decoded, decoded.size + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+            decoded.data[decoded.size - 1] = 0;
+        }
+        else
+        {
+            if (!ucvector_resize(&decoded, length + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+
+            decoded.data[length] = 0;
+            for (i = 0; i != length; ++i) decoded.data[i] = data[begin + i];
+        }
+
+        error = lodepng_add_itext(info, key, langtag, transkey, (char*)decoded.data);
+
+        break;
+    }
+
+    lodepng_free(key);
+    lodepng_free(langtag);
+    lodepng_free(transkey);
+    ucvector_cleanup(&decoded);
+
+    return error;
+}
+
+/*
+image modification time chunk (tIME)
+The payload is exactly 7 bytes: a 16-bit big endian year, then month, day, hour, minute, second.
+return value is error: 0 = ok, 73 = wrong chunk size
+*/
+static unsigned readChunk_tIME(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    if (chunkLength != 7) return 73; /*invalid tIME chunk size*/
+
+    info->time.year = ((unsigned)data[0] << 8) | data[1];
+    info->time.month = data[2];
+    info->time.day = data[3];
+    info->time.hour = data[4];
+    info->time.minute = data[5];
+    info->time.second = data[6];
+    info->time_defined = 1;
+
+    return 0; /* OK */
+}
+
+/*
+physical pixel dimensions chunk (pHYs)
+The payload is exactly 9 bytes: two 32-bit big endian values (x and y) followed by a unit byte.
+return value is error: 0 = ok, 74 = wrong chunk size
+*/
+static unsigned readChunk_pHYs(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+    if (chunkLength != 9) return 74; /*invalid pHYs chunk size*/
+
+    info->phys_x = ((unsigned)data[0] << 24) | ((unsigned)data[1] << 16)
+        | ((unsigned)data[2] << 8) | data[3];
+    info->phys_y = ((unsigned)data[4] << 24) | ((unsigned)data[5] << 16)
+        | ((unsigned)data[6] << 8) | data[7];
+    info->phys_unit = data[8];
+    info->phys_defined = 1;
+
+    return 0; /* OK */
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+read a PNG, the result will be in the same color type as the PNG (hence "generic")
+out: receives a newly allocated buffer with the decoded pixels, or 0 when an error occurred
+w, h: receive the image dimensions as reported by lodepng_inspect
+state: decoder settings are read from it; info_png is filled in; the result code is stored in state->error
+in, insize: the complete PNG file in memory
+Steps: inspect the header, walk all chunks (collecting IDAT data and handling the known chunk types),
+decompress the collected IDAT data, check its size against the prediction, then run postProcessScanlines.
+*/
+static void decodeGeneric(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    unsigned char IEND = 0; /*set to 1 once the IEND chunk has been seen, which ends the chunk loop*/
+    const unsigned char* chunk; /*points at the chunk currently being processed*/
+    size_t i;
+    ucvector idat; /*the data from idat chunks*/
+    ucvector scanlines; /*the decompressed idat data*/
+    size_t predict; /*expected size of the decompressed data*/
+    size_t numpixels;
+    size_t outsize = 0;
+
+    /*for unknown chunk order*/
+    /*NOTE(review): unknown is set to 1 at the first unknown chunk and never reset in this function,
+    so the CRC check below is skipped for every chunk that follows one - confirm this is intended*/
+    unsigned unknown = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    unsigned critical_pos = 1; /*1 = after IHDR, 2 = after PLTE, 3 = after IDAT*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+    /*provide some proper output values if error will happen*/
+    *out = 0;
+
+    state->error = lodepng_inspect(w, h, state, in, insize); /*reads header and resets other parameters in state->info_png*/
+    if (state->error) return;
+
+    numpixels = *w * *h;
+
+    /*multiplication overflow*/
+    if (*h != 0 && numpixels / *h != *w) CERROR_RETURN(state->error, 92);
+    /*multiplication overflow possible further below. Allows up to 2^31-1 pixel
+    bytes with 16-bit RGBA, the rest is room for filter bytes.*/
+    if (numpixels > 268435455) CERROR_RETURN(state->error, 92);
+
+    /*no early returns from here on: idat and scanlines are cleaned up at the end of the function*/
+    ucvector_init(&idat);
+    chunk = &in[33]; /*first byte of the first chunk after the header*/
+
+    /*loop through the chunks, ignoring unknown chunks and stopping at IEND chunk.
+    IDAT data is appended to the idat vector*/
+    while (!IEND && !state->error)
+    {
+        unsigned chunkLength;
+        const unsigned char* data; /*the data in the chunk*/
+
+        /*error: size of the in buffer too small to contain next chunk*/
+        if ((size_t)((chunk - in) + 12) > insize || chunk < in) CERROR_BREAK(state->error, 30);
+
+        /*length of the data of the chunk, excluding the length bytes, chunk type and CRC bytes*/
+        chunkLength = lodepng_chunk_length(chunk);
+        /*error: chunk length larger than the max PNG chunk size*/
+        if (chunkLength > 2147483647) CERROR_BREAK(state->error, 63);
+
+        /*the 12 accounts for the length, type and CRC fields surrounding the chunk data*/
+        if ((size_t)((chunk - in) + chunkLength + 12) > insize || (chunk + chunkLength + 12) < in)
+        {
+            CERROR_BREAK(state->error, 64); /*error: size of the in buffer too small to contain next chunk*/
+        }
+
+        data = lodepng_chunk_data_const(chunk);
+
+        /*IDAT chunk, containing compressed image data*/
+        if (lodepng_chunk_type_equals(chunk, "IDAT"))
+        {
+            /*append this chunk's data to what earlier IDAT chunks provided*/
+            size_t oldsize = idat.size;
+            if (!ucvector_resize(&idat, oldsize + chunkLength)) CERROR_BREAK(state->error, 83 /*alloc fail*/);
+            for (i = 0; i != chunkLength; ++i) idat.data[oldsize + i] = data[i];
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            critical_pos = 3;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+        /*IEND chunk*/
+        else if (lodepng_chunk_type_equals(chunk, "IEND"))
+        {
+            IEND = 1;
+        }
+        /*palette chunk (PLTE)*/
+        else if (lodepng_chunk_type_equals(chunk, "PLTE"))
+        {
+            state->error = readChunk_PLTE(&state->info_png.color, data, chunkLength);
+            if (state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            critical_pos = 2;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+        /*palette transparency chunk (tRNS)*/
+        else if (lodepng_chunk_type_equals(chunk, "tRNS"))
+        {
+            state->error = readChunk_tRNS(&state->info_png.color, data, chunkLength);
+            if (state->error) break;
+        }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*background color chunk (bKGD)*/
+        else if (lodepng_chunk_type_equals(chunk, "bKGD"))
+        {
+            state->error = readChunk_bKGD(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+        /*text chunk (tEXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "tEXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_tEXt(&state->info_png, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*compressed text chunk (zTXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "zTXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_zTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*international text chunk (iTXt)*/
+        else if (lodepng_chunk_type_equals(chunk, "iTXt"))
+        {
+            if (state->decoder.read_text_chunks)
+            {
+                state->error = readChunk_iTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+                if (state->error) break;
+            }
+        }
+        /*image modification time chunk (tIME)*/
+        else if (lodepng_chunk_type_equals(chunk, "tIME"))
+        {
+            state->error = readChunk_tIME(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+        /*physical pixel dimensions chunk (pHYs)*/
+        else if (lodepng_chunk_type_equals(chunk, "pHYs"))
+        {
+            state->error = readChunk_pHYs(&state->info_png, data, chunkLength);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        else /*it's not an implemented chunk type, so ignore it: skip over the data*/
+        {
+            /*error: unknown critical chunk (5th bit of first byte of chunk type is 0)*/
+            if (!lodepng_chunk_ancillary(chunk)) CERROR_BREAK(state->error, 69);
+
+            unknown = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+            if (state->decoder.remember_unknown_chunks)
+            {
+                /*keep the whole chunk, filed under the position (after IHDR / PLTE / IDAT) where it appeared*/
+                state->error = lodepng_chunk_append(&state->info_png.unknown_chunks_data[critical_pos - 1],
+                    &state->info_png.unknown_chunks_size[critical_pos - 1], chunk);
+                if (state->error) break;
+            }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        }
+
+        if (!state->decoder.ignore_crc && !unknown) /*check CRC if wanted, only on known chunk types*/
+        {
+            if (lodepng_chunk_check_crc(chunk)) CERROR_BREAK(state->error, 57); /*invalid CRC*/
+        }
+
+        if (!IEND) chunk = lodepng_chunk_next_const(chunk);
+    }
+
+    ucvector_init(&scanlines);
+    /*predict output size, to allocate exact size for output buffer to avoid more dynamic allocation.
+    If the decompressed size does not match the prediction, the image must be corrupt.*/
+    if (state->info_png.interlace_method == 0)
+    {
+        /*The extra *h is added because this are the filter bytes every scanline starts with*/
+        predict = lodepng_get_raw_size_idat(*w, *h, &state->info_png.color) + *h;
+    }
+    else
+    {
+        /*Adam-7 interlaced: predicted size is the sum of the 7 sub-images sizes*/
+        /*each term is the raw size of one reduced image plus one filter byte per scanline of it;
+        the passes guarded by a width check are left out when the image is too narrow for them*/
+        const LodePNGColorMode* color = &state->info_png.color;
+        predict = 0;
+        predict += lodepng_get_raw_size_idat((*w + 7) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+        if (*w > 4) predict += lodepng_get_raw_size_idat((*w + 3) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+        predict += lodepng_get_raw_size_idat((*w + 3) >> 2, (*h + 3) >> 3, color) + ((*h + 3) >> 3);
+        if (*w > 2) predict += lodepng_get_raw_size_idat((*w + 1) >> 2, (*h + 3) >> 2, color) + ((*h + 3) >> 2);
+        predict += lodepng_get_raw_size_idat((*w + 1) >> 1, (*h + 1) >> 2, color) + ((*h + 1) >> 2);
+        if (*w > 1) predict += lodepng_get_raw_size_idat((*w + 0) >> 1, (*h + 1) >> 1, color) + ((*h + 1) >> 1);
+        predict += lodepng_get_raw_size_idat((*w + 0), (*h + 0) >> 1, color) + ((*h + 0) >> 1);
+    }
+    if (!state->error && !ucvector_reserve(&scanlines, predict)) state->error = 83; /*alloc fail*/
+    if (!state->error)
+    {
+        state->error = zlib_decompress(&scanlines.data, &scanlines.size, idat.data,
+            idat.size, &state->decoder.zlibsettings);
+        if (!state->error && scanlines.size != predict) state->error = 91; /*decompressed size doesn't match prediction*/
+    }
+    ucvector_cleanup(&idat);
+
+    if (!state->error)
+    {
+        outsize = lodepng_get_raw_size(*w, *h, &state->info_png.color);
+        *out = (unsigned char*)lodepng_malloc(outsize);
+        if (!*out) state->error = 83; /*alloc fail*/
+    }
+    if (!state->error)
+    {
+        /*zero the output buffer before the scanlines are post-processed into it*/
+        for (i = 0; i < outsize; i++) (*out)[i] = 0;
+        state->error = postProcessScanlines(*out, scanlines.data, *w, *h, &state->info_png);
+    }
+    ucvector_cleanup(&scanlines);
+}
+
+/*
+Decodes a PNG from memory using the settings in state.
+The image is first decoded in the PNG's own color type; when state->decoder.color_convert is set
+and state->info_raw differs from that type, the pixels are converted into a new buffer.
+out: receives the allocated pixel buffer; w, h: receive the dimensions
+return value is error (also stored in state->error, except for the early return of 56)
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize)
+{
+    unsigned char* decoded;
+    size_t convertedsize;
+
+    *out = 0;
+    decodeGeneric(out, w, h, state, in, insize);
+    if (state->error) return state->error;
+
+    if (!state->decoder.color_convert)
+    {
+        /*no conversion wanted: store the info_png color settings on the info_raw so that the
+        info_raw still reflects what colortype the raw image has to the end user*/
+        state->error = lodepng_color_mode_copy(&state->info_raw, &state->info_png.color);
+        return state->error;
+    }
+
+    if (lodepng_color_mode_equal(&state->info_raw, &state->info_png.color))
+    {
+        /*same color type, no copying or converting of data needed*/
+        return state->error;
+    }
+
+    /*color conversion needed; sort of copy of the data*/
+    decoded = *out;
+
+    /*TODO: check if this works according to the statement in the documentation: "The converter can convert
+    from greyscale input color type, to 8-bit greyscale or greyscale with alpha"*/
+    if (state->info_raw.colortype != LCT_RGB && state->info_raw.colortype != LCT_RGBA
+        && state->info_raw.bitdepth != 8)
+    {
+        return 56; /*unsupported color mode conversion*/
+    }
+
+    convertedsize = lodepng_get_raw_size(*w, *h, &state->info_raw);
+    *out = (unsigned char*)lodepng_malloc(convertedsize);
+    if (*out)
+    {
+        state->error = lodepng_convert(*out, decoded, &state->info_raw,
+            &state->info_png.color, *w, *h);
+    }
+    else
+    {
+        state->error = 83; /*alloc fail*/
+    }
+    lodepng_free(decoded);
+
+    return state->error;
+}
+
+/*
+Decodes a PNG held in memory into the requested raw color type and bit depth,
+using a temporary state with otherwise default settings.
+return value is error
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in,
+    size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+{
+    LodePNGState tmpstate;
+    unsigned result;
+
+    lodepng_state_init(&tmpstate);
+    /*ask for the caller's output format*/
+    tmpstate.info_raw.bitdepth = bitdepth;
+    tmpstate.info_raw.colortype = colortype;
+
+    result = lodepng_decode(out, w, h, &tmpstate, in, insize);
+
+    lodepng_state_cleanup(&tmpstate);
+    return result;
+}
+
+/*Same as lodepng_decode_memory, but always decodes to 8-bit RGBA raw image*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+    const unsigned bitdepth = 8;
+    return lodepng_decode_memory(out, w, h, in, insize, LCT_RGBA, bitdepth);
+}
+
+/*Same as lodepng_decode_memory, but always decodes to 8-bit RGB raw image*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+    const unsigned bitdepth = 8;
+    return lodepng_decode_memory(out, w, h, in, insize, LCT_RGB, bitdepth);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Loads the PNG file with the given name into memory and decodes it with lodepng_decode_memory.
+return value is error, from either the file load or the decode
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename,
+    LodePNGColorType colortype, unsigned bitdepth)
+{
+    unsigned char* png = 0;
+    size_t pngsize;
+    unsigned error = lodepng_load_file(&png, &pngsize, filename);
+
+    if (error == 0)
+    {
+        error = lodepng_decode_memory(out, w, h, png, pngsize, colortype, bitdepth);
+    }
+
+    /*the file contents are no longer needed, whether decoding worked or not*/
+    lodepng_free(png);
+    return error;
+}
+
+/*Same as lodepng_decode_file, but always decodes to 8-bit RGBA raw image*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+    const unsigned bitdepth = 8;
+    return lodepng_decode_file(out, w, h, filename, LCT_RGBA, bitdepth);
+}
+
+/*Same as lodepng_decode_file, but always decodes to 8-bit RGB raw image*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+    const unsigned bitdepth = 8;
+    return lodepng_decode_file(out, w, h, filename, LCT_RGB, bitdepth);
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/*
+Fills in the default decoder settings: color conversion on, CRC checking on,
+text chunks read, unknown chunks not remembered, default zlib decompress settings.
+*/
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings)
+{
+    lodepng_decompress_settings_init(&settings->zlibsettings);
+    settings->ignore_crc = 0;
+    settings->color_convert = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    settings->remember_unknown_chunks = 0;
+    settings->read_text_chunks = 1;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+
+/*
+Initializes every part of the state: decoder and encoder settings (when compiled in),
+the raw color mode and the PNG info. The error field starts out as 1.
+*/
+void lodepng_state_init(LodePNGState* state)
+{
+    state->error = 1;
+
+#ifdef LODEPNG_COMPILE_DECODER
+    lodepng_decoder_settings_init(&state->decoder);
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+    lodepng_encoder_settings_init(&state->encoder);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+    lodepng_color_mode_init(&state->info_raw);
+    lodepng_info_init(&state->info_png);
+}
+
+/*Cleans up the two members of the state that have their own cleanup functions: info_raw and info_png*/
+void lodepng_state_cleanup(LodePNGState* state)
+{
+    LodePNGColorMode* raw = &state->info_raw;
+    LodePNGInfo* png = &state->info_png;
+
+    lodepng_color_mode_cleanup(raw);
+    lodepng_info_cleanup(png);
+}
+
+/*
+Makes dest a copy of source. dest is cleaned up first, then all fields are copied by value,
+after which info_raw and info_png are re-initialized and copied with their own copy functions.
+The result code of the copy is stored in dest->error.
+*/
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source)
+{
+    lodepng_state_cleanup(dest);
+    *dest = *source;
+
+    /*the struct assignment also copied info_raw and info_png by value: start those two over*/
+    lodepng_color_mode_init(&dest->info_raw);
+    lodepng_info_init(&dest->info_png);
+
+    dest->error = lodepng_color_mode_copy(&dest->info_raw, &source->info_raw);
+    if (!dest->error)
+    {
+        dest->error = lodepng_info_copy(&dest->info_png, &source->info_png);
+    }
+}
+
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Encoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Appends a chunk with the given type and data to the out vector.
+chunkName must be string of 4 characters
+return value is error, as given by lodepng_chunk_create
+*/
+static unsigned addChunk(ucvector* out, const char* chunkName, const unsigned char* data, size_t length)
+{
+    unsigned error = lodepng_chunk_create(&out->data, &out->size, (unsigned)length, chunkName, data);
+    if (error) return error;
+
+    /*lodepng_chunk_create worked on data and size directly: fix the allocsize again*/
+    out->allocsize = out->size;
+    return 0;
+}
+
+/*Appends the 8 bytes PNG signature, aka the magic bytes, to out*/
+static void writeSignature(ucvector* out)
+{
+    static const unsigned char signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+    size_t pos;
+
+    for (pos = 0; pos != sizeof(signature); ++pos)
+    {
+        ucvector_push_back(out, signature[pos]);
+    }
+}
+
+/*
+Builds the IHDR payload (width, height, bit depth, color type, compression method 0,
+filter method 0, interlace method) and appends it to out as an IHDR chunk.
+return value is error, as given by addChunk
+*/
+static unsigned addChunk_IHDR(ucvector* out, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth, unsigned interlace_method)
+{
+    unsigned result;
+    ucvector ihdr;
+
+    ucvector_init(&ihdr);
+
+    /*the two image dimensions, as 32-bit integers*/
+    lodepng_add32bitInt(&ihdr, w);
+    lodepng_add32bitInt(&ihdr, h);
+    /*five single-byte fields*/
+    ucvector_push_back(&ihdr, (unsigned char)bitdepth);
+    ucvector_push_back(&ihdr, (unsigned char)colortype);
+    ucvector_push_back(&ihdr, 0); /*compression method*/
+    ucvector_push_back(&ihdr, 0); /*filter method*/
+    ucvector_push_back(&ihdr, (unsigned char)interlace_method);
+
+    result = addChunk(out, "IHDR", ihdr.data, ihdr.size);
+    ucvector_cleanup(&ihdr);
+
+    return result;
+}
+
+/*
+Appends a PLTE chunk holding the R, G and B bytes of every palette entry of info;
+the alpha byte of each RGBA entry is left out.
+return value is error, as given by addChunk
+*/
+static unsigned addChunk_PLTE(ucvector* out, const LodePNGColorMode* info)
+{
+    unsigned result;
+    size_t entry;
+    ucvector rgb;
+
+    ucvector_init(&rgb);
+    for (entry = 0; entry != info->palettesize; ++entry)
+    {
+        /*add all channels except alpha channel*/
+        const unsigned char* rgba = &info->palette[4 * entry];
+        ucvector_push_back(&rgb, rgba[0]);
+        ucvector_push_back(&rgb, rgba[1]);
+        ucvector_push_back(&rgb, rgba[2]);
+    }
+
+    result = addChunk(out, "PLTE", rgb.data, rgb.size);
+    ucvector_cleanup(&rgb);
+
+    return result;
+}
+
+/*
+Appends a tRNS chunk for the color mode info.
+Palette: the alpha values of the palette entries, without the trailing run of fully opaque ones.
+Greyscale / RGB: the color key (when defined) as 16-bit big endian values, one per channel.
+For other color types, or without a key, the chunk is written with an empty payload.
+return value is error, as given by addChunk
+*/
+static unsigned addChunk_tRNS(ucvector* out, const LodePNGColorMode* info)
+{
+    unsigned result;
+    size_t k;
+    ucvector payload;
+
+    ucvector_init(&payload);
+
+    if (info->colortype == LCT_PALETTE)
+    {
+        /*the tail of palette values that all have 255 as alpha, does not have to be encoded*/
+        size_t count = info->palettesize;
+        while (count != 0 && info->palette[4 * (count - 1) + 3] == 255) --count;
+
+        /*add only alpha channel*/
+        for (k = 0; k != count; ++k) ucvector_push_back(&payload, info->palette[4 * k + 3]);
+    }
+    else if (info->key_defined && (info->colortype == LCT_GREY || info->colortype == LCT_RGB))
+    {
+        /*greyscale stores a single key value, taken from key_r*/
+        ucvector_push_back(&payload, (unsigned char)(info->key_r >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->key_r & 255));
+        if (info->colortype == LCT_RGB)
+        {
+            ucvector_push_back(&payload, (unsigned char)(info->key_g >> 8));
+            ucvector_push_back(&payload, (unsigned char)(info->key_g & 255));
+            ucvector_push_back(&payload, (unsigned char)(info->key_b >> 8));
+            ucvector_push_back(&payload, (unsigned char)(info->key_b & 255));
+        }
+    }
+
+    result = addChunk(out, "tRNS", payload.data, payload.size);
+    ucvector_cleanup(&payload);
+
+    return result;
+}
+
+/*
+Compresses data with zlib_compress and appends the result to out as a single IDAT chunk.
+return value is error, from either the compression or addChunk
+*/
+static unsigned addChunk_IDAT(ucvector* out, const unsigned char* data, size_t datasize,
+    LodePNGCompressSettings* zlibsettings)
+{
+    unsigned result;
+    ucvector packed;
+
+    ucvector_init(&packed);
+
+    /*compress with the Zlib compressor*/
+    result = zlib_compress(&packed.data, &packed.size, data, datasize, zlibsettings);
+    if (result == 0)
+    {
+        result = addChunk(out, "IDAT", packed.data, packed.size);
+    }
+
+    ucvector_cleanup(&packed);
+    return result;
+}
+
+static unsigned addChunk_IEND(ucvector* out)
+{
+    unsigned error = 0;
+    error = addChunk(out, "IEND", 0, 0);
+    return error;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*
+Appends a tEXt chunk: the keyword, a 0 byte, then the text (without terminator).
+keyword and textstring are 0-terminated C strings; the keyword must be 1 to 79 characters long
+return value is error: 0 = ok, 89 = invalid keyword size, or an error from addChunk
+*/
+static unsigned addChunk_tEXt(ucvector* out, const char* keyword, const char* textstring)
+{
+    unsigned error = 0;
+    size_t i;
+    ucvector text;
+    ucvector_init(&text);
+    for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)keyword[i]);
+    if (i < 1 || i > 79)
+    {
+        /*release the keyword bytes pushed so far; returning without this leaked them*/
+        ucvector_cleanup(&text);
+        return 89; /*error: invalid keyword size*/
+    }
+    ucvector_push_back(&text, 0); /*0 termination char*/
+    for (i = 0; textstring[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)textstring[i]);
+    error = addChunk(out, "tEXt", text.data, text.size);
+    ucvector_cleanup(&text);
+
+    return error;
+}
+
+/*
+Appends a zTXt chunk: the keyword, a 0 byte, compression method 0, then the zlib-compressed text.
+keyword and textstring are 0-terminated C strings; the keyword must be 1 to 79 characters long
+return value is error: 0 = ok, 89 = invalid keyword size, or an error from zlib_compress or addChunk
+*/
+static unsigned addChunk_zTXt(ucvector* out, const char* keyword, const char* textstring,
+    LodePNGCompressSettings* zlibsettings)
+{
+    unsigned error = 0;
+    ucvector data, compressed;
+    size_t i, textsize = strlen(textstring);
+
+    ucvector_init(&data);
+    ucvector_init(&compressed);
+    for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+    if (i < 1 || i > 79)
+    {
+        /*release the keyword bytes pushed so far; returning without this leaked them*/
+        ucvector_cleanup(&compressed);
+        ucvector_cleanup(&data);
+        return 89; /*error: invalid keyword size*/
+    }
+    ucvector_push_back(&data, 0); /*0 termination char*/
+    ucvector_push_back(&data, 0); /*compression method: 0*/
+
+    error = zlib_compress(&compressed.data, &compressed.size,
+        (unsigned char*)textstring, textsize, zlibsettings);
+    if (!error)
+    {
+        /*the compressed text follows the two header bytes directly*/
+        for (i = 0; i != compressed.size; ++i) ucvector_push_back(&data, compressed.data[i]);
+        error = addChunk(out, "zTXt", data.data, data.size);
+    }
+
+    ucvector_cleanup(&compressed);
+    ucvector_cleanup(&data);
+    return error;
+}
+
+/*
+Appends an "iTXt" chunk to out. The chunk data is: keyword, 0 byte, compression
+flag, compression method (0), langtag, 0 byte, transkey, 0 byte, and then the
+text, which is either stored as is or compressed with zlib_compress.
+compressed: non-zero to compress the text
+keyword: 0-terminated string, must be 1 to 79 characters long
+langtag, transkey, textstring: 0-terminated strings
+zlibsettings: settings passed on to zlib_compress (only used when compressed)
+Returns 0 on success, 89 if the keyword length is invalid, otherwise the error
+code of zlib_compress or addChunk.
+*/
+static unsigned addChunk_iTXt(ucvector* out, unsigned compressed, const char* keyword, const char* langtag,
+    const char* transkey, const char* textstring, LodePNGCompressSettings* zlibsettings)
+{
+    unsigned error = 0;
+    ucvector data;
+    size_t i, textsize = strlen(textstring);
+    size_t keysize = strlen(keyword);
+
+    /*validate before allocating anything, so the error path has nothing to release*/
+    if (keysize < 1 || keysize > 79) return 89; /*error: invalid keyword size*/
+
+    ucvector_init(&data);
+
+    for (i = 0; i != keysize; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+    ucvector_push_back(&data, 0); /*null termination char*/
+    ucvector_push_back(&data, compressed ? 1 : 0); /*compression flag*/
+    ucvector_push_back(&data, 0); /*compression method*/
+    for (i = 0; langtag[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)langtag[i]);
+    ucvector_push_back(&data, 0); /*null termination char*/
+    for (i = 0; transkey[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)transkey[i]);
+    ucvector_push_back(&data, 0); /*null termination char*/
+
+    if (compressed)
+    {
+        ucvector compressed_data;
+        ucvector_init(&compressed_data);
+        error = zlib_compress(&compressed_data.data, &compressed_data.size,
+            (unsigned char*)textstring, textsize, zlibsettings);
+        if (!error)
+        {
+            for (i = 0; i != compressed_data.size; ++i) ucvector_push_back(&data, compressed_data.data[i]);
+        }
+        ucvector_cleanup(&compressed_data);
+    }
+    else /*not compressed*/
+    {
+        for (i = 0; textstring[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)textstring[i]);
+    }
+
+    /*the chunk is only written when compression (if requested) succeeded*/
+    if (!error) error = addChunk(out, "iTXt", data.data, data.size);
+    ucvector_cleanup(&data);
+    return error;
+}
+
+/*
+Appends a "bKGD" chunk to out, built from the background_r/g/b fields of info.
+The layout depends on info->color.colortype:
+- grey or grey+alpha: background_r as 2 bytes, high byte first
+- RGB or RGBA: background_r, background_g, background_b as 2 bytes each, high byte first
+- palette: the low byte of background_r (the palette index)
+Any other color type produces a chunk without data.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_bKGD(ucvector* out, const LodePNGInfo* info)
+{
+    unsigned error;
+    ucvector payload;
+
+    ucvector_init(&payload);
+
+    switch (info->color.colortype)
+    {
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+        ucvector_push_back(&payload, (unsigned char)(info->background_r >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->background_r & 255));
+        break;
+    case LCT_RGB:
+    case LCT_RGBA:
+        ucvector_push_back(&payload, (unsigned char)(info->background_r >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->background_r & 255));
+        ucvector_push_back(&payload, (unsigned char)(info->background_g >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->background_g & 255));
+        ucvector_push_back(&payload, (unsigned char)(info->background_b >> 8));
+        ucvector_push_back(&payload, (unsigned char)(info->background_b & 255));
+        break;
+    case LCT_PALETTE:
+        ucvector_push_back(&payload, (unsigned char)(info->background_r & 255)); /*palette index*/
+        break;
+    default:
+        break; /*nothing is added for other color types*/
+    }
+
+    error = addChunk(out, "bKGD", payload.data, payload.size);
+    ucvector_cleanup(&payload);
+
+    return error;
+}
+
+/*
+Appends a "tIME" chunk to out. The 7 data bytes are: year (2 bytes, high byte
+first), month, day, hour, minute, second, each taken from time.
+Returns 83 if the temporary buffer cannot be allocated, otherwise the error
+code of addChunk.
+*/
+static unsigned addChunk_tIME(ucvector* out, const LodePNGTime* time)
+{
+    unsigned error;
+    unsigned char* cursor;
+    unsigned char* payload = (unsigned char*)lodepng_malloc(7);
+
+    if (payload == 0) return 83; /*alloc fail*/
+
+    cursor = payload;
+    *cursor++ = (unsigned char)(time->year >> 8);
+    *cursor++ = (unsigned char)(time->year & 255);
+    *cursor++ = (unsigned char)time->month;
+    *cursor++ = (unsigned char)time->day;
+    *cursor++ = (unsigned char)time->hour;
+    *cursor++ = (unsigned char)time->minute;
+    *cursor = (unsigned char)time->second;
+
+    error = addChunk(out, "tIME", payload, 7);
+    lodepng_free(payload);
+    return error;
+}
+
+/*
+Appends a "pHYs" chunk to out. The chunk data is info->phys_x and info->phys_y,
+each written with lodepng_add32bitInt, followed by the info->phys_unit byte.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_pHYs(ucvector* out, const LodePNGInfo* info)
+{
+    ucvector payload;
+    unsigned error;
+
+    ucvector_init(&payload);
+    lodepng_add32bitInt(&payload, info->phys_x);
+    lodepng_add32bitInt(&payload, info->phys_y);
+    ucvector_push_back(&payload, info->phys_unit);
+
+    error = addChunk(out, "pHYs", payload.data, payload.size);
+
+    ucvector_cleanup(&payload);
+    return error;
+}
+
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Applies one filter type to a single scanline.
+out: receives the filtered bytes
+scanline: the unfiltered bytes of the current row
+prevline: the unfiltered bytes of the previous row, or 0 if there is none; in that
+  case the result is the same as filtering against a row of zeroes
+length: number of bytes in the scanline
+bytewidth: distance in bytes to the "left" byte used by filter types 1, 3 and 4
+filterType: 0 = None, 1 = Sub, 2 = Up, 3 = Average, 4 = Paeth. For any other
+  value nothing is written to out.
+*/
+static void filterScanline(unsigned char* out, const unsigned char* scanline, const unsigned char* prevline,
+    size_t length, size_t bytewidth, unsigned char filterType)
+{
+    size_t pos;
+
+    if (filterType == 0) /*None: plain copy*/
+    {
+        for (pos = 0; pos != length; ++pos) out[pos] = scanline[pos];
+    }
+    else if (filterType == 1) /*Sub: difference with the byte to the left*/
+    {
+        for (pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+        for (pos = bytewidth; pos < length; ++pos) out[pos] = scanline[pos] - scanline[pos - bytewidth];
+    }
+    else if (filterType == 2) /*Up: difference with the byte above*/
+    {
+        if (!prevline)
+        {
+            for (pos = 0; pos != length; ++pos) out[pos] = scanline[pos];
+        }
+        else
+        {
+            for (pos = 0; pos != length; ++pos) out[pos] = scanline[pos] - prevline[pos];
+        }
+    }
+    else if (filterType == 3) /*Average: difference with the mean of left and above*/
+    {
+        if (!prevline)
+        {
+            for (pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+            for (pos = bytewidth; pos < length; ++pos) out[pos] = scanline[pos] - (scanline[pos - bytewidth] >> 1);
+        }
+        else
+        {
+            for (pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos] - (prevline[pos] >> 1);
+            for (pos = bytewidth; pos < length; ++pos)
+            {
+                out[pos] = scanline[pos] - ((scanline[pos - bytewidth] + prevline[pos]) >> 1);
+            }
+        }
+    }
+    else if (filterType == 4) /*Paeth: difference with the Paeth predictor*/
+    {
+        if (!prevline)
+        {
+            for (pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+            /*paethPredictor(scanline[pos - bytewidth], 0, 0) is always scanline[pos - bytewidth]*/
+            for (pos = bytewidth; pos < length; ++pos) out[pos] = (scanline[pos] - scanline[pos - bytewidth]);
+        }
+        else
+        {
+            /*paethPredictor(0, prevline[pos], 0) is always prevline[pos]*/
+            for (pos = 0; pos != bytewidth; ++pos) out[pos] = (scanline[pos] - prevline[pos]);
+            for (pos = bytewidth; pos < length; ++pos)
+            {
+                out[pos] = (scanline[pos] - paethPredictor(scanline[pos - bytewidth], prevline[pos], prevline[pos - bytewidth]));
+            }
+        }
+    }
+    /*any other filter type does not exist: out is left untouched*/
+}
+
+/*
+log2 approximation. A slight bit faster than std::log.
+The argument is first scaled down: divided by 16 while it is above 32 (adding 4
+to the result each time), then divided by 2 while it is above 2 (adding 1 each
+time). The remainder is evaluated with a cubic polynomial in f.
+*/
+static float flog2(float f)
+{
+    float whole = 0;
+    for (; f > 32; f /= 16) whole += 4;
+    for (; f > 2; f /= 2) ++whole;
+    return whole + 1.442695f * (f * f * f / 3 - 3 * f * f / 2 + 3 * f - 1.83333f);
+}
+
+/*
+Filters h scanlines of w pixels from in into out, choosing the filter type of
+each scanline according to the filter strategy in settings.
+out: receives, for every scanline, one filter type byte followed by the filtered
+  bytes of that scanline
+in: the unfiltered scanlines, each (w * bpp + 7) / 8 bytes
+info: color mode, used to get the bits per pixel
+settings: provides filter_strategy, filter_palette_zero, predefined_filters and
+  (for the brute force strategy) zlibsettings
+Returns 0 on success, 31 if the color mode has 0 bits per pixel, 83 if a
+temporary buffer cannot be allocated, 88 if the filter strategy is unknown.
+*/
+static unsigned filter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h,
+    const LodePNGColorMode* info, const LodePNGEncoderSettings* settings)
+{
+    /*
+    For PNG filter method 0
+    out must be a buffer with as size: h + (w * h * bpp + 7) / 8, because there are
+    the scanlines with 1 extra byte per scanline
+    */
+
+    unsigned bpp = lodepng_get_bpp(info);
+    /*the width of a scanline in bytes, not including the filter type*/
+    size_t linebytes = (w * bpp + 7) / 8;
+    /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+    size_t bytewidth = (bpp + 7) / 8;
+    const unsigned char* prevline = 0;
+    unsigned x, y;
+    unsigned error = 0;
+    LodePNGFilterStrategy strategy = settings->filter_strategy;
+
+    /*
+    There is a heuristic called the minimum sum of absolute differences heuristic, suggested by the PNG standard:
+    *  If the image type is Palette, or the bit depth is smaller than 8, then do not filter the image (i.e.
+    use fixed filtering, with the filter None).
+    * (The other case) If the image type is Grayscale or RGB (with or without Alpha), and the bit depth is
+    not smaller than 8, then use adaptive filtering heuristic as follows: independently for each row, apply
+    all five filters and select the filter that produces the smallest sum of absolute values per row.
+    This heuristic is used if filter strategy is LFS_MINSUM and filter_palette_zero is true.
+
+    If filter_palette_zero is true and filter_strategy is not LFS_MINSUM, the above heuristic is followed,
+    but for "the other case", whatever strategy filter_strategy is set to instead of the minimum sum
+    heuristic is used.
+    */
+    if (settings->filter_palette_zero &&
+        (info->colortype == LCT_PALETTE || info->bitdepth < 8)) strategy = LFS_ZERO;
+
+    if (bpp == 0) return 31; /*error: invalid color type*/
+
+    if (strategy == LFS_ZERO)
+    {
+        for (y = 0; y != h; ++y)
+        {
+            size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+            size_t inindex = linebytes * y;
+            out[outindex] = 0; /*filter type byte*/
+            filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, 0);
+            prevline = &in[inindex];
+        }
+    }
+    else if (strategy == LFS_MINSUM)
+    {
+        /*adaptive filtering*/
+        size_t sum[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        size_t smallest = 0;
+        unsigned char type, bestType = 0;
+
+        for (type = 0; type != 5; ++type)
+        {
+            attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+            /*alloc fail: only record it and keep going, so that every slot is assigned
+            and all buffers can be released below instead of being leaked*/
+            if (!attempt[type]) error = 83;
+        }
+
+        if (!error)
+        {
+            for (y = 0; y != h; ++y)
+            {
+                /*try the 5 filter types*/
+                for (type = 0; type != 5; ++type)
+                {
+                    filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+
+                    /*calculate the sum of the result*/
+                    sum[type] = 0;
+                    if (type == 0)
+                    {
+                        for (x = 0; x != linebytes; ++x) sum[type] += (unsigned char)(attempt[type][x]);
+                    }
+                    else
+                    {
+                        for (x = 0; x != linebytes; ++x)
+                        {
+                            /*For differences, each byte should be treated as signed, values above 127 are negative
+                            (converted to signed char). Filtertype 0 isn't a difference though, so use unsigned there.
+                            This means filtertype 0 is almost never chosen, but that is justified.*/
+                            unsigned char s = attempt[type][x];
+                            sum[type] += s < 128 ? s : (255U - s);
+                        }
+                    }
+
+                    /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+                    if (type == 0 || sum[type] < smallest)
+                    {
+                        bestType = type;
+                        smallest = sum[type];
+                    }
+                }
+
+                prevline = &in[y * linebytes];
+
+                /*now fill the out values*/
+                out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+                for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+            }
+        }
+
+        for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+    }
+    else if (strategy == LFS_ENTROPY)
+    {
+        float sum[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        float smallest = 0;
+        unsigned type, bestType = 0;
+        unsigned count[256];
+
+        for (type = 0; type != 5; ++type)
+        {
+            attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+            /*alloc fail: record it, the buffers are released below*/
+            if (!attempt[type]) error = 83;
+        }
+
+        if (!error)
+        {
+            for (y = 0; y != h; ++y)
+            {
+                /*try the 5 filter types*/
+                for (type = 0; type != 5; ++type)
+                {
+                    filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+                    for (x = 0; x != 256; ++x) count[x] = 0;
+                    for (x = 0; x != linebytes; ++x) ++count[attempt[type][x]];
+                    ++count[type]; /*the filter type itself is part of the scanline*/
+                    sum[type] = 0;
+                    for (x = 0; x != 256; ++x)
+                    {
+                        float p = count[x] / (float)(linebytes + 1);
+                        sum[type] += count[x] == 0 ? 0 : flog2(1 / p) * p;
+                    }
+                    /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+                    if (type == 0 || sum[type] < smallest)
+                    {
+                        bestType = type;
+                        smallest = sum[type];
+                    }
+                }
+
+                prevline = &in[y * linebytes];
+
+                /*now fill the out values*/
+                out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+                for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+            }
+        }
+
+        for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+    }
+    else if (strategy == LFS_PREDEFINED)
+    {
+        for (y = 0; y != h; ++y)
+        {
+            size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+            size_t inindex = linebytes * y;
+            unsigned char type = settings->predefined_filters[y];
+            out[outindex] = type; /*filter type byte*/
+            filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, type);
+            prevline = &in[inindex];
+        }
+    }
+    else if (strategy == LFS_BRUTE_FORCE)
+    {
+        /*brute force filter chooser.
+        deflate the scanline after every filter attempt to see which one deflates best.
+        This is very slow and gives only slightly smaller, sometimes even larger, result*/
+        size_t size[5];
+        unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+        size_t smallest = 0;
+        unsigned type = 0, bestType = 0;
+        unsigned char* dummy;
+        LodePNGCompressSettings zlibsettings = settings->zlibsettings;
+        /*use fixed tree on the attempts so that the tree is not adapted to the filtertype on purpose,
+        to simulate the true case where the tree is the same for the whole image. Sometimes it gives
+        better result with dynamic tree anyway. Using the fixed tree sometimes gives worse, but in rare
+        cases better compression. It does make this a bit less slow, so it's worth doing this.*/
+        zlibsettings.btype = 1;
+        /*a custom encoder likely doesn't read the btype setting and is optimized for complete PNG
+        images only, so disable it*/
+        zlibsettings.custom_zlib = 0;
+        zlibsettings.custom_deflate = 0;
+        for (type = 0; type != 5; ++type)
+        {
+            attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+            /*alloc fail: record it, the buffers are released below*/
+            if (!attempt[type]) error = 83;
+        }
+        if (!error)
+        {
+            for (y = 0; y != h; ++y) /*try the 5 filter types*/
+            {
+                for (type = 0; type != 5; ++type)
+                {
+                    size_t testsize = linebytes;
+                    /*if(testsize > 8) testsize /= 8;*/ /*it already works good enough by testing a part of the row*/
+
+                    filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+                    size[type] = 0;
+                    dummy = 0;
+                    /*only the compressed size matters here; the error code of this trial run is not used*/
+                    zlib_compress(&dummy, &size[type], attempt[type], testsize, &zlibsettings);
+                    lodepng_free(dummy);
+                    /*check if this is smallest size (or if type == 0 it's the first case so always store the values)*/
+                    if (type == 0 || size[type] < smallest)
+                    {
+                        bestType = type;
+                        smallest = size[type];
+                    }
+                }
+                prevline = &in[y * linebytes];
+                out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+                for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+            }
+        }
+        for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+    }
+    else return 88; /* unknown filter strategy */
+
+    return error;
+}
+
+/*
+The opposite of the removePaddingBits function.
+Copies h scanlines of ilinebits bits each from in to out, and writes
+(olinebits - ilinebits) zero bits after every scanline, so that each scanline
+in out takes olinebits bits. olinebits must be >= ilinebits.
+*/
+static void addPaddingBits(unsigned char* out, const unsigned char* in,
+    size_t olinebits, size_t ilinebits, unsigned h)
+{
+    const size_t padbits = olinebits - ilinebits;
+    size_t writepos = 0; /*bit pointer into out*/
+    size_t readpos = 0; /*bit pointer into in*/
+    unsigned row;
+
+    for (row = 0; row != h; ++row)
+    {
+        size_t n;
+
+        /*copy the image bits of this scanline*/
+        for (n = 0; n < ilinebits; ++n)
+        {
+            unsigned char bit = readBitFromReversedStream(&readpos, in);
+            setBitOfReversedStream(&writepos, out, bit);
+        }
+
+        /*write actual zero bits instead of just skipping the padding, to avoid
+        "Use of uninitialised value of size ###" warning from valgrind*/
+        for (n = 0; n != padbits; ++n) setBitOfReversedStream(&writepos, out, 0);
+    }
+}
+
+/*
+Re-orders the pixels of a non-interlaced image into the 7 reduced images
+(passes) of PNG's Adam7 interlacing.
+in: non-interlaced image with size w*h. It has no padding bits at all, not
+  between scanlines either, so its size in bits is w * h * bpp.
+out: the same pixels, but re-ordered according to PNG's Adam7 interlacing, with
+  no padding bits between scanlines, but between reduced images so that each
+  reduced image starts at a byte (pass i starts at byte passstart[i]). out is
+  possibly bigger than in due to those padding bits between reduced images.
+bpp: bits per pixel
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_interlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned i;
+
+    /*get the width, height and start offsets of each pass; only passw, passh and passstart are used here*/
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    if (bpp >= 8)
+    {
+        /*whole bytes per pixel: copy bytewidth bytes for every pixel of every pass*/
+        for (i = 0; i != 7; ++i)
+        {
+            unsigned x, y, b;
+            size_t bytewidth = bpp / 8;
+            for (y = 0; y < passh[i]; ++y)
+                for (x = 0; x < passw[i]; ++x)
+                {
+                    /*byte offset of the pixel in the full image: row ADAM7_IY[i] + y * ADAM7_DY[i], column ADAM7_IX[i] + x * ADAM7_DX[i]*/
+                    size_t pixelinstart = ((ADAM7_IY[i] + y * ADAM7_DY[i]) * w + ADAM7_IX[i] + x * ADAM7_DX[i]) * bytewidth;
+                    /*byte offset of the pixel in the output: position (x, y) inside pass i*/
+                    size_t pixeloutstart = passstart[i] + (y * passw[i] + x) * bytewidth;
+                    for (b = 0; b < bytewidth; ++b)
+                    {
+                        out[pixeloutstart + b] = in[pixelinstart + b];
+                    }
+                }
+        }
+    }
+    else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/
+    {
+        for (i = 0; i != 7; ++i)
+        {
+            unsigned x, y, b;
+            unsigned ilinebits = bpp * passw[i]; /*bits per scanline of pass i*/
+            unsigned olinebits = bpp * w; /*bits per scanline of the full image*/
+            size_t obp, ibp; /*bit pointers (for out and in buffer)*/
+            for (y = 0; y < passh[i]; ++y)
+                for (x = 0; x < passw[i]; ++x)
+                {
+                    /*bit offset of the pixel in the full image*/
+                    ibp = (ADAM7_IY[i] + y * ADAM7_DY[i]) * olinebits + (ADAM7_IX[i] + x * ADAM7_DX[i]) * bpp;
+                    /*bit offset of the pixel in the output; passstart[i] is in bytes, hence the factor 8*/
+                    obp = (8 * passstart[i]) + (y * ilinebits + x * bpp);
+                    /*copy the pixel bit by bit; both bit pointers are advanced by the helpers*/
+                    for (b = 0; b < bpp; ++b)
+                    {
+                        unsigned char bit = readBitFromReversedStream(&ibp, in);
+                        setBitOfReversedStream(&obp, out, bit);
+                    }
+                }
+        }
+    }
+}
+
+/*
+Converts the pure 2D image with the PNG's colortype into filtered-padded-interlaced
+data, ready to be compressed into the IDAT chunk.
+out: receives a newly allocated buffer with the result; the caller must free it
+  with lodepng_free, also when an error is returned
+outsize: receives the size of that buffer in bytes
+in: must contain the full image
+w, h: size of the image in pixels
+info_png: provides the color mode and the interlace method
+settings: encoder settings, passed on to filter
+Returns 0 on success, 83 if an allocation fails, otherwise the error code of filter.
+A zero-sized buffer for which the allocator returns a null pointer is not
+treated as an allocation failure.
+*/
+static unsigned preProcessScanlines(unsigned char** out, size_t* outsize, const unsigned char* in,
+    unsigned w, unsigned h,
+    const LodePNGInfo* info_png, const LodePNGEncoderSettings* settings)
+{
+    /*
+    Steps:
+    *) if no Adam7: 1) add padding bits (= possible extra bits per scanline if bpp < 8) 2) filter
+    *) if adam7: 1) Adam7_interlace 2) 7x add padding bits 3) 7x filter
+    */
+    unsigned bpp = lodepng_get_bpp(&info_png->color);
+    unsigned error = 0;
+
+    if (info_png->interlace_method == 0)
+    {
+        *outsize = h + (h * ((w * bpp + 7) / 8)); /*image size plus an extra byte per scanline + possible padding bits*/
+        *out = (unsigned char*)lodepng_malloc(*outsize);
+        if (!(*out) && (*outsize)) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            /*non multiple of 8 bits per scanline, padding bits needed per scanline*/
+            if (bpp < 8 && w * bpp != ((w * bpp + 7) / 8) * 8)
+            {
+                size_t paddedsize = h * ((w * bpp + 7) / 8);
+                unsigned char* padded = (unsigned char*)lodepng_malloc(paddedsize);
+                if (!padded && paddedsize) error = 83; /*alloc fail*/
+                if (!error)
+                {
+                    addPaddingBits(padded, in, ((w * bpp + 7) / 8) * 8, w * bpp, h);
+                    error = filter(*out, padded, w, h, &info_png->color, settings);
+                }
+                lodepng_free(padded);
+            }
+            else
+            {
+                /*we can immediately filter into the out buffer, no other steps needed*/
+                error = filter(*out, in, w, h, &info_png->color, settings);
+            }
+        }
+    }
+    else /*interlace_method is 1 (Adam7)*/
+    {
+        unsigned passw[7], passh[7];
+        size_t filter_passstart[8], padded_passstart[8], passstart[8];
+        unsigned char* adam7;
+
+        Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+        *outsize = filter_passstart[7]; /*image size plus an extra byte per scanline + possible padding bits*/
+        *out = (unsigned char*)lodepng_malloc(*outsize);
+        if (!(*out) && (*outsize)) error = 83; /*alloc fail*/
+
+        adam7 = (unsigned char*)lodepng_malloc(passstart[7]);
+        if (!adam7 && passstart[7]) error = 83; /*alloc fail*/
+
+        if (!error)
+        {
+            unsigned i;
+
+            Adam7_interlace(adam7, in, w, h, bpp);
+            for (i = 0; i != 7; ++i)
+            {
+                if (bpp < 8)
+                {
+                    /*a pass can be empty, in which case its padded buffer has size 0*/
+                    size_t paddedsize = padded_passstart[i + 1] - padded_passstart[i];
+                    unsigned char* padded = (unsigned char*)lodepng_malloc(paddedsize);
+                    if (!padded && paddedsize)
+                    {
+                        error = 83; /*alloc fail*/
+                        break;
+                    }
+                    addPaddingBits(padded, &adam7[passstart[i]],
+                        ((passw[i] * bpp + 7) / 8) * 8, passw[i] * bpp, passh[i]);
+                    error = filter(&(*out)[filter_passstart[i]], padded,
+                        passw[i], passh[i], &info_png->color, settings);
+                    lodepng_free(padded);
+                }
+                else
+                {
+                    error = filter(&(*out)[filter_passstart[i]], &adam7[padded_passstart[i]],
+                        passw[i], passh[i], &info_png->color, settings);
+                }
+
+                if (error) break;
+            }
+        }
+
+        lodepng_free(adam7);
+    }
+
+    return error;
+}
+
+/*
+palette must have 4 * palettesize bytes allocated, and given in format RGBARGBARGBARGBA...
+returns 0 if the palette is opaque (every alpha value is 255),
+returns 1 if the palette has a single color with alpha 0, all other colors are
+  opaque and none of them has the same RGB value ==> color key
+returns 2 if the palette is semi-translucent: an alpha value other than 0 or 255,
+  more than one color with alpha 0, or an opaque color with the key's RGB value.
+*/
+static unsigned getPaletteTranslucency(const unsigned char* palette, size_t palettesize)
+{
+    size_t i;
+    size_t keyindex = 0; /*index of the color with alpha 0, only valid when key is set*/
+    unsigned key = 0;
+
+    /*first pass: classify the alpha values and locate the color key candidate*/
+    for (i = 0; i != palettesize; ++i)
+    {
+        unsigned char alpha = palette[4 * i + 3];
+        if (alpha == 255) continue;
+        /*partially transparent, or a second fully transparent color*/
+        if (alpha != 0 || key) return 2;
+        key = 1;
+        keyindex = i;
+    }
+    if (!key) return 0;
+
+    /*second pass: when key, no opaque RGB may have key's RGB. The key entry itself is skipped.*/
+    for (i = 0; i != palettesize; ++i)
+    {
+        if (i == keyindex) continue;
+        if (palette[4 * i + 0] == palette[4 * keyindex + 0]
+            && palette[4 * i + 1] == palette[4 * keyindex + 1]
+            && palette[4 * i + 2] == palette[4 * keyindex + 2]) return 2;
+    }
+    return 1;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+Appends every chunk stored back to back in data (datasize bytes in total) to out,
+using lodepng_chunk_append. Returns 0 on success, or the first error reported
+by lodepng_chunk_append.
+*/
+static unsigned addUnknownChunks(ucvector* out, unsigned char* data, size_t datasize)
+{
+    unsigned char* chunk;
+    for (chunk = data; (size_t)(chunk - data) < datasize; chunk = lodepng_chunk_next(chunk))
+    {
+        CERROR_TRY_RETURN(lodepng_chunk_append(&out->data, &out->size, chunk));
+        /*the append changed data and size directly, so fix the allocsize again*/
+        out->allocsize = out->size;
+    }
+    return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Encodes a raw image into a complete PNG stream in memory.
+out: receives a newly allocated buffer with the PNG data. It is also assigned
+  when an error is returned (it may then hold an incomplete stream or be 0),
+  so the caller must always release it.
+outsize: receives the size of that buffer in bytes
+image: the raw pixels, in the color mode described by state->info_raw
+w, h: size of the image in pixels
+state: provides the raw color mode (info_raw), the wanted PNG settings (info_png)
+  and the encoder settings (encoder); state->error receives the result
+Returns state->error: 0 on success, otherwise the code of the first error that
+was detected.
+*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGState* state)
+{
+    LodePNGInfo info;
+    ucvector outv;
+    unsigned char* data = 0; /*uncompressed version of the IDAT chunk data*/
+    size_t datasize = 0;
+
+    /*provide some proper output values if error will happen*/
+    *out = 0;
+    *outsize = 0;
+    state->error = 0;
+
+    /*check input values validity*/
+    if ((state->info_png.color.colortype == LCT_PALETTE || state->encoder.force_palette)
+        && (state->info_png.color.palettesize == 0 || state->info_png.color.palettesize > 256))
+    {
+        CERROR_RETURN_ERROR(state->error, 68); /*invalid palette size, it is only allowed to be 1-256*/
+    }
+    if (state->encoder.zlibsettings.btype > 2)
+    {
+        CERROR_RETURN_ERROR(state->error, 61); /*error: unexisting btype*/
+    }
+    if (state->info_png.interlace_method > 1)
+    {
+        CERROR_RETURN_ERROR(state->error, 71); /*error: unexisting interlace mode*/
+    }
+    state->error = checkColorValidity(state->info_png.color.colortype, state->info_png.color.bitdepth);
+    if (state->error) return state->error; /*error: unexisting color type given*/
+    state->error = checkColorValidity(state->info_raw.colortype, state->info_raw.bitdepth);
+    if (state->error) return state->error; /*error: unexisting color type given*/
+
+    /* color convert and compute scanline filter types */
+    lodepng_info_init(&info);
+    lodepng_info_copy(&info, &state->info_png);
+    if (state->encoder.auto_convert)
+    {
+        state->error = lodepng_auto_choose_color(&info.color, image, w, h, &state->info_raw);
+    }
+    if (!state->error)
+    {
+        if (!lodepng_color_mode_equal(&state->info_raw, &info.color))
+        {
+            unsigned char* converted;
+            /*widen before multiplying, so that w * h cannot wrap around in unsigned arithmetic*/
+            size_t size = ((size_t)w * (size_t)h * (size_t)lodepng_get_bpp(&info.color) + 7) / 8;
+
+            converted = (unsigned char*)lodepng_malloc(size);
+            if (!converted && size) state->error = 83; /*alloc fail*/
+            if (!state->error)
+            {
+                state->error = lodepng_convert(converted, image, &info.color, &state->info_raw, w, h);
+            }
+            /*keep the error of the preprocessing: on failure data may be missing or incomplete
+            and must not be handed to the compressor*/
+            if (!state->error)
+            {
+                state->error = preProcessScanlines(&data, &datasize, converted, w, h, &info, &state->encoder);
+            }
+            lodepng_free(converted);
+        }
+        else state->error = preProcessScanlines(&data, &datasize, image, w, h, &info, &state->encoder);
+    }
+
+    /* output all PNG chunks */
+    ucvector_init(&outv);
+    while (!state->error) /*while only executed once, to break on error*/
+    {
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        size_t i;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*write signature and chunks*/
+        writeSignature(&outv);
+        /*IHDR*/
+        addChunk_IHDR(&outv, w, h, info.color.colortype, info.color.bitdepth, info.interlace_method);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*unknown chunks between IHDR and PLTE*/
+        if (info.unknown_chunks_data[0])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[0], info.unknown_chunks_size[0]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*PLTE*/
+        if (info.color.colortype == LCT_PALETTE)
+        {
+            addChunk_PLTE(&outv, &info.color);
+        }
+        if (state->encoder.force_palette && (info.color.colortype == LCT_RGB || info.color.colortype == LCT_RGBA))
+        {
+            addChunk_PLTE(&outv, &info.color);
+        }
+        /*tRNS*/
+        if (info.color.colortype == LCT_PALETTE && getPaletteTranslucency(info.color.palette, info.color.palettesize) != 0)
+        {
+            state->error = addChunk_tRNS(&outv, &info.color);
+            if (state->error) break;
+        }
+        if ((info.color.colortype == LCT_GREY || info.color.colortype == LCT_RGB) && info.color.key_defined)
+        {
+            state->error = addChunk_tRNS(&outv, &info.color);
+            if (state->error) break;
+        }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*bKGD (must come between PLTE and the IDAT chunks)*/
+        if (info.background_defined)
+        {
+            state->error = addChunk_bKGD(&outv, &info);
+            if (state->error) break;
+        }
+        /*pHYs (must come before the IDAT chunks)*/
+        if (info.phys_defined)
+        {
+            state->error = addChunk_pHYs(&outv, &info);
+            if (state->error) break;
+        }
+
+        /*unknown chunks between PLTE and IDAT*/
+        if (info.unknown_chunks_data[1])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[1], info.unknown_chunks_size[1]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        /*IDAT (multiple IDAT chunks must be consecutive)*/
+        state->error = addChunk_IDAT(&outv, data, datasize, &state->encoder.zlibsettings);
+        if (state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+        /*tIME*/
+        if (info.time_defined)
+        {
+            state->error = addChunk_tIME(&outv, &info.time);
+            if (state->error) break;
+        }
+        /*tEXt and/or zTXt*/
+        for (i = 0; i != info.text_num; ++i)
+        {
+            if (strlen(info.text_keys[i]) > 79)
+            {
+                state->error = 66; /*text chunk too large*/
+                break;
+            }
+            if (strlen(info.text_keys[i]) < 1)
+            {
+                state->error = 67; /*text chunk too small*/
+                break;
+            }
+            if (state->encoder.text_compression)
+            {
+                state->error = addChunk_zTXt(&outv, info.text_keys[i], info.text_strings[i], &state->encoder.zlibsettings);
+            }
+            else
+            {
+                state->error = addChunk_tEXt(&outv, info.text_keys[i], info.text_strings[i]);
+            }
+            if (state->error) break;
+        }
+        /*the breaks above only leave the for loop, so stop writing chunks here*/
+        if (state->error) break;
+        /*LodePNG version id in text chunk*/
+        if (state->encoder.add_id)
+        {
+            unsigned already_added_id_text = 0;
+            for (i = 0; i != info.text_num; ++i)
+            {
+                if (!strcmp(info.text_keys[i], "LodePNG"))
+                {
+                    already_added_id_text = 1;
+                    break;
+                }
+            }
+            if (already_added_id_text == 0)
+            {
+                /*it's shorter as tEXt than as zTXt chunk*/
+                state->error = addChunk_tEXt(&outv, "LodePNG", LODEPNG_VERSION_STRING);
+                if (state->error) break;
+            }
+        }
+        /*iTXt*/
+        for (i = 0; i != info.itext_num; ++i)
+        {
+            if (strlen(info.itext_keys[i]) > 79)
+            {
+                state->error = 66; /*text chunk too large*/
+                break;
+            }
+            if (strlen(info.itext_keys[i]) < 1)
+            {
+                state->error = 67; /*text chunk too small*/
+                break;
+            }
+            state->error = addChunk_iTXt(&outv, state->encoder.text_compression,
+                info.itext_keys[i], info.itext_langtags[i], info.itext_transkeys[i], info.itext_strings[i],
+                &state->encoder.zlibsettings);
+            if (state->error) break;
+        }
+        /*the breaks above only leave the for loop, so stop writing chunks here*/
+        if (state->error) break;
+
+        /*unknown chunks between IDAT and IEND*/
+        if (info.unknown_chunks_data[2])
+        {
+            state->error = addUnknownChunks(&outv, info.unknown_chunks_data[2], info.unknown_chunks_size[2]);
+            if (state->error) break;
+        }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+        state->error = addChunk_IEND(&outv);
+
+        break; /*this isn't really a while loop; no error happened so break out now!*/
+    }
+
+    lodepng_info_cleanup(&info);
+    lodepng_free(data);
+    /*instead of cleaning the vector up, give it to the output*/
+    *out = outv.data;
+    *outsize = outv.size;
+
+    return state->error;
+}
+
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize, const unsigned char* image,
+    unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+    /*one-shot encode: a temporary state carries the color settings and the resulting error code*/
+    LodePNGState tmp;
+    unsigned result;
+    lodepng_state_init(&tmp);
+    /*the raw input and the requested PNG both start out with the caller's color type and bit depth*/
+    tmp.info_png.color.colortype = tmp.info_raw.colortype = colortype;
+    tmp.info_png.color.bitdepth = tmp.info_raw.bitdepth = bitdepth;
+    lodepng_encode(out, outsize, image, w, h, &tmp);
+    result = tmp.error; /*save the code before the state is torn down*/
+    lodepng_state_cleanup(&tmp);
+    return result;
+}
+
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{
+    return lodepng_encode_memory(out, outsize, image, w, h, LCT_RGBA, 8); /*fixed raw format: RGBA, 8 bits per channel*/
+}
+
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{
+    return lodepng_encode_memory(out, outsize, image, w, h, LCT_RGB, 8); /*fixed raw format: RGB, 8 bits per channel*/
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned lodepng_encode_file(const char* filename, const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth)
+{
+    unsigned char* buffer = 0; /*zeroed up front so the free below never sees an indeterminate pointer*/
+    size_t buffersize = 0;
+    unsigned error = lodepng_encode_memory(&buffer, &buffersize, image, w, h, colortype, bitdepth);
+    if (!error) error = lodepng_save_file(buffer, buffersize, filename); /*only write when encoding succeeded*/
+    lodepng_free(buffer); /*the encoded PNG is a temporary; the file (if written) is the result*/
+    return error;
+}
+
+unsigned lodepng_encode32_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{
+    return lodepng_encode_file(filename, image, w, h, LCT_RGBA, 8); /*fixed raw format: RGBA, 8 bits per channel*/
+}
+
+unsigned lodepng_encode24_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{
+    return lodepng_encode_file(filename, image, w, h, LCT_RGB, 8); /*fixed raw format: RGB, 8 bits per channel*/
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings)
+{
+    settings->predefined_filters = 0; /*fields that default to 0*/
+    settings->force_palette = 0;
+    settings->auto_convert = 1; /*fields that default to 1*/
+    settings->filter_palette_zero = 1;
+    settings->filter_strategy = LFS_MINSUM;
+    lodepng_compress_settings_init(&settings->zlibsettings); /*nested zlib settings get their own defaults*/
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    settings->text_compression = 1;
+    settings->add_id = 0;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*
+This returns the description of a numerical error code in English. This is also
+the documentation of all the error codes.
+*/
+const char* lodepng_error_text(unsigned code)
+{
+    switch (code) /*codes without a case fall through to "unknown error code" below; comments describe the case that follows them*/
+    {
+    case 0: return "no error, everything went ok";
+    case 1: return "nothing done yet"; /*the Encoder/Decoder has done nothing yet, error checking makes no sense yet*/
+    case 10: return "end of input memory reached without huffman end code"; /*while huffman decoding*/
+    case 11: return "error in code tree made it jump outside of huffman tree"; /*while huffman decoding*/
+    case 13: return "problem while processing dynamic deflate block";
+    case 14: return "problem while processing dynamic deflate block";
+    case 15: return "problem while processing dynamic deflate block";
+    case 16: return "unexisting code while processing dynamic deflate block";
+    case 17: return "end of out buffer memory reached while inflating";
+    case 18: return "invalid distance code while inflating";
+    case 19: return "end of out buffer memory reached while inflating";
+    case 20: return "invalid deflate block BTYPE encountered while decoding";
+    case 21: return "NLEN is not ones complement of LEN in a deflate block";
+        /*end of out buffer memory reached while inflating:
+        This can happen if the inflated deflate data is longer than the amount of bytes required to fill up
+        all the pixels of the image, given the color depth and image dimensions. Something that doesn't
+        happen in a normal, well encoded, PNG image.*/
+    case 22: return "end of out buffer memory reached while inflating";
+    case 23: return "end of in buffer memory reached while inflating";
+    case 24: return "invalid FCHECK in zlib header";
+    case 25: return "invalid compression method in zlib header";
+    case 26: return "FDICT encountered in zlib header while it's not used for PNG";
+    case 27: return "PNG file is smaller than a PNG header";
+        /*Checks the magic file header, the first 8 bytes of the PNG file*/
+    case 28: return "incorrect PNG signature, it's no PNG or corrupted";
+    case 29: return "first chunk is not the header chunk";
+    case 30: return "chunk length too large, chunk broken off at end of file";
+    case 31: return "illegal PNG color type or bpp";
+    case 32: return "illegal PNG compression method";
+    case 33: return "illegal PNG filter method";
+    case 34: return "illegal PNG interlace method";
+    case 35: return "chunk length of a chunk is too large or the chunk too small";
+    case 36: return "illegal PNG filter type encountered";
+    case 37: return "illegal bit depth for this color type given";
+    case 38: return "the palette is too big"; /*more than 256 colors*/
+    case 39: return "more palette alpha values given in tRNS chunk than there are colors in the palette";
+    case 40: return "tRNS chunk has wrong size for greyscale image";
+    case 41: return "tRNS chunk has wrong size for RGB image";
+    case 42: return "tRNS chunk appeared while it was not allowed for this color type";
+    case 43: return "bKGD chunk has wrong size for palette image";
+    case 44: return "bKGD chunk has wrong size for greyscale image";
+    case 45: return "bKGD chunk has wrong size for RGB image";
+    case 48: return "empty input buffer given to decoder. Maybe caused by non-existing file?";
+    case 49: return "jumped past memory while generating dynamic huffman tree";
+    case 50: return "jumped past memory while generating dynamic huffman tree";
+    case 51: return "jumped past memory while inflating huffman block";
+    case 52: return "jumped past memory while inflating";
+    case 53: return "size of zlib data too small";
+    case 54: return "repeat symbol in tree while there was no value symbol yet";
+        /*jumped past tree while generating huffman tree, this could be when the
+        tree will have more leaves than symbols after generating it out of the
+        given lengths. They call this an oversubscribed dynamic bit lengths tree in zlib.*/
+    case 55: return "jumped past tree while generating huffman tree";
+    case 56: return "given output image colortype or bitdepth not supported for color conversion";
+    case 57: return "invalid CRC encountered (checking CRC can be disabled)";
+    case 58: return "invalid ADLER32 encountered (checking ADLER32 can be disabled)";
+    case 59: return "requested color conversion not supported";
+    case 60: return "invalid window size given in the settings of the encoder (must be 0-32768)";
+    case 61: return "invalid BTYPE given in the settings of the encoder (only 0, 1 and 2 are allowed)";
+        /*LodePNG leaves the choice of RGB to greyscale conversion formula to the user.*/
+    case 62: return "conversion from color to greyscale not supported";
+    case 63: return "length of a chunk too long, max allowed for PNG is 2147483647 bytes per chunk"; /*(2^31-1)*/
+        /*this would result in the inability of a deflated block to ever contain an end code. It must be at least 1.*/
+    case 64: return "the length of the END symbol 256 in the Huffman tree is 0";
+    case 66: return "the length of a text chunk keyword given to the encoder is longer than the maximum of 79 bytes";
+    case 67: return "the length of a text chunk keyword given to the encoder is smaller than the minimum of 1 byte";
+    case 68: return "tried to encode a PLTE chunk with a palette that has less than 1 or more than 256 colors";
+    case 69: return "unknown chunk type with 'critical' flag encountered by the decoder";
+    case 71: return "unexisting interlace mode given to encoder (must be 0 or 1)";
+    case 72: return "while decoding, unexisting compression method encountering in zTXt or iTXt chunk (it must be 0)";
+    case 73: return "invalid tIME chunk size";
+    case 74: return "invalid pHYs chunk size";
+        /*length could be wrong, or data chopped off*/
+    case 75: return "no null termination char found while decoding text chunk";
+    case 76: return "iTXt chunk too short to contain required bytes";
+    case 77: return "integer overflow in buffer size";
+    case 78: return "failed to open file for reading"; /*file doesn't exist or couldn't be opened for reading*/
+    case 79: return "failed to open file for writing";
+    case 80: return "tried creating a tree of 0 symbols";
+    case 81: return "lazy matching at pos 0 is impossible";
+    case 82: return "color conversion to palette requested while a color isn't in palette";
+    case 83: return "memory allocation failed";
+    case 84: return "given image too small to contain all pixels to be encoded";
+    case 86: return "impossible offset in lz77 encoding (internal bug)";
+    case 87: return "must provide custom zlib function pointer if LODEPNG_COMPILE_ZLIB is not defined";
+    case 88: return "invalid filter strategy given for LodePNGEncoderSettings.filter_strategy";
+    case 89: return "text chunk keyword too short or long: must have size 1-79";
+        /*the windowsize in the LodePNGCompressSettings. Requiring POT(==> & instead of %) makes encoding 12% faster.*/
+    case 90: return "windowsize must be a power of two";
+    case 91: return "invalid decompressed idat size";
+    case 92: return "too many pixels, not supported";
+    case 93: return "zero width or height is invalid";
+    case 94: return "header chunk must have a size of 13 bytes";
+    }
+    return "unknown error code";
+}
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // C++ Wrapper                                                          // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+
+#ifdef LODEPNG_COMPILE_DISK
+    unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename)
+    {
+        const long file_size = lodepng_filesize(filename.c_str());
+        if (file_size < 0) return 78; /*78 = "failed to open file for reading"*/
+        buffer.resize(static_cast<size_t>(file_size));
+        return buffer.empty() ? 0 : lodepng_buffer_file(&buffer[0], buffer.size(), filename.c_str()); /*skip the read for an empty file*/
+    }
+
+    /*Write the given buffer to the file via lodepng_save_file, overwriting the file (it does not append); returns that call's error code.*/
+    unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename)
+    {
+        return lodepng_save_file(buffer.empty() ? 0 : &buffer[0], buffer.size(), filename.c_str()); /*null data pointer for an empty vector*/
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+    unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGDecompressSettings& settings)
+    {
+        unsigned char* buffer = 0;
+        size_t buffersize = 0;
+        unsigned error = zlib_decompress(&buffer, &buffersize, in, insize, &settings);
+        if (buffer)
+        {
+            out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+            lodepng_free(buffer);
+        }
+        return error;
+    }
+
+    unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGDecompressSettings& settings)
+    {
+        return decompress(out, in.empty() ? 0 : &in[0], in.size(), settings); /*forwards to the pointer overload; null pointer for an empty input vector*/
+    }
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGCompressSettings& settings)
+    {
+        unsigned char* buffer = 0;
+        size_t buffersize = 0;
+        unsigned error = zlib_compress(&buffer, &buffersize, in, insize, &settings);
+        if (buffer)
+        {
+            out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+            lodepng_free(buffer);
+        }
+        return error;
+    }
+
+    unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGCompressSettings& settings)
+    {
+        return compress(out, in.empty() ? 0 : &in[0], in.size(), settings); /*forwards to the pointer overload; null pointer for an empty input vector*/
+    }
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+
+
+#ifdef LODEPNG_COMPILE_PNG
+
+    State::State()
+    {
+        lodepng_state_init(this); /*delegate to the C initializer for the underlying LodePNGState*/
+    }
+
+    State::State(const State& other)
+    {
+        lodepng_state_init(this); /*initialize first, so the copy below operates on an initialized destination*/
+        lodepng_state_copy(this, &other);
+    }
+
+    State::~State()
+    {
+        lodepng_state_cleanup(this); /*delegate to the C cleanup routine for the underlying LodePNGState*/
+    }
+
+    State& State::operator=(const State& other)
+    {
+        if (this != &other) lodepng_state_copy(this, &other); /*skip self-assignment; NOTE(review): lodepng_state_copy presumably releases the destination first, which would be unsafe when source and destination are the same object - confirm*/
+        return *this;
+    }
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const unsigned char* in,
+        size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+    {
+        unsigned char* buffer = 0; /*never leave the pointer indeterminate for the check and the free below*/
+        unsigned error = lodepng_decode_memory(&buffer, &w, &h, in, insize, colortype, bitdepth);
+        if (buffer && !error)
+        {
+            State state; /*only used to describe the raw color mode for the size computation*/
+            state.info_raw.colortype = colortype;
+            state.info_raw.bitdepth = bitdepth;
+            size_t buffersize = lodepng_get_raw_size(w, h, &state.info_raw);
+            out.insert(out.end(), &buffer[0], &buffer[buffersize]); /*appends to whatever out already holds*/
+        }
+        lodepng_free(buffer); /*released in every case (as the State-based overload does), so a buffer returned with an error is not leaked*/
+        return error;
+    }
+
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::vector<unsigned char>& in, LodePNGColorType colortype, unsigned bitdepth)
+    {
+        return decode(out, w, h, in.empty() ? 0 : &in[0], in.size(), colortype, bitdepth); /*size forwarded as size_t (the callee's parameter type), no narrowing to unsigned*/
+    }
+
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const unsigned char* in, size_t insize)
+    {
+        unsigned char* pixels = NULL; /*filled in by lodepng_decode, released below*/
+        const unsigned status = lodepng_decode(&pixels, &w, &h, &state, in, insize);
+        if (status == 0 && pixels != NULL)
+        {
+            /*append the decoded pixels (size taken from state.info_raw) to whatever out already holds*/
+            out.insert(out.end(), pixels, pixels + lodepng_get_raw_size(w, h, &state.info_raw));
+        }
+        lodepng_free(pixels);
+        return status;
+    }
+
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state, const std::vector<unsigned char>& in)
+    {
+        const unsigned char* bytes = in.empty() ? 0 : &in[0]; /*an empty vector is passed on as a null pointer*/
+        return decode(out, w, h, state, bytes, in.size());
+    }
+
+#ifdef LODEPNG_COMPILE_DISK
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const std::string& filename,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        std::vector<unsigned char> png; /*receives the file contents from load_file*/
+        const unsigned load_error = load_file(png, filename);
+        /*decode only when loading reported no error; otherwise hand the load error back*/
+        return load_error ? load_error : decode(out, w, h, png, colortype, bitdepth);
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    unsigned encode(std::vector<unsigned char>& out, const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        unsigned char* buffer = 0; /*zeroed like the compress/decompress wrappers: the check below must never read an indeterminate pointer*/
+        size_t buffersize = 0;
+        unsigned error = lodepng_encode_memory(&buffer, &buffersize, in, w, h, colortype, bitdepth);
+        if (buffer)
+        {
+            out.insert(out.end(), &buffer[0], &buffer[buffersize]); /*appends the PNG bytes to whatever out already holds*/
+            lodepng_free(buffer);
+        }
+        return error;
+    }
+
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        const bool too_small = in.size() < lodepng_get_raw_size_lct(w, h, colortype, bitdepth); /*84 = "given image too small..."*/
+        return too_small ? 84u : encode(out, in.empty() ? 0 : &in[0], w, h, colortype, bitdepth);
+    }
+
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        State& state)
+    {
+        unsigned char* buffer = 0; /*zeroed like the compress/decompress wrappers: the check below must never read an indeterminate pointer*/
+        size_t buffersize = 0;
+        unsigned error = lodepng_encode(&buffer, &buffersize, in, w, h, &state);
+        if (buffer)
+        {
+            out.insert(out.end(), &buffer[0], &buffer[buffersize]); /*appends the PNG bytes to whatever out already holds*/
+            lodepng_free(buffer);
+        }
+        return error;
+    }
+
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h, State& state)
+    {
+        /*reject inputs holding fewer bytes than lodepng_get_raw_size reports for w x h in state.info_raw*/
+        const bool too_small = in.size() < lodepng_get_raw_size(w, h, &state.info_raw);
+        return too_small ? 84u : encode(out, in.empty() ? 0 : &in[0], w, h, state);
+    }
+
+#ifdef LODEPNG_COMPILE_DISK
+    unsigned encode(const std::string& filename,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype, unsigned bitdepth)
+    {
+        std::vector<unsigned char> png;
+        const unsigned encode_error = encode(png, in, w, h, colortype, bitdepth);
+        if (encode_error) return encode_error; /*save_file is not called when encoding fails*/
+        return save_file(png, filename);
+    }
+
+    unsigned encode(const std::string& filename,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+    {
+        if (in.size() >= lodepng_get_raw_size_lct(w, h, colortype, bitdepth))
+            return encode(filename, in.empty() ? 0 : &in[0], w, h, colortype, bitdepth);
+        return 84; /*84 = "given image too small to contain all pixels to be encoded"*/
+    }
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_PNG */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
\ No newline at end of file
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/lodepng/lodepng.h b/c_cxx/ort_tutorial/30_syncstreams-cuda/lodepng/lodepng.h
new file mode 100644
index 000000000..595312ca8
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/lodepng/lodepng.h
@@ -0,0 +1,1762 @@
+/*
+LodePNG version 20170917
+
+Copyright (c) 2005-2017 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+
+#ifndef LODEPNG_H
+#define LODEPNG_H
+
+#include <string.h> /*for size_t*/
+
+extern const char* LODEPNG_VERSION_STRING;
+
+/*
+The following #defines are used to create code sections. They can be disabled
+to disable code sections, which can give faster compile time and smaller binary.
+The "NO_COMPILE" defines are designed to be used to pass as defines to the
+compiler command to disable them without modifying this header, e.g.
+-DLODEPNG_NO_COMPILE_ZLIB for gcc.
+In addition to those below, you can also define LODEPNG_NO_COMPILE_CRC to
+allow implementing a custom lodepng_crc32.
+*/
+/*deflate & zlib. If disabled, you must specify alternative zlib functions in
+the custom_zlib field of the compress and decompress settings*/
+#ifndef LODEPNG_NO_COMPILE_ZLIB
+#define LODEPNG_COMPILE_ZLIB
+#endif
+/*png encoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_PNG
+#define LODEPNG_COMPILE_PNG
+#endif
+/*deflate&zlib decoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_DECODER
+#define LODEPNG_COMPILE_DECODER
+#endif
+/*deflate&zlib encoder and png encoder*/
+#ifndef LODEPNG_NO_COMPILE_ENCODER
+#define LODEPNG_COMPILE_ENCODER
+#endif
+/*the optional built in harddisk file loading and saving functions*/
+#ifndef LODEPNG_NO_COMPILE_DISK
+#define LODEPNG_COMPILE_DISK
+#endif
+/*support for chunks other than IHDR, IDAT, PLTE, tRNS, IEND: ancillary and unknown chunks*/
+#ifndef LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
+#define LODEPNG_COMPILE_ANCILLARY_CHUNKS
+#endif
+/*ability to convert error numerical codes to English text string*/
+#ifndef LODEPNG_NO_COMPILE_ERROR_TEXT
+#define LODEPNG_COMPILE_ERROR_TEXT
+#endif
+/*Compile the default allocators (C's free, malloc and realloc). If you disable this,
+you can define the functions lodepng_free, lodepng_malloc and lodepng_realloc in your
+source files with custom allocators.*/
+#ifndef LODEPNG_NO_COMPILE_ALLOCATORS
+#define LODEPNG_COMPILE_ALLOCATORS
+#endif
+/*compile the C++ version (you can disable the C++ wrapper here even when compiling for C++)*/
+#ifdef __cplusplus
+#ifndef LODEPNG_NO_COMPILE_CPP
+#define LODEPNG_COMPILE_CPP
+#endif
+#endif
+
+#ifdef LODEPNG_COMPILE_CPP
+#include <vector>
+#include <string>
+#endif /*LODEPNG_COMPILE_CPP*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*The PNG color types (also used for raw). Each entry's comment lists the bit depths given for that type.*/
+typedef enum LodePNGColorType
+{
+    LCT_GREY = 0, /*greyscale: 1,2,4,8,16 bit*/
+    LCT_RGB = 2, /*RGB: 8,16 bit*/
+    LCT_PALETTE = 3, /*palette: 1,2,4,8 bit*/
+    LCT_GREY_ALPHA = 4, /*greyscale with alpha: 8,16 bit*/
+    LCT_RGBA = 6 /*RGB with alpha: 8,16 bit*/
+} LodePNGColorType;
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Converts PNG data in memory to raw pixel data.
+out: Output parameter. Pointer to buffer that will contain the raw pixel data.
+After decoding, its size is w * h * (bytes per pixel) bytes larger than
+initially. Bytes per pixel depends on colortype and bitdepth.
+Must be freed after usage with free(*out).
+Note: for 16-bit per channel colors, uses big endian format like PNG does.
+w: Output parameter. Pointer to width of pixel data.
+h: Output parameter. Pointer to height of pixel data.
+in: Memory buffer with the PNG file.
+insize: size of the in buffer.
+colortype: the desired color type for the raw output image. See explanation on PNG color types.
+bitdepth: the desired bit depth for the raw output image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_memory, but always decodes to 32-bit RGBA raw image*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize);
+
+/*Same as lodepng_decode_memory, but always decodes to 24-bit RGB raw image*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h,
+    const unsigned char* in, size_t insize);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load PNG from disk, from file with given name.
+Same as the other decode functions, but instead takes a filename as input.
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_file, but always decodes to 32-bit RGBA raw image.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename);
+
+/*Same as lodepng_decode_file, but always decodes to 24-bit RGB raw image.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h,
+    const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Converts raw pixel data into a PNG image in memory. The colortype and bitdepth
+of the output PNG image cannot be chosen, they are automatically determined
+by the colortype, bitdepth and content of the input pixel data.
+Note: for 16-bit per channel colors, needs big endian format like PNG does.
+out: Output parameter. Pointer to buffer that will contain the PNG image data.
+Must be freed after usage with free(*out).
+outsize: Output parameter. Pointer to the size in bytes of the out buffer.
+image: The raw pixel data to encode. The size of this buffer should be
+w * h * (bytes per pixel), bytes per pixel depends on colortype and bitdepth.
+w: width of the raw pixel data in pixels.
+h: height of the raw pixel data in pixels.
+colortype: the color type of the raw input image. See explanation on PNG color types.
+bitdepth: the bit depth of the raw input image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_memory, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_memory, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts raw pixel data into a PNG file on disk.
+Same as the other encode functions, but instead takes a filename as output.
+NOTE: This overwrites existing files without warning!
+*/
+unsigned lodepng_encode_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_file, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_file, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24_file(const char* filename,
+    const unsigned char* image, unsigned w, unsigned h);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_DECODER
+    /*Same as lodepng_decode_memory, but decodes to an std::vector. The colortype
+    is the format to output the pixels to. Default is RGBA 8-bit per channel.*/
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const unsigned char* in, size_t insize,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::vector<unsigned char>& in,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Converts PNG file from disk to raw pixel data in memory.
+    Same as the other decode functions, but instead takes a filename as input.
+    */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        const std::string& filename,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /*Same as lodepng_encode_memory, but encodes to an std::vector. colortype
+    is that of the raw input data. The output PNG color type will be auto chosen.*/
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Converts raw pixel data (32-bit RGBA unless colortype/bitdepth say otherwise) into a PNG file on disk.
+    Same as the other encode functions, but instead takes a filename as output.
+    NOTE: This overwrites existing files without warning!
+    */
+    unsigned encode(const std::string& filename,
+        const unsigned char* in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+    unsigned encode(const std::string& filename,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+  /*Returns an English description of the numerical error code.*/
+const char* lodepng_error_text(unsigned code);
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Settings for zlib decompression*/
+typedef struct LodePNGDecompressSettings LodePNGDecompressSettings;
+struct LodePNGDecompressSettings
+{
+    unsigned ignore_adler32; /*if 1, continue and don't give an error message if the Adler32 checksum is corrupted*/
+
+    /*use custom zlib decoder instead of built in one (default: null)*/
+    unsigned(*custom_zlib)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGDecompressSettings*);
+    /*use custom inflate decoder instead of built in one (default: null)
+    if custom_zlib is used, custom_inflate is ignored since only the built in
+    zlib function will call custom_inflate*/
+    unsigned(*custom_inflate)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGDecompressSettings*);
+
+    const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGDecompressSettings lodepng_default_decompress_settings;
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Settings for zlib compression. Tweaking these settings tweaks the balance
+between speed and compression ratio.
+*/
+typedef struct LodePNGCompressSettings LodePNGCompressSettings;
+struct LodePNGCompressSettings /*deflate = compress*/
+{
+    /*LZ77 related settings*/
+    unsigned btype; /*the block type for LZ (0, 1, 2 or 3, see zlib standard). Should be 2 for proper compression. NOTE(review): the text of error 61 says only 0, 1 and 2 are accepted by the encoder*/
+    unsigned use_lz77; /*whether or not to use LZ77. Should be 1 for proper compression.*/
+    unsigned windowsize; /*must be a power of two <= 32768. higher compresses more but is slower. Default value: 2048.*/
+    unsigned minmatch; /*minimum lz77 length. 3 is normally best, 6 can be better for some PNGs. Default: 0*/
+    unsigned nicematch; /*stop searching if >= this length found. Set to 258 for best compression. Default: 128*/
+    unsigned lazymatching; /*use lazy matching: better compression but a bit slower. Default: true*/
+
+    /*use custom zlib encoder instead of built in one (default: null)*/
+    unsigned(*custom_zlib)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGCompressSettings*);
+    /*use custom deflate encoder instead of built in one (default: null)
+    if custom_zlib is used, custom_deflate is ignored since only the built in
+    zlib function will call custom_deflate*/
+    unsigned(*custom_deflate)(unsigned char**, size_t*,
+        const unsigned char*, size_t,
+        const LodePNGCompressSettings*);
+
+    const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGCompressSettings lodepng_default_compress_settings; /*shared read-only defaults; also the default argument of lodepng::compress*/
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings); /*initializes *settings; presumably to the defaults above - confirm in lodepng.cpp*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*
+Color mode of an image. Contains all information required to decode the pixel
+bits to RGBA colors. This information is the same as used in the PNG file
+format, and is used both for PNG and raw image data in LodePNG.
+*/
+typedef struct LodePNGColorMode
+{
+    /*header (IHDR)*/
+    LodePNGColorType colortype; /*color type, see PNG standard or documentation further in this header file*/
+    unsigned bitdepth;  /*bits per sample, see PNG standard or documentation further in this header file*/
+
+    /*
+    palette (PLTE and tRNS)
+
+    Dynamically allocated with the colors of the palette, including alpha.
+    When encoding a PNG, to store your colors in the palette of the LodePNGColorMode, first use
+    lodepng_palette_clear, then for each color use lodepng_palette_add.
+    If you encode an image without alpha with palette, don't forget to put value 255 in each A byte of the palette.
+
+    When decoding, by default you can ignore this palette, since LodePNG already
+    fills the palette colors in the pixels of the raw RGBA output.
+
+    The palette is only supported for color type 3.
+    */
+    unsigned char* palette; /*palette in RGBARGBA... order. Must be either 0 (not allocated), or, when allocated, have size 1024*/
+    size_t palettesize; /*palette size in number of colors (amount of bytes is 4 * palettesize)*/
+
+    /*
+    transparent color key (tRNS)
+
+    This color uses the same bit depth as the bitdepth value in this struct, which can be 1-bit to 16-bit.
+    For greyscale PNGs, r, g and b will all 3 be set to the same value.
+
+    When decoding, by default you can ignore this information, since LodePNG sets
+    pixels with this key to transparent already in the raw RGBA output.
+
+    The color key is only supported for color types 0 and 2.
+    */
+    unsigned key_defined; /*is a transparent color key given? 0 = false, 1 = true*/
+    unsigned key_r;       /*red/greyscale component of color key*/
+    unsigned key_g;       /*green component of color key*/
+    unsigned key_b;       /*blue component of color key*/
+} LodePNGColorMode;
+
+/*init, cleanup and copy functions to use with LodePNGColorMode*/
+void lodepng_color_mode_init(LodePNGColorMode* info);
+void lodepng_color_mode_cleanup(LodePNGColorMode* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source);
+
+void lodepng_palette_clear(LodePNGColorMode* info);
+/*add 1 color (given as r, g, b, a bytes) to the palette*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+    unsigned char r, unsigned char g, unsigned char b, unsigned char a);
+
+/*get the total amount of bits per pixel, based on colortype and bitdepth in the struct*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info);
+/*get the amount of color channels used, based on colortype in the struct.
+If a palette is used, it counts as 1 channel.*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info);
+/*is it a greyscale type? (only colortype 0 or 4)*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info);
+/*has it got an alpha channel? (only colortype 4 or 6)*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info);
+/*has it got a palette? (only colortype 3)*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info);
+/*only returns true if there is a palette and there is a value in the palette with alpha < 255.
+Loops through the palette to check this.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info);
+/*
+Check if the given color info indicates the possibility of having non-opaque pixels in the PNG image.
+Returns true if the image can have translucent or invisible pixels (it may still be opaque if it doesn't use such pixels).
+Returns false if the image can only have opaque pixels.
+In detail, it returns true only if it's a color type with alpha, or has a palette with non-opaque values,
+or if "key_defined" is true.
+*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info);
+/*Returns the byte size of a raw image buffer with given width, height and color mode*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*The information of a time chunk (tIME) in PNG; stored in LodePNGInfo::time.*/
+typedef struct LodePNGTime
+{
+    unsigned year;    /*2 bytes used (0-65535)*/
+    unsigned month;   /*1-12*/
+    unsigned day;     /*1-31*/
+    unsigned hour;    /*0-23*/
+    unsigned minute;  /*0-59*/
+    unsigned second;  /*0-60 (to allow for leap seconds)*/
+} LodePNGTime;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Information about the PNG image, except pixels, width and height.*/
+typedef struct LodePNGInfo
+{
+    /*header (IHDR), palette (PLTE) and transparency (tRNS) chunks*/
+    unsigned compression_method;/*compression method of the original file. Always 0.*/
+    unsigned filter_method;     /*filter method of the original file*/
+    unsigned interlace_method;  /*interlace method of the original file*/
+    LodePNGColorMode color;     /*color type and bits, palette and transparency of the PNG file*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*
+    suggested background color chunk (bKGD)
+    This color uses the same color mode as the PNG (except alpha channel), which can be 1-bit to 16-bit.
+
+    For greyscale PNGs, r, g and b will all 3 be set to the same value. When encoding
+    the encoder writes the red one. For palette PNGs: When decoding, the RGB value
+    will be stored, not a palette index. But when encoding, specify the index of
+    the palette in background_r, the other two are then ignored.
+
+    The decoder does not use this background color to edit the color of pixels.
+    */
+    unsigned background_defined; /*is a suggested background color given?*/
+    unsigned background_r;       /*red component of suggested background color*/
+    unsigned background_g;       /*green component of suggested background color*/
+    unsigned background_b;       /*blue component of suggested background color*/
+
+    /*
+    non-international text chunks (tEXt and zTXt)
+
+    The char** arrays each contain text_num strings. The actual messages are in
+    text_strings, while text_keys are keywords that give a short description what
+    the actual text represents, e.g. Title, Author, Description, or anything else.
+
+    A keyword is minimum 1 character and maximum 79 characters long. It's
+    discouraged to use a single line length longer than 79 characters for texts.
+
+    Don't allocate these text buffers yourself. Use the init/cleanup functions
+    correctly and use lodepng_add_text and lodepng_clear_text.
+    */
+    size_t text_num; /*the amount of texts in these char** buffers (there may be more texts in itext)*/
+    char** text_keys; /*the keyword of a text chunk (e.g. "Comment")*/
+    char** text_strings; /*the actual text*/
+
+    /*
+    international text chunks (iTXt)
+    Similar to the non-international text chunks, but with additional strings
+    "langtags" and "transkeys". Use lodepng_add_itext and lodepng_clear_itext to fill them.
+    */
+    size_t itext_num; /*the amount of international texts in this PNG*/
+    char** itext_keys; /*the English keyword of the text chunk (e.g. "Comment")*/
+    char** itext_langtags; /*language tag for this text's language, ISO/IEC 646 string, e.g. ISO 639 language tag*/
+    char** itext_transkeys; /*keyword translated to the international language - UTF-8 string*/
+    char** itext_strings; /*the actual international text - UTF-8 string*/
+
+    /*time chunk (tIME)*/
+    unsigned time_defined; /*set to 1 to make the encoder generate a tIME chunk*/
+    LodePNGTime time; /*the date and time fields of the tIME chunk, see LodePNGTime*/
+
+    /*phys chunk (pHYs)*/
+    unsigned phys_defined; /*if 0, there is no pHYs chunk and the values below are undefined; if 1, there is one*/
+    unsigned phys_x; /*pixels per unit in x direction*/
+    unsigned phys_y; /*pixels per unit in y direction*/
+    unsigned phys_unit; /*may be 0 (unknown unit) or 1 (metre)*/
+
+    /*
+    unknown chunks
+    There are 3 buffers, one for each position in the PNG where unknown chunks can appear.
+    Each buffer contains all unknown chunks for that position consecutively.
+    The 3 buffers are the unknown chunks between certain critical chunks:
+    0: IHDR-PLTE, 1: PLTE-IDAT, 2: IDAT-IEND
+    Do not allocate or traverse this data yourself. Use the chunk traversing functions declared
+    later, such as lodepng_chunk_next and lodepng_chunk_append, to read/write this struct.
+    */
+    unsigned char* unknown_chunks_data[3];
+    size_t unknown_chunks_size[3]; /*size in bytes of the unknown chunks, given for protection*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGInfo;
+
+/*init, cleanup and copy functions to use with LodePNGInfo*/
+void lodepng_info_init(LodePNGInfo* info);
+void lodepng_info_cleanup(LodePNGInfo* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+void lodepng_clear_text(LodePNGInfo* info); /*use this to clear the texts (text_keys/text_strings) again after you filled them in*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str); /*push back both texts (keyword and string) at once*/
+
+void lodepng_clear_itext(LodePNGInfo* info); /*use this to clear the itexts again after you filled them in*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+    const char* transkey, const char* str); /*push back the 4 texts (key, langtag, transkey, string) of 1 chunk at once*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+                                            /*
+                                            Converts raw buffer from one color type to another color type, based on
+                                            LodePNGColorMode structs to describe the input and output color type.
+                                            See the reference manual at the end of this header file to see which color conversions are supported.
+                                            return value = LodePNG error code (0 if all went ok, an error if the conversion isn't supported)
+                                            The out buffer must have size (w * h * bpp + 7) / 8, where bpp is the bits per pixel
+                                            of the output color type (lodepng_get_bpp).
+                                            For < 8 bpp images, there should not be padding bits at the end of scanlines.
+                                            For 16-bit per channel colors, uses big endian format like PNG does.
+                                            Return value is LodePNG error code
+                                            */
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+    const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+    unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Settings for the decoder. This contains settings for the PNG and the Zlib
+decoder, but not the Info settings from the Info structs.
+*/
+typedef struct LodePNGDecoderSettings
+{
+    LodePNGDecompressSettings zlibsettings; /*in here is the setting to ignore Adler32 checksums*/
+
+    unsigned ignore_crc; /*ignore CRC checksums*/
+
+    unsigned color_convert; /*whether to convert the PNG to the color type you want. Default: yes*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    unsigned read_text_chunks; /*if false but remember_unknown_chunks is true, they're stored in the unknown chunks*/
+    /*store all bytes from unknown chunks in the LodePNGInfo (off by default, useful for a png editor)*/
+    unsigned remember_unknown_chunks;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGDecoderSettings;
+
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings); /*initializes *settings; presumably to the defaults documented above - confirm in lodepng.cpp*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Strategy the encoder uses to pick the PNG filter type of each scanline (see LodePNGEncoderSettings::filter_strategy).*/
+typedef enum LodePNGFilterStrategy
+{
+    /*every filter at zero*/
+    LFS_ZERO,
+    /*Use filter that gives minimum sum, as described in the official PNG filter heuristic.*/
+    LFS_MINSUM,
+    /*Use the filter type that gives smallest Shannon entropy for this scanline. Depending
+    on the image, this is better or worse than minsum.*/
+    LFS_ENTROPY,
+    /*
+    Brute-force-search PNG filters by compressing each filter for each scanline.
+    Experimental, very slow, and only rarely gives better compression than MINSUM.
+    */
+    LFS_BRUTE_FORCE,
+    /*use predefined_filters buffer: you specify the filter type for each scanline*/
+    LFS_PREDEFINED
+} LodePNGFilterStrategy;
+
+/*Gives characteristics about the colors of the image, which helps decide which color model to use for encoding.
+Used internally by default if "auto_convert" is enabled. Public because it's useful for custom algorithms.*/
+typedef struct LodePNGColorProfile
+{
+    unsigned colored; /*not greyscale*/
+    unsigned key; /*image is not opaque and color key is possible instead of full alpha*/
+    unsigned short key_r; /*key values, always as 16-bit, in 8-bit case the byte is duplicated, e.g. 65535 means 255*/
+    unsigned short key_g; /*green key value, same 16-bit convention as key_r*/
+    unsigned short key_b; /*blue key value, same 16-bit convention as key_r*/
+    unsigned alpha; /*image is not opaque and alpha channel or alpha palette required*/
+    unsigned numcolors; /*amount of colors, up to 257. Not valid if bits == 16.*/
+    unsigned char palette[1024]; /*Remembers up to the first 256 RGBA colors (4 bytes each), in no particular order*/
+    unsigned bits; /*bits per channel (not for palette). 1,2 or 4 for greyscale only. 16 if 16-bit per channel required.*/
+} LodePNGColorProfile;
+
+void lodepng_color_profile_init(LodePNGColorProfile* profile); /*initializes *profile; call before lodepng_get_color_profile (presumably - confirm in lodepng.cpp)*/
+
+/*Get a LodePNGColorProfile of the image, whose raw pixels are in the format described by mode_in.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in);
+/*The function LodePNG uses internally to decide the PNG color with auto_convert.
+Chooses an optimal color model, e.g. grey if only grey pixels, palette if < 256 colors, ...*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+    const unsigned char* image, unsigned w, unsigned h,
+    const LodePNGColorMode* mode_in);
+
+/*Settings for the encoder.*/
+typedef struct LodePNGEncoderSettings
+{
+    LodePNGCompressSettings zlibsettings; /*settings for the zlib encoder, such as window size, ...*/
+
+    unsigned auto_convert; /*automatically choose output PNG color type. Default: true*/
+
+    /*If true, follows the official PNG heuristic: if the PNG uses a palette or lower than
+    8 bit depth, set all filters to zero. Otherwise use the filter_strategy. Note that to
+    completely follow the official PNG heuristic, filter_palette_zero must be true and
+    filter_strategy must be LFS_MINSUM*/
+    unsigned filter_palette_zero;
+    /*Which filter strategy to use when not using zeroes due to filter_palette_zero.
+    Set filter_palette_zero to 0 to ensure always using your chosen strategy. Default: LFS_MINSUM*/
+    LodePNGFilterStrategy filter_strategy;
+    /*used if filter_strategy is LFS_PREDEFINED. In that case, this must point to a buffer with
+    the same length as the amount of scanlines in the image, and each value must be <= 5. You
+    have to clean up this buffer, LodePNG will never free it. Don't forget that filter_palette_zero
+    must be set to 0 to ensure this is also used on palette or low bitdepth images.*/
+    const unsigned char* predefined_filters;
+
+    /*force creating a PLTE chunk if colortype is 2 or 6 (= a suggested palette).
+    If colortype is 3, PLTE is _always_ created.*/
+    unsigned force_palette;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*add LodePNG identifier and version as a text chunk, for debugging*/
+    unsigned add_id;
+    /*encode text chunks as zTXt chunks instead of tEXt chunks, and use compression in iTXt chunks*/
+    unsigned text_compression;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGEncoderSettings;
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings); /*initializes *settings; presumably to the defaults documented above - confirm in lodepng.cpp*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+/*The settings, state and information for extended encoding and decoding.*/
+typedef struct LodePNGState
+{
+#ifdef LODEPNG_COMPILE_DECODER
+    LodePNGDecoderSettings decoder; /*the decoding settings*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+    LodePNGEncoderSettings encoder; /*the encoding settings*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+    LodePNGColorMode info_raw; /*specifies the format in which you would like to get the raw pixel buffer*/
+    LodePNGInfo info_png; /*info of the PNG image obtained after decoding*/
+    unsigned error; /*LodePNG error code; presumably set by the last encode/decode call on this state - confirm in lodepng.cpp*/
+#ifdef LODEPNG_COMPILE_CPP
+    /* For the lodepng::State subclass. Only present in C++ builds, where it makes this struct polymorphic. */
+    virtual ~LodePNGState() {}
+#endif
+} LodePNGState;
+
+/*init, cleanup and copy functions to use with LodePNGState*/
+void lodepng_state_init(LodePNGState* state);
+void lodepng_state_cleanup(LodePNGState* state);
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source);
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Same as lodepng_decode_memory, but uses a LodePNGState to allow custom settings and
+getting much more information about the PNG image and color mode (see state->info_raw and state->info_png).
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize);
+
+/*
+Read the PNG header, but not the actual data. This returns only the information
+that is in the header chunk of the PNG, such as width, height and color type. The
+information is placed in the info_png field of the LodePNGState; no pixel buffer is produced.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h,
+    LodePNGState* state,
+    const unsigned char* in, size_t insize);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*This function allocates the out buffer with standard malloc and stores the size in *outsize. Settings are taken from state.*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+    const unsigned char* image, unsigned w, unsigned h,
+    LodePNGState* state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*
+The lodepng_chunk functions are normally not needed, except to traverse the
+unknown chunks stored in the LodePNGInfo struct, or add new ones to it.
+It also allows traversing the chunks of an encoded PNG file yourself.
+
+PNG standard chunk naming conventions:
+First byte: uppercase = critical, lowercase = ancillary
+Second byte: uppercase = public, lowercase = private
+Third byte: must be uppercase
+Fourth byte: uppercase = unsafe to copy, lowercase = safe to copy
+*/
+
+/*
+Gets the length of the data of the chunk. Total chunk length has 12 bytes more.
+There must be at least 4 bytes to read from. If the result value is too large,
+it may be corrupt data.
+*/
+unsigned lodepng_chunk_length(const unsigned char* chunk);
+
+/*puts the 4-byte chunk type in type as a null terminated string (hence the 5 chars)*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk);
+
+/*check if the type of the chunk is the given type (a 4-letter string)*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type);
+
+/*0: it's one of the critical chunk types, 1: it's an ancillary chunk (see PNG standard)*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk);
+
+/*0: public, 1: private (see PNG standard)*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk);
+
+/*0: the chunk is unsafe to copy, 1: the chunk is safe to copy (see PNG standard)*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk);
+
+/*get pointer to the data of the chunk, where the input points to the header of the chunk*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk);
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk);
+
+/*returns 0 if the crc is correct, 1 if it's incorrect (0 for OK as usual!)*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk);
+
+/*generates the correct CRC from the data and puts it in the last 4 bytes of the chunk*/
+void lodepng_chunk_generate_crc(unsigned char* chunk);
+
+/*iterate to the next chunk. don't use on IEND chunk, as there is no next chunk then*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk);
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk);
+
+/*
+Appends chunk to the data in out. The given chunk should already have its chunk header.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk);
+
+/*
+Appends new chunk to out. The chunk to append is given by giving its length, type
+and data separately. The type is a 4-letter string.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+    const char* type, const unsigned char* data);
+
+
+/*Calculate CRC32 of buffer (buf, with len its length)*/
+unsigned lodepng_crc32(const unsigned char* buf, size_t len);
+#endif /*LODEPNG_COMPILE_PNG*/
+
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*
+This zlib part can be used independently to zlib compress and decompress a
+buffer. It cannot be used to create gzip files however, and it only supports the
+part of zlib that is required for PNG, it does not support dictionaries.
+*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Inflate a buffer. Inflate is the decompression step of deflate (see RFC 1951). Out buffer must be freed after use.*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings);
+
+/*
+Decompresses Zlib data. Reallocates the out buffer and appends the data. The
+data must be according to the zlib specification.
+Either *out must be NULL and *outsize must be 0, or *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Compresses data with Zlib. Reallocates the out buffer and appends the data.
+Zlib adds a small header and trailer around the deflate data.
+The data is output in the format of the zlib specification.
+Either *out must be NULL and *outsize must be 0, or *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings);
+
+/*
+Find length-limited Huffman code for given frequencies. This function is in the
+public interface only for tests; it's used internally by lodepng_deflate.
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+    size_t numcodes, unsigned maxbitlen);
+
+/*Compress a buffer with deflate. See RFC 1951. Out buffer must be freed after use.*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+    const unsigned char* in, size_t insize,
+    const LodePNGCompressSettings* settings);
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load a file from disk into buffer. The function allocates the out buffer, and
+after usage you should free it.
+out: output parameter, contains pointer to loaded buffer.
+outsize: output parameter, size of the allocated out buffer
+filename: the path to the file to load
+return value: error code (0 means ok)
+*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename);
+
+/*
+Save a buffer to a file on disk. Warning: if the file exists, this function
+overwrites it without warning!
+buffer: the buffer to write
+buffersize: size of the buffer to write
+filename: the path to the file to save to
+return value: error code (0 means ok)
+*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+
+#ifdef LODEPNG_COMPILE_CPP
+/* The LodePNG C++ wrapper uses std::vectors instead of manually allocated memory buffers. */
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_PNG
+    class State : public LodePNGState /* copyable C++ wrapper around the C LodePNGState struct */
+    {
+    public:
+        State();
+        State(const State& other);
+        virtual ~State();
+        State& operator=(const State& other);
+    };
+
+#ifdef LODEPNG_COMPILE_DECODER
+    /* Same as other lodepng::decode, but using a State for more settings and information. */
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const unsigned char* in, size_t insize);
+    unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+        State& state,
+        const std::vector<unsigned char>& in);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /* Same as other lodepng::encode, but using a State for more settings and information. */
+    unsigned encode(std::vector<unsigned char>& out,
+        const unsigned char* in, unsigned w, unsigned h,
+        State& state);
+    unsigned encode(std::vector<unsigned char>& out,
+        const std::vector<unsigned char>& in, unsigned w, unsigned h,
+        State& state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DISK
+    /*
+    Load a file from disk into a std::vector.
+    return value: error code (0 means ok)
+    */
+    unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename);
+
+    /*
+    Save the binary data in a std::vector to a file on disk. The file is overwritten
+    without warning.
+    */
+    unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_PNG */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+    /* Zlib-decompress an unsigned char buffer (settings default to lodepng_default_decompress_settings) */
+    unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+
+    /* Zlib-decompress a std::vector (settings default to lodepng_default_decompress_settings) */
+    unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+    /* Zlib-compress an unsigned char buffer (settings default to lodepng_default_compress_settings) */
+    unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+        const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+
+    /* Zlib-compress a std::vector (settings default to lodepng_default_compress_settings) */
+    unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+        const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+
+  /*
+  TODO:
+  [.] test if there are no memory leaks or security exploits - done a lot but needs to be checked often
+  [.] check compatibility with various compilers  - done but needs to be redone for every newer version
+  [X] converting color to 16-bit per channel types
+  [ ] read all public PNG chunk types (but never let the color profile and gamma ones touch RGB values)
+  [ ] make sure encoder generates no chunks with size > (2^31)-1
+  [ ] partial decoding (stream processing)
+  [X] let the "isFullyOpaque" function check color keys and transparent palettes too
+  [X] better name for the variables "codes", "codesD", "codelengthcodes", "clcl" and "lldl"
+  [ ] don't stop decoding on errors like 69, 57, 58 (make warnings)
+  [ ] make warnings like: oob palette, checksum fail, data after iend, wrong/unknown crit chunk, no null terminator in text, ...
+  [ ] let the C++ wrapper catch exceptions coming from the standard library and return LodePNG error codes
+  [ ] allow user to provide custom color conversion functions, e.g. for premultiplied alpha, padding bits or not, ...
+  [ ] allow user to give data (void*) to custom allocator
+  */
+
+#endif /*LODEPNG_H inclusion guard*/
+
+  /*
+  LodePNG Documentation
+  ---------------------
+
+  0. table of contents
+  --------------------
+
+  1. about
+  1.1. supported features
+  1.2. features not supported
+  2. C and C++ version
+  3. security
+  4. decoding
+  5. encoding
+  6. color conversions
+  6.1. PNG color types
+  6.2. color conversions
+  6.3. padding bits
+  6.4. A note about 16-bits per channel and endianness
+  7. error values
+  8. chunks and PNG editing
+  9. compiler support
+  10. examples
+  10.1. decoder C++ example
+  10.2. decoder C example
+  11. state settings reference
+  12. changes
+  13. contact information
+
+
+  1. about
+  --------
+
+  PNG is a file format to store raster images losslessly with good compression,
+  supporting different color types and alpha channel.
+
+  LodePNG is a PNG codec according to the Portable Network Graphics (PNG)
+  Specification (Second Edition) - W3C Recommendation 10 November 2003.
+
+  The specifications used are:
+
+  *) Portable Network Graphics (PNG) Specification (Second Edition):
+  http://www.w3.org/TR/2003/REC-PNG-20031110
+  *) RFC 1950 ZLIB Compressed Data Format version 3.3:
+  http://www.gzip.org/zlib/rfc-zlib.html
+  *) RFC 1951 DEFLATE Compressed Data Format Specification ver 1.3:
+  http://www.gzip.org/zlib/rfc-deflate.html
+
+  The most recent version of LodePNG can currently be found at
+  http://lodev.org/lodepng/
+
+  LodePNG works both in C (ISO C90) and C++, with a C++ wrapper that adds
+  extra functionality.
+
+  LodePNG consists of two files:
+  -lodepng.h: the header file for both C and C++
+  -lodepng.c(pp): give it the name lodepng.c or lodepng.cpp (or .cc) depending on your usage
+
+  If you want to start using LodePNG right away without reading this doc, get the
+  examples from the LodePNG website to see how to use it in code, or check the
+  smaller examples in chapter 10 here.
+
+  LodePNG is simple but only supports the basic requirements. To achieve
+  simplicity, the following design choices were made: There are no dependencies
+  on any external library. There are functions to decode and encode a PNG with
+  a single function call, and extended versions of these functions taking a
+  LodePNGState struct allowing to specify or get more information. By default
+  the colors of the raw image are always RGB or RGBA, no matter what color type
+  the PNG file uses. To read and write files, there are simple functions to
+  convert the files to/from buffers in memory.
+
+  This all makes LodePNG suitable for loading textures in games, demos and small
+  programs, ... It's less suitable for full fledged image editors, loading PNGs
+  over network (it requires all the image data to be available before decoding can
+  begin), life-critical systems, ...
+
+  1.1. supported features
+  -----------------------
+
+  The following features are supported by the decoder and encoder:
+
+  *) decoding of PNGs with any color type, bit depth and interlace mode, to a 24- or 32-bit color raw image,
+  or the same color type as the PNG
+  *) encoding of PNGs, from any raw image to 24- or 32-bit color, or the same color type as the raw image
+  *) Adam7 interlace and deinterlace for any color type
+  *) loading the image from harddisk or decoding it from a buffer from other sources than harddisk
+  *) support for alpha channels, including RGBA color model, translucent palettes and color keying
+  *) zlib decompression (inflate)
+  *) zlib compression (deflate)
+  *) CRC32 and ADLER32 checksums
+  *) handling of unknown chunks, allowing making a PNG editor that stores custom and unknown chunks.
+  *) the following chunks are supported (generated/interpreted) by both encoder and decoder:
+  IHDR: header information
+  PLTE: color palette
+  IDAT: pixel data
+  IEND: the final chunk
+  tRNS: transparency for palettized images
+  tEXt: textual information
+  zTXt: compressed textual information
+  iTXt: international textual information
+  bKGD: suggested background color
+  pHYs: physical dimensions
+  tIME: modification time
+
+  1.2. features not supported
+  ---------------------------
+
+  The following features are _not_ supported:
+
+  *) some features needed to make a conformant PNG-Editor might be still missing.
+  *) partial loading/stream processing. All data must be available and is processed in one call.
+  *) The following public chunks are not supported but treated as unknown chunks by LodePNG
+  cHRM, gAMA, iCCP, sRGB, sBIT, hIST, sPLT
+  Some of these are not supported on purpose: LodePNG wants to provide the RGB values
+  stored in the pixels, not values modified by system dependent gamma or color models.
+
+
+  2. C and C++ version
+  --------------------
+
+  The C version uses buffers allocated with alloc that you need to free()
+  yourself. You need to use init and cleanup functions for each struct whenever
+  using a struct from the C version to avoid exploits and memory leaks.
+
+  The C++ version has extra functions with std::vectors in the interface and the
+  lodepng::State class which is a LodePNGState with constructor and destructor.
+
+  These files work without modification for both C and C++ compilers because all
+  the additional C++ code is in "#ifdef __cplusplus" blocks that make C-compilers
+  ignore it, and the C code is made to compile both with strict ISO C90 and C++.
+
+  To use the C++ version, you need to rename the source file to lodepng.cpp
+  (instead of lodepng.c), and compile it with a C++ compiler.
+
+  To use the C version, you need to rename the source file to lodepng.c (instead
+  of lodepng.cpp), and compile it with a C compiler.
+
+
+  3. Security
+  -----------
+
+  Even if carefully designed, it's always possible that LodePNG contains possible
+  exploits. If you discover one, please let me know, and it will be fixed.
+
+  When using LodePNG, care has to be taken with the C version of LodePNG, as well
+  as the C-style structs when working with C++. The following conventions are used
+  for all C-style structs:
+
+  -if a struct has a corresponding init function, always call the init function when making a new one
+  -if a struct has a corresponding cleanup function, call it before the struct disappears to avoid memory leaks
+  -if a struct has a corresponding copy function, use the copy function instead of "=".
+  The destination must also be inited already.
+
+
+  4. Decoding
+  -----------
+
+  Decoding converts a PNG compressed image to a raw pixel buffer.
+
+  Most documentation on using the decoder is at its declarations in the header
+  above. For C, simple decoding can be done with functions such as
+  lodepng_decode32, and more advanced decoding can be done with the struct
+  LodePNGState and lodepng_decode. For C++, all decoding can be done with the
+  various lodepng::decode functions, and lodepng::State can be used for advanced
+  features.
+
+  When using the LodePNGState, it uses the following fields for decoding:
+  *) LodePNGInfo info_png: it stores extra information about the PNG (the input) in here
+  *) LodePNGColorMode info_raw: here you can say what color mode of the raw image (the output) you want to get
+  *) LodePNGDecoderSettings decoder: you can specify a few extra settings for the decoder to use
+
+  LodePNGInfo info_png
+  --------------------
+
+  After decoding, this contains extra information of the PNG image, except the actual
+  pixels, width and height because these are already gotten directly from the decoder
+  functions.
+
+  It contains for example the original color type of the PNG image, text comments,
+  suggested background color, etc... More details about the LodePNGInfo struct are
+  at its declaration documentation.
+
+  LodePNGColorMode info_raw
+  -------------------------
+
+  When decoding, here you can specify which color type you want
+  the resulting raw image to be. If this is different from the colortype of the
+  PNG, then the decoder will automatically convert the result. This conversion
+  always works, except if you want it to convert a color PNG to greyscale or to
+  a palette with missing colors.
+
+  By default, 32-bit color is used for the result.
+
+  LodePNGDecoderSettings decoder
+  ------------------------------
+
+  The settings can be used to ignore the errors created by invalid CRC and Adler32
+  chunks, and to disable the decoding of tEXt chunks.
+
+  There's also a setting color_convert, true by default. If false, no conversion
+  is done, the resulting data will be as it was in the PNG (after decompression)
+  and you'll have to puzzle the colors of the pixels together yourself using the
+  color type information in the LodePNGInfo.
+
+
+  5. Encoding
+  -----------
+
+  Encoding converts a raw pixel buffer to a PNG compressed image.
+
+  Most documentation on using the encoder is at its declarations in the header
+  above. For C, simple encoding can be done with functions such as
+  lodepng_encode32, and more advanced encoding can be done with the struct
+  LodePNGState and lodepng_encode. For C++, all encoding can be done with the
+  various lodepng::encode functions, and lodepng::State can be used for advanced
+  features.
+
+  Like the decoder, the encoder can also give errors. However it gives fewer errors
+  since the encoder input is trusted, the decoder input (a PNG image that could
+  be forged by anyone) is not trusted.
+
+  When using the LodePNGState, it uses the following fields for encoding:
+  *) LodePNGInfo info_png: here you specify how you want the PNG (the output) to be.
+  *) LodePNGColorMode info_raw: here you say what color type the raw image (the input) has
+  *) LodePNGEncoderSettings encoder: you can specify a few settings for the encoder to use
+
+  LodePNGInfo info_png
+  --------------------
+
+  When encoding, you use this the opposite way as when decoding: for encoding,
+  you fill in the values you want the PNG to have before encoding. By default it's
+  not needed to specify a color type for the PNG since it's automatically chosen,
+  but it's possible to choose it yourself given the right settings.
+
+  The encoder will not always exactly match the LodePNGInfo struct you give,
+  it tries as close as possible. Some things are ignored by the encoder. The
+  encoder uses, for example, the following settings from it when applicable:
+  colortype and bitdepth, text chunks, time chunk, the color key, the palette, the
+  background color, the interlace method, unknown chunks, ...
+
+  When encoding to a PNG with colortype 3, the encoder will generate a PLTE chunk.
+  If the palette contains any colors for which the alpha channel is not 255 (so
+  there are translucent colors in the palette), it'll add a tRNS chunk.
+
+  LodePNGColorMode info_raw
+  -------------------------
+
+  You specify the color type of the raw image that you give to the input here,
+  including a possible transparent color key and palette you happen to be using in
+  your raw image data.
+
+  By default, 32-bit color is assumed, meaning your input has to be in RGBA
+  format with 4 bytes (unsigned chars) per pixel.
+
+  LodePNGEncoderSettings encoder
+  ------------------------------
+
+  The following settings are supported (some are in sub-structs):
+  *) auto_convert: when this option is enabled, the encoder will
+  automatically choose the smallest possible color mode (including color key) that
+  can encode the colors of all pixels without information loss.
+  *) btype: the block type for LZ77. 0 = uncompressed, 1 = fixed huffman tree,
+  2 = dynamic huffman tree (best compression). Should be 2 for proper
+  compression.
+  *) use_lz77: whether or not to use LZ77 for compressed block types. Should be
+  true for proper compression.
+  *) windowsize: the window size used by the LZ77 encoder (1 - 32768). Has value
+  2048 by default, but can be set to 32768 for better, but slow, compression.
+  *) force_palette: if colortype is 2 or 6, you can make the encoder write a PLTE
+  chunk if force_palette is true. This can be used as a suggested palette to convert
+  to by viewers that don't support more than 256 colors (if those still exist)
+  *) add_id: add text chunk "Encoder: LodePNG <version>" to the image.
+  *) text_compression: default 1. If 1, it'll store texts as zTXt instead of tEXt chunks.
+  zTXt chunks use zlib compression on the text. This gives a smaller result on
+  large texts but a larger result on small texts (such as a single program name).
+  It's all tEXt or all zTXt though, there's no separate setting per text yet.
+
+
+  6. color conversions
+  --------------------
+
+  An important thing to note about LodePNG, is that the color type of the PNG, and
+  the color type of the raw image, are completely independent. By default, when
+  you decode a PNG, you get the result as a raw image in the color type you want,
+  no matter whether the PNG was encoded with a palette, greyscale or RGBA color.
+  And if you encode an image, by default LodePNG will automatically choose the PNG
+  color type that gives good compression based on the values of colors and amount
+  of colors in the image. It can be configured to let you control it instead as
+  well, though.
+
+  To be able to do this, LodePNG does conversions from one color mode to another.
+  It can convert from almost any color type to any other color type, except the
+  following conversions: RGB to greyscale is not supported, and converting to a
+  palette when the palette doesn't have a required color is not supported. This is
+  not supported on purpose: this is information loss which requires a color
+  reduction algorithm that is beyond the scope of a PNG encoder (yes, RGB to grey
+  is easy, but there are multiple ways if you want to give some channels more
+  weight).
+
+  By default, when decoding, you get the raw image in 32-bit RGBA or 24-bit RGB
+  color, no matter what color type the PNG has. And by default when encoding,
+  LodePNG automatically picks the best color model for the output PNG, and expects
+  the input image to be 32-bit RGBA or 24-bit RGB. So, unless you want to control
+  the color format of the images yourself, you can skip this chapter.
+
+  6.1. PNG color types
+  --------------------
+
+  A PNG image can have many color types, ranging from 1-bit color to 64-bit color,
+  as well as palettized color modes. After the zlib decompression and unfiltering
+  in the PNG image is done, the raw pixel data will have that color type and thus
+  a certain amount of bits per pixel. If you want the output raw image after
+  decoding to have another color type, a conversion is done by LodePNG.
+
+  The PNG specification gives the following color types:
+
+  0: greyscale, bit depths 1, 2, 4, 8, 16
+  2: RGB, bit depths 8 and 16
+  3: palette, bit depths 1, 2, 4 and 8
+  4: greyscale with alpha, bit depths 8 and 16
+  6: RGBA, bit depths 8 and 16
+
+  Bit depth is the amount of bits per pixel per color channel. So the total amount
+  of bits per pixel is: amount of channels * bitdepth.
+
+  6.2. color conversions
+  ----------------------
+
+  As explained in the sections about the encoder and decoder, you can specify
+  color types and bit depths in info_png and info_raw to change the default
+  behaviour.
+
+  If, when decoding, you want the raw image to be something else than the default,
+  you need to set the color type and bit depth you want in the LodePNGColorMode,
+  or the parameters colortype and bitdepth of the simple decoding function.
+
+  If, when encoding, you use another color type than the default in the raw input
+  image, you need to specify its color type and bit depth in the LodePNGColorMode
+  of the raw image, or use the parameters colortype and bitdepth of the simple
+  encoding function.
+
+  If, when encoding, you don't want LodePNG to choose the output PNG color type
+  but control it yourself, you need to set auto_convert in the encoder settings
+  to false, and specify the color type you want in the LodePNGInfo of the
+  encoder (including palette: it can generate a palette if auto_convert is true,
+  otherwise not).
+
+  If the input and output color type differ (whether user chosen or auto chosen),
+  LodePNG will do a color conversion, which follows the rules below, and may
+  sometimes result in an error.
+
+  To avoid some confusion:
+  -the decoder converts from PNG to raw image
+  -the encoder converts from raw image to PNG
+  -the colortype and bitdepth in LodePNGColorMode info_raw, are those of the raw image
+  -the colortype and bitdepth in the color field of LodePNGInfo info_png, are those of the PNG
+  -when encoding, the color type in LodePNGInfo is ignored if auto_convert
+  is enabled, it is automatically generated instead
+  -when decoding, the color type in LodePNGInfo is set by the decoder to that of the original
+  PNG image, but it can be ignored since the raw image has the color type you requested instead
+  -if the color type of the LodePNGColorMode and PNG image aren't the same, a conversion
+  between the color types is done if the color types are supported. If it is not
+  supported, an error is returned. If the types are the same, no conversion is done.
+  -even though some conversions aren't supported, LodePNG supports loading PNGs from any
+  colortype and saving PNGs to any colortype, sometimes it just requires preparing
+  the raw image correctly before encoding.
+  -both encoder and decoder use the same color converter.
+
+  Non supported color conversions:
+  -color to greyscale: no error is thrown, but the result will look ugly because
+  only the red channel is taken
+  -anything to palette when that palette does not have that color in it: in this
+  case an error is thrown
+
+  Supported color conversions:
+  -anything to 8-bit RGB, 8-bit RGBA, 16-bit RGB, 16-bit RGBA
+  -any grey or grey+alpha, to grey or grey+alpha
+  -anything to a palette, as long as the palette has the requested colors in it
+  -removing alpha channel
+  -higher to smaller bitdepth, and vice versa
+
+  If you want no color conversion to be done (e.g. for speed or control):
+  -In the encoder, you can make it save a PNG with any color type by giving the
+  raw color mode and LodePNGInfo the same color mode, and setting auto_convert to
+  false.
+  -In the decoder, you can make it store the pixel data in the same color type
+  as the PNG has, by setting the color_convert setting to false. Settings in
+  info_raw are then ignored.
+
+  The function lodepng_convert does the color conversion. It is available in the
+  interface but normally isn't needed since the encoder and decoder already call
+  it.
+
+  6.3. padding bits
+  -----------------
+
+  In the PNG file format, if a less than 8-bit per pixel color type is used and the scanlines
+  have a bit amount that isn't a multiple of 8, then padding bits are used so that each
+  scanline starts at a fresh byte. But that is NOT true for the LodePNG raw input and output.
+  The raw input image you give to the encoder, and the raw output image you get from the decoder
+  will NOT have these padding bits, e.g. in the case of a 1-bit image with a width
+  of 7 pixels, the first pixel of the second scanline will be the 8th bit of the first byte,
+  not the first bit of a new byte.
+
+  6.4. A note about 16-bits per channel and endianness
+  ----------------------------------------------------
+
+  LodePNG uses unsigned char arrays for 16-bit per channel colors too, just like
+  for any other color format. The 16-bit values are stored in big endian (most
+  significant byte first) in these arrays. This is the opposite order of the
+  little endian used by x86 CPU's.
+
+  LodePNG always uses big endian because the PNG file format does so internally.
+  Conversions to other formats than PNG uses internally are not supported by
+  LodePNG on purpose, there are myriads of formats, including endianness of 16-bit
+  colors, the order in which you store R, G, B and A, and so on. Supporting and
+  converting to/from all that is outside the scope of LodePNG.
+
+  This may mean that, depending on your use case, you may want to convert the big
+  endian output of LodePNG to little endian with a for loop. This is certainly not
+  always needed, many applications and libraries support big endian 16-bit colors
+  anyway, but it means you cannot simply cast the unsigned char* buffer to an
+  unsigned short* buffer on x86 CPUs.
+
+
+  7. error values
+  ---------------
+
+  All functions in LodePNG that return an error code, return 0 if everything went
+  OK, or a non-zero code if there was an error.
+
+  The meaning of the LodePNG error values can be retrieved with the function
+  lodepng_error_text: given the numerical error code, it returns a description
+  of the error in English as a string.
+
+  Check the implementation of lodepng_error_text to see the meaning of each code.
+
+
+  8. chunks and PNG editing
+  -------------------------
+
+  If you want to add extra chunks to a PNG you encode, or use LodePNG for a PNG
+  editor that should follow the rules about handling of unknown chunks, or if your
+  program is able to read other types of chunks than the ones handled by LodePNG,
+  then that's possible with the chunk functions of LodePNG.
+
+  A PNG chunk has the following layout:
+
+  4 bytes length
+  4 bytes type name
+  length bytes data
+  4 bytes CRC
+
+  8.1. iterating through chunks
+  -----------------------------
+
+  If you have a buffer containing the PNG image data, then the first chunk (the
+  IHDR chunk) starts at byte number 8 of that buffer. The first 8 bytes are the
+  signature of the PNG and are not part of a chunk. But if you start at byte 8
+  then you have a chunk, and can check the following things of it.
+
+  NOTE: none of these functions check for memory buffer boundaries. To avoid
+  exploits, always make sure the buffer contains all the data of the chunks.
+  When using lodepng_chunk_next, make sure the returned value is within the
+  allocated memory.
+
+  unsigned lodepng_chunk_length(const unsigned char* chunk):
+
+  Get the length of the chunk's data. The total chunk length is this length + 12.
+
+  void lodepng_chunk_type(char type[5], const unsigned char* chunk):
+  unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type):
+
+  Get the type of the chunk or compare if it's a certain type
+
+  unsigned char lodepng_chunk_critical(const unsigned char* chunk):
+  unsigned char lodepng_chunk_private(const unsigned char* chunk):
+  unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk):
+
+  Check if the chunk is critical in the PNG standard (only IHDR, PLTE, IDAT and IEND are).
+  Check if the chunk is private (public chunks are part of the standard, private ones not).
+  Check if the chunk is safe to copy. If it's not, then, when modifying data in a critical
+  chunk, unsafe to copy chunks of the old image may NOT be saved in the new one if your
+  program doesn't handle that type of unknown chunk.
+
+  unsigned char* lodepng_chunk_data(unsigned char* chunk):
+  const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk):
+
+  Get a pointer to the start of the data of the chunk.
+
+  unsigned lodepng_chunk_check_crc(const unsigned char* chunk):
+  void lodepng_chunk_generate_crc(unsigned char* chunk):
+
+  Check if the crc is correct or generate a correct one.
+
+  unsigned char* lodepng_chunk_next(unsigned char* chunk):
+  const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk):
+
+  Iterate to the next chunk. This works if you have a buffer with consecutive chunks. Note that these
+  functions do no boundary checking of the allocated data whatsoever, so make sure there is enough
+  data available in the buffer to be able to go to the next chunk.
+
+  unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk):
+  unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+  const char* type, const unsigned char* data):
+
+  These functions are used to create new chunks that are appended to the data in *out that has
+  length *outlength. The append function appends an existing chunk to the new data. The create
+  function creates a new chunk with the given parameters and appends it. Type is the 4-letter
+  name of the chunk.
+
+  8.2. chunks in info_png
+  -----------------------
+
+  The LodePNGInfo struct contains fields with the unknown chunk in it. It has 3
+  buffers (each with size) to contain 3 types of unknown chunks:
+  the ones that come before the PLTE chunk, the ones that come between the PLTE
+  and the IDAT chunks, and the ones that come after the IDAT chunks.
+  It's necessary to make the distinction between these 3 cases because the PNG
+  standard forces to keep the ordering of unknown chunks compared to the critical
+  chunks, but does not force any other ordering rules.
+
+  info_png.unknown_chunks_data[0] is the chunks before PLTE
+  info_png.unknown_chunks_data[1] is the chunks after PLTE, before IDAT
+  info_png.unknown_chunks_data[2] is the chunks after IDAT
+
+  The chunks in these 3 buffers can be iterated through and read by using the same
+  way described in the previous subchapter.
+
+  When using the decoder to decode a PNG, you can make it store all unknown chunks
+  if you set the option state.decoder.remember_unknown_chunks to 1. By default, this
+  option is off (0).
+
+  The encoder will always encode unknown chunks that are stored in the info_png.
+  If you need it to add a particular chunk that isn't known by LodePNG, you can
+  use lodepng_chunk_append or lodepng_chunk_create to append it to the chunk data in
+  info_png.unknown_chunks_data[x].
+
+  Chunks that are known by LodePNG should not be added in that way. E.g. to make
+  LodePNG add a bKGD chunk, set background_defined to true and add the correct
+  parameters there instead.
+
+
+  9. compiler support
+  -------------------
+
+  No libraries other than the current standard C library are needed to compile
+  LodePNG. For the C++ version, only the standard C++ library is needed on top.
+  Add the files lodepng.c(pp) and lodepng.h to your project, include
+  lodepng.h where needed, and your program can read/write PNG files.
+
+  It is compatible with C90 and up, and C++03 and up.
+
+  If performance is important, use optimization when compiling! For both the
+  encoder and decoder, this makes a large difference.
+
+  Make sure that LodePNG is compiled with the same compiler of the same version
+  and with the same settings as the rest of the program, or the interfaces with
+  std::vectors and std::strings in C++ can be incompatible.
+
+  CHAR_BIT must be 8 or higher, because LodePNG uses unsigned chars for octets.
+
+  *) gcc and g++
+
+  LodePNG is developed in gcc so this compiler is natively supported. It gives no
+  warnings with compiler options "-Wall -Wextra -pedantic -ansi", with gcc and g++
+  version 4.7.1 on Linux, 32-bit and 64-bit.
+
+  *) Clang
+
+  Fully supported and warning-free.
+
+  *) Mingw
+
+  The Mingw compiler (a port of gcc for Windows) should be fully supported by
+  LodePNG.
+
+  *) Visual Studio and Visual C++ Express Edition
+
+  LodePNG should be warning-free with warning level W4. Two warnings were disabled
+  with pragmas though: warning 4244 about implicit conversions, and warning 4996
+  where it wants to use a non-standard function fopen_s instead of the standard C
+  fopen.
+
+  Visual Studio may want "stdafx.h" files to be included in each source file and
+  give an error "unexpected end of file while looking for precompiled header".
+  This is not standard C++ and will not be added to the stock LodePNG. You can
+  disable it for lodepng.cpp only by right clicking it, Properties, C/C++,
+  Precompiled Headers, and set it to Not Using Precompiled Headers there.
+
+  NOTE: Modern versions of VS should be fully supported, but old versions, e.g.
+  VS6, are not guaranteed to work.
+
+  *) Compilers on Macintosh
+
+  LodePNG has been reported to work both with gcc and LLVM for Macintosh, both for
+  C and C++.
+
+  *) Other Compilers
+
+  If you encounter problems on any compilers, feel free to let me know and I may
+  try to fix it if the compiler is modern and standards compliant.
+
+
+  10. examples
+  ------------
+
+  This decoder example shows the most basic usage of LodePNG. More complex
+  examples can be found on the LodePNG website.
+
+  10.1. decoder C++ example
+  -------------------------
+
+  #include "lodepng.h"
+  #include <iostream>
+
+  int main(int argc, char *argv[])
+  {
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  //load and decode
+  std::vector<unsigned char> image;
+  unsigned width, height;
+  unsigned error = lodepng::decode(image, width, height, filename);
+
+  //if there's an error, display it
+  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
+
+  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
+  }
+
+  10.2. decoder C example
+  -----------------------
+
+  #include "lodepng.h"
+
+  int main(int argc, char *argv[])
+  {
+  unsigned error;
+  unsigned char* image;
+  unsigned width, height;
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  error = lodepng_decode32_file(&image, &width, &height, filename);
+
+  if(error) printf("decoder error %u: %s\n", error, lodepng_error_text(error));
+
+  / * use image here * /
+
+  free(image);
+  return 0;
+  }
+
+  11. state settings reference
+  ----------------------------
+
+  A quick reference of some settings to set on the LodePNGState
+
+  For decoding:
+
+  state.decoder.zlibsettings.ignore_adler32: ignore ADLER32 checksums
+  state.decoder.zlibsettings.custom_...: use custom inflate function
+  state.decoder.ignore_crc: ignore CRC checksums
+  state.decoder.color_convert: convert internal PNG color to chosen one
+  state.decoder.read_text_chunks: whether to read in text metadata chunks
+  state.decoder.remember_unknown_chunks: whether to read in unknown chunks
+  state.info_raw.colortype: desired color type for decoded image
+  state.info_raw.bitdepth: desired bit depth for decoded image
+  state.info_raw....: more color settings, see struct LodePNGColorMode
+  state.info_png....: no settings for decoder but output, see struct LodePNGInfo
+
+  For encoding:
+
+  state.encoder.zlibsettings.btype: disable compression by setting it to 0
+  state.encoder.zlibsettings.use_lz77: use LZ77 in compression
+  state.encoder.zlibsettings.windowsize: tweak LZ77 windowsize
+  state.encoder.zlibsettings.minmatch: tweak min LZ77 length to match
+  state.encoder.zlibsettings.nicematch: tweak LZ77 match where to stop searching
+  state.encoder.zlibsettings.lazymatching: try one more LZ77 matching
+  state.encoder.zlibsettings.custom_...: use custom deflate function
+  state.encoder.auto_convert: choose optimal PNG color type, if 0 uses info_png
+  state.encoder.filter_palette_zero: PNG filter strategy for palette
+  state.encoder.filter_strategy: PNG filter strategy to encode with
+  state.encoder.force_palette: add palette even if not encoding to one
+  state.encoder.add_id: add LodePNG identifier and version as a text chunk
+  state.encoder.text_compression: use compressed text chunks for metadata
+  state.info_raw.colortype: color type of raw input image you provide
+  state.info_raw.bitdepth: bit depth of raw input image you provide
+  state.info_raw: more color settings, see struct LodePNGColorMode
+  state.info_png.color.colortype: desired color type if auto_convert is false
+  state.info_png.color.bitdepth: desired bit depth if auto_convert is false
+  state.info_png.color....: more color settings, see struct LodePNGColorMode
+  state.info_png....: more PNG related settings, see struct LodePNGInfo
+
+
+  12. changes
+  -----------
+
+  The version number of LodePNG is the date of the change given in the format
+  yyyymmdd.
+
+  Some changes aren't backwards compatible. Those are indicated with a (!)
+  symbol.
+
+  *) 17 sep 2017: fix memory leak for some encoder input error cases
+  *) 27 nov 2016: grey+alpha auto color model detection bugfix
+  *) 18 apr 2016: Changed qsort to custom stable sort (for platforms w/o qsort).
+  *) 09 apr 2016: Fixed colorkey usage detection, and better file loading (within
+  the limits of pure C90).
+  *) 08 dec 2015: Made load_file function return error if file can't be opened.
+  *) 24 okt 2015: Bugfix with decoding to palette output.
+  *) 18 apr 2015: Boundary PM instead of just package-merge for faster encoding.
+  *) 23 aug 2014: Reduced needless memory usage of decoder.
+  *) 28 jun 2014: Removed fix_png setting, always support palette OOB for
+  simplicity. Made ColorProfile public.
+  *) 09 jun 2014: Faster encoder by fixing hash bug and more zeros optimization.
+  *) 22 dec 2013: Power of two windowsize required for optimization.
+  *) 15 apr 2013: Fixed bug with LAC_ALPHA and color key.
+  *) 25 mar 2013: Added an optional feature to ignore some PNG errors (fix_png).
+  *) 11 mar 2013 (!): Bugfix with custom free. Changed from "my" to "lodepng_"
+  prefix for the custom allocators and made it possible with a new #define to
+  use custom ones in your project without needing to change lodepng's code.
+  *) 28 jan 2013: Bugfix with color key.
+  *) 27 okt 2012: Tweaks in text chunk keyword length error handling.
+  *) 8 okt 2012 (!): Added new filter strategy (entropy) and new auto color mode.
+  (no palette). Better deflate tree encoding. New compression tweak settings.
+  Faster color conversions while decoding. Some internal cleanups.
+  *) 23 sep 2012: Reduced warnings in Visual Studio a little bit.
+  *) 1 sep 2012 (!): Removed #define's for giving custom (de)compression functions
+  and made it work with function pointers instead.
+  *) 23 jun 2012: Added more filter strategies. Made it easier to use custom alloc
+  and free functions and toggle #defines from compiler flags. Small fixes.
+  *) 6 may 2012 (!): Made plugging in custom zlib/deflate functions more flexible.
+  *) 22 apr 2012 (!): Made interface more consistent, renaming a lot. Removed
+  redundant C++ codec classes. Reduced amount of structs. Everything changed,
+  but it is cleaner now imho and functionality remains the same. Also fixed
+  several bugs and shrunk the implementation code. Made new samples.
+  *) 6 nov 2011 (!): By default, the encoder now automatically chooses the best
+  PNG color model and bit depth, based on the amount and type of colors of the
+  raw image. For this, autoLeaveOutAlphaChannel replaced by auto_choose_color.
+  *) 9 okt 2011: simpler hash chain implementation for the encoder.
+  *) 8 sep 2011: lz77 encoder lazy matching instead of greedy matching.
+  *) 23 aug 2011: tweaked the zlib compression parameters after benchmarking.
+  A bug with the PNG filtertype heuristic was fixed, so that it chooses much
+  better ones (it's quite significant). A setting to do an experimental, slow,
+  brute force search for PNG filter types is added.
+  *) 17 aug 2011 (!): changed some C zlib related function names.
+  *) 16 aug 2011: made the code less wide (max 120 characters per line).
+  *) 17 apr 2011: code cleanup. Bugfixes. Convert low to 16-bit per sample colors.
+  *) 21 feb 2011: fixed compiling for C90. Fixed compiling with sections disabled.
+  *) 11 dec 2010: encoding is made faster, based on suggestion by Peter Eastman
+  to optimize long sequences of zeros.
+  *) 13 nov 2010: added LodePNG_InfoColor_hasPaletteAlpha and
+  LodePNG_InfoColor_canHaveAlpha functions for convenience.
+  *) 7 nov 2010: added LodePNG_error_text function to get error code description.
+  *) 30 okt 2010: made decoding slightly faster
+  *) 26 okt 2010: (!) changed some C function and struct names (more consistent).
+  Reorganized the documentation and the declaration order in the header.
+  *) 08 aug 2010: only changed some comments and external samples.
+  *) 05 jul 2010: fixed bug thanks to warnings in the new gcc version.
+  *) 14 mar 2010: fixed bug where too much memory was allocated for char buffers.
+  *) 02 sep 2008: fixed bug where it could create empty tree that linux apps could
+  read by ignoring the problem but windows apps couldn't.
+  *) 06 jun 2008: added more error checks for out of memory cases.
+  *) 26 apr 2008: added a few more checks here and there to ensure more safety.
+  *) 06 mar 2008: crash with encoding of strings fixed
+  *) 02 feb 2008: support for international text chunks added (iTXt)
+  *) 23 jan 2008: small cleanups, and #defines to divide code in sections
+  *) 20 jan 2008: support for unknown chunks allowing using LodePNG for an editor.
+  *) 18 jan 2008: support for tIME and pHYs chunks added to encoder and decoder.
+  *) 17 jan 2008: ability to encode and decode compressed zTXt chunks added
+  Also various fixes, such as in the deflate and the padding bits code.
+  *) 13 jan 2008: Added ability to encode Adam7-interlaced images. Improved
+  filtering code of encoder.
+  *) 07 jan 2008: (!) changed LodePNG to use ISO C90 instead of C++. A
+  C++ wrapper around this provides an interface almost identical to before.
+  Having LodePNG be pure ISO C90 makes it more portable. The C and C++ code
+  are together in these files but it works both for C and C++ compilers.
+  *) 29 dec 2007: (!) changed most integer types to unsigned int + other tweaks
+  *) 30 aug 2007: bug fixed which makes this Borland C++ compatible
+  *) 09 aug 2007: some VS2005 warnings removed again
+  *) 21 jul 2007: deflate code placed in new namespace separate from zlib code
+  *) 08 jun 2007: fixed bug with 2- and 4-bit color, and small interlaced images
+  *) 04 jun 2007: improved support for Visual Studio 2005: crash with accessing
+  invalid std::vector element [0] fixed, and level 3 and 4 warnings removed
+  *) 02 jun 2007: made the encoder add a tag with version by default
+  *) 27 may 2007: zlib and png code separated (but still in the same file),
+  simple encoder/decoder functions added for more simple usage cases
+  *) 19 may 2007: minor fixes, some code cleaning, new error added (error 69),
+  moved some examples from here to lodepng_examples.cpp
+  *) 12 may 2007: palette decoding bug fixed
+  *) 24 apr 2007: changed the license from BSD to the zlib license
+  *) 11 mar 2007: very simple addition: ability to encode bKGD chunks.
+  *) 04 mar 2007: (!) tEXt chunk related fixes, and support for encoding
+  palettized PNG images. Plus little interface change with palette and texts.
+  *) 03 mar 2007: Made it encode dynamic Huffman shorter with repeat codes.
+  Fixed a bug where the end code of a block had length 0 in the Huffman tree.
+  *) 26 feb 2007: Huffman compression with dynamic trees (BTYPE 2) now implemented
+  and supported by the encoder, resulting in smaller PNGs at the output.
+  *) 27 jan 2007: Made the Adler-32 test faster so that a timewaste is gone.
+  *) 24 jan 2007: gave encoder an error interface. Added color conversion from any
+  greyscale type to 8-bit greyscale with or without alpha.
+  *) 21 jan 2007: (!) Totally changed the interface. It allows more color types
+  to convert to and is more uniform. See the manual for how it works now.
+  *) 07 jan 2007: Some cleanup & fixes, and a few changes over the last days:
+  encode/decode custom tEXt chunks, separate classes for zlib & deflate, and
+  at last made the decoder give errors for incorrect Adler32 or Crc.
+  *) 01 jan 2007: Fixed bug with encoding PNGs with less than 8 bits per channel.
+  *) 29 dec 2006: Added support for encoding images without alpha channel, and
+  cleaned out code as well as making certain parts faster.
+  *) 28 dec 2006: Added "Settings" to the encoder.
+  *) 26 dec 2006: The encoder now does LZ77 encoding and produces much smaller files now.
+  Removed some code duplication in the decoder. Fixed little bug in an example.
+  *) 09 dec 2006: (!) Placed output parameters of public functions as first parameter.
+  Fixed a bug of the decoder with 16-bit per color.
+  *) 15 okt 2006: Changed documentation structure
+  *) 09 okt 2006: Encoder class added. It encodes a valid PNG image from the
+  given image buffer, however for now it's not compressed.
+  *) 08 sep 2006: (!) Changed to interface with a Decoder class
+  *) 30 jul 2006: (!) LodePNG_InfoPng , width and height are now retrieved in different
+  way. Renamed decodePNG to decodePNGGeneric.
+  *) 29 jul 2006: (!) Changed the interface: image info is now returned as a
+  struct of type LodePNG::LodePNG_Info, instead of a vector, which was a bit clumsy.
+  *) 28 jul 2006: Cleaned the code and added new error checks.
+  Corrected terminology "deflate" into "inflate".
+  *) 23 jun 2006: Added SDL example in the documentation in the header, this
+  example allows easy debugging by displaying the PNG and its transparency.
+  *) 22 jun 2006: (!) Changed way to obtain error value. Added
+  loadFile function for convenience. Made decodePNG32 faster.
+  *) 21 jun 2006: (!) Changed type of info vector to unsigned.
+  Changed position of palette in info vector. Fixed an important bug that
+  happened on PNGs with an uncompressed block.
+  *) 16 jun 2006: Internally changed unsigned into unsigned where
+  needed, and performed some optimizations.
+  *) 07 jun 2006: (!) Renamed functions to decodePNG and placed them
+  in LodePNG namespace. Changed the order of the parameters. Rewrote the
+  documentation in the header. Renamed files to lodepng.cpp and lodepng.h
+  *) 22 apr 2006: Optimized and improved some code
+  *) 07 sep 2005: (!) Changed to std::vector interface
+  *) 12 aug 2005: Initial release (C++, decoder only)
+
+
+  13. contact information
+  -----------------------
+
+  Feel free to contact me with suggestions, problems, comments, ... concerning
+  LodePNG. If you encounter a PNG image that doesn't work properly with this
+  decoder, feel free to send it and I'll use it to find and fix the problem.
+
+  My email address is (puzzle the account and domain together with an @ symbol):
+  Domain: gmail dot com.
+  Account: lode dot vandevenne.
+
+
+  Copyright (c) 2005-2017 Lode Vandevenne
+  */
\ No newline at end of file
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/main.cpp b/c_cxx/ort_tutorial/30_syncstreams-cuda/main.cpp
new file mode 100644
index 000000000..54bac7d6c
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/main.cpp
@@ -0,0 +1,210 @@
+// Minimal C++ example for using CopyTensors EP agnostically and using syncstream
+// Model taken from : https://github.com/yakhyo/fast-neural-style-transfer under MIT license
+// Goals:
+//   - Avoid serial CPU <-> GPU transfers at each inference.
+//
+
+#include <cstdlib>
+#include <exception>
+
+#include <cuda_runtime.h>
+#include <onnxruntime/core/graph/constants.h>
+#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
+#include <onnxruntime/core/session/onnxruntime_run_options_config_keys.h>
+#include <onnxruntime/core/session/onnxruntime_session_options_config_keys.h>
+#include <stdio.h>
+
+#include "utils.h"
+
+using StreamUniquePtr = std::unique_ptr<OrtSyncStream, std::function<void(OrtSyncStream*)>>;
+using OrtFileString = std::basic_string<ORTCHAR_T>;
+
+// Converts a path to ORT's file-string type; path::string<T>() transcodes, so non-ASCII paths survive on Windows.
+static OrtFileString toOrtFileString(const std::filesystem::path& path) {
+  return path.string<ORTCHAR_T>();
+}
+
+// Width and height, in pixels, of the (square) image file loaded from disk.
+constexpr int LOADED_IMAGE_DIM = 1080;
+// Width and height of the square sub-region inference runs on; equal to LOADED_IMAGE_DIM, i.e. the whole image.
+constexpr int INFERENCE_IMAGE_DIM = 1080;
+
+// Host buffer that receives the inference result: 3 channels of INFERENCE_IMAGE_DIM x INFERENCE_IMAGE_DIM floats.
+// Only the input buffer is allocated from pinned (page-locked) memory in main(); this one is a plain std::vector.
+std::vector<float> cpuOutputFloat(3 * INFERENCE_IMAGE_DIM * INFERENCE_IMAGE_DIM);
+
+// Runs candy.onnx on the TensorRT RTX EP: the pinned host image is uploaded on a dedicated stream, the compute
+// stream waits for that upload, inference runs through IoBinding and the result is written to output.png.
+int main() {
+  try {
+    OrtApi const& ortApi = Ort::GetApi();
+    Ort::Env ortEnvironment(ORT_LOGGING_LEVEL_WARNING, "HelloOrtNv");
+    Ort::SessionOptions sessionOptions;
+    sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
+    sessionOptions.DisableMemPattern();
+    sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+    CHECK_ORT(ortApi.AddFreeDimensionOverrideByName(sessionOptions, "batch_size", 1));
+
+    // Library paths stay std::filesystem::path: a path does not convert implicitly to std::string on Windows.
+    const auto trtLibPath = get_executable_parent_path() / DLL_NAME("onnxruntime_providers_nv_tensorrt_rtx");
+    CHECK_ORT(
+        ortApi.RegisterExecutionProviderLibrary(ortEnvironment, "NvTensorRtRtx", toOrtFileString(trtLibPath).c_str()));
+
+    const auto cudaLibPath = get_executable_parent_path() / DLL_NAME("onnxruntime_providers_cuda");
+    if (std::filesystem::is_regular_file(cudaLibPath)) {
+      try {
+        CHECK_ORT(ortApi.RegisterExecutionProviderLibrary(ortEnvironment, "Cuda", toOrtFileString(cudaLibPath).c_str()));
+      } catch (const std::exception&) {  // the CUDA EP is optional: report the failure and carry on
+        LOG("Failed to load Cuda execution provider.");
+      }
+    }
+    sessionOptions.SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy_PREFER_GPU);
+
+    const OrtEpDevice* const* ep_devices = nullptr;
+    size_t num_ep_devices = 0;
+    CHECK_ORT(ortApi.GetEpDevices(ortEnvironment, &ep_devices, &num_ep_devices));
+    const OrtEpDevice* trt_ep_device = nullptr;
+    for (size_t i = 0; i < num_ep_devices; i++) {
+      if (strcmp(ortApi.EpDevice_EpName(ep_devices[i]), onnxruntime::kNvTensorRTRTXExecutionProvider) == 0) {
+        trt_ep_device = ep_devices[i];
+        break;
+      }
+    }
+    if (trt_ep_device == nullptr) {
+      LOG("Error: could not select TensorRT RTX execution provider!");
+      return EXIT_FAILURE;
+    }
+
+    // One stream for compute and one for copies; each handle gets its owner right after it is created.
+    const auto release_stream = [&ortApi](OrtSyncStream* s) { ortApi.ReleaseSyncStream(s); };
+    OrtSyncStream* stream = nullptr;
+    CHECK_ORT(ortApi.CreateSyncStreamForEpDevice(trt_ep_device, nullptr, &stream));
+    StreamUniquePtr stream_ptr(stream, release_stream);
+    OrtSyncStream* upload_stream = nullptr;
+    CHECK_ORT(ortApi.CreateSyncStreamForEpDevice(trt_ep_device, nullptr, &upload_stream));
+    StreamUniquePtr upload_stream_ptr(upload_stream, release_stream);
+
+    const std::string streamAddress = std::to_string(reinterpret_cast<size_t>(ortApi.SyncStream_GetHandle(stream)));
+    const char* option_keys[] = {"user_compute_stream", "has_user_compute_stream"};
+    const char* option_values[] = {streamAddress.c_str(), "1"};
+    for (size_t i = 0; i < num_ep_devices; i++) {
+      if (strcmp(ortApi.EpDevice_EpName(ep_devices[i]), onnxruntime::kCpuExecutionProvider) != 0)
+        CHECK_ORT(ortApi.SessionOptionsAppendExecutionProvider_V2(sessionOptions, ortEnvironment, &ep_devices[i], 1,
+                                                                  option_keys, option_values, 2));
+    }
+
+    Ort::Session session(ortEnvironment, toOrtFileString(get_executable_parent_path() / "candy.onnx").c_str(),
+                         sessionOptions);
+
+    Ort::MemoryInfo pinned_memory_info("CudaPinned", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemType::OrtMemTypeCPU);
+    Ort::Allocator pinned_allocator(session, pinned_memory_info);
+
+    const size_t input_buffer_elements = 3 * LOADED_IMAGE_DIM * LOADED_IMAGE_DIM;
+    const size_t input_buffer_size = input_buffer_elements * sizeof(float);
+    auto deleter = [&](void* p) { pinned_allocator.Free(p); };
+    std::unique_ptr<void, decltype(deleter)> pinned_buffer(pinned_allocator.Alloc(input_buffer_size), deleter);
+    float* cpuInputFloat = static_cast<float*>(pinned_buffer.get());
+
+    // SessionGetEpDeviceForInputs fills one entry per model input, so the array must hold num_inputs pointers.
+    const size_t num_inputs = session.GetInputCount();
+    std::vector<const OrtEpDevice*> session_epDevices(num_inputs, nullptr);
+    CHECK_ORT(ortApi.SessionGetEpDeviceForInputs(session, session_epDevices.data(), num_inputs));
+    if (session_epDevices.empty() || session_epDevices[0] == nullptr) {
+      THROW_ERROR("The first model input is not assigned to an execution provider device.");
+    }
+
+    std::vector<Ort::Value> input_tensors;
+    std::vector<Ort::Value> output_tensors;
+
+    Ort::AllocatorWithDefaultOptions cpu_allocator;
+    Ort::AllocatedStringPtr InputTensorName = session.GetInputNameAllocated(0, cpu_allocator);
+    Ort::AllocatedStringPtr OutputTensorName = session.GetOutputNameAllocated(0, cpu_allocator);
+
+    // path::c_str() is wchar_t* on Windows; the char*-based image helpers need a narrow string instead.
+    std::string inputImagePath = (get_executable_parent_path() / "Input.png").string();
+    loadInputImage(cpuInputFloat, inputImagePath.data(), false);
+
+    std::vector<int64_t> full_shape{1, 3, LOADED_IMAGE_DIM, LOADED_IMAGE_DIM};
+    std::vector<int64_t> inference_shape{1, 3, INFERENCE_IMAGE_DIM, INFERENCE_IMAGE_DIM};
+
+    Ort::Value full_cpu_tensor = Ort::Value::CreateTensor<float>(
+        pinned_memory_info, cpuInputFloat, input_buffer_elements, full_shape.data(), full_shape.size());
+    Ort::Value inference_cpu_output =
+        Ort::Value::CreateTensor<float>(cpu_allocator.GetInfo(), cpuOutputFloat.data(), cpuOutputFloat.size(),
+                                        inference_shape.data(), inference_shape.size());
+
+    OrtMemoryInfo* input_memory_info_agnostic = nullptr;
+    const uint32_t vID = ortApi.HardwareDevice_VendorId(ortApi.EpDevice_Device(session_epDevices[0]));
+    CHECK_ORT(ortApi.CreateMemoryInfo_V2("Input_Agnostic", OrtMemoryInfoDeviceType_GPU, /*vendor_id*/ vID,
+                                         /*device_id*/ 0, OrtDeviceMemoryType_DEFAULT, /*default alignment*/ 0,
+                                         OrtArenaAllocator, &input_memory_info_agnostic));
+
+    OrtAllocator* gpu_allocator = nullptr;
+    CHECK_ORT(ortApi.GetSharedAllocator(ortEnvironment, input_memory_info_agnostic, &gpu_allocator));
+
+    Ort::Value full_gpu_tensor = Ort::Value::CreateTensor<float>(gpu_allocator, full_shape.data(), full_shape.size());
+    Ort::Value inference_gpu_input_tensor =
+        Ort::Value::CreateTensor<float>(gpu_allocator, inference_shape.data(), inference_shape.size());
+    Ort::Value inference_gpu_output_tensor =
+        Ort::Value::CreateTensor<float>(gpu_allocator, inference_shape.data(), inference_shape.size());
+
+    void* cuda_compute_stream_handle = ortApi.SyncStream_GetHandle(stream);
+
+    const OrtSyncStreamImpl* uploadStreamImpl = ortApi.GetEpApi()->SyncStream_GetImpl(upload_stream);
+    OrtSyncNotificationImpl* uploadNotification = nullptr;
+    CHECK_ORT(
+        uploadStreamImpl->CreateNotification(const_cast<OrtSyncStreamImpl*>(uploadStreamImpl), &uploadNotification));
+
+    // This should now be a truly asynchronous copy because the source (cpuInputFloat) is pinned memory.
+    std::vector<const OrtValue*> cpu_src_ptrs = {full_cpu_tensor};
+    std::vector<OrtValue*> gpu_dst_ptrs = {full_gpu_tensor};
+    CHECK_ORT(ortApi.CopyTensors(ortEnvironment, cpu_src_ptrs.data(), gpu_dst_ptrs.data(), upload_stream,
+                                 cpu_src_ptrs.size()));
+    // Mark the end of the upload on upload_stream and make the compute stream wait for it on the device.
+    CHECK_ORT(uploadNotification->Activate(uploadNotification));
+    CHECK_ORT(uploadNotification->WaitOnDevice(uploadNotification, stream));
+
+    // This D2D crop is enqueued on the compute stream; the WaitOnDevice above orders it after the HtoD copy.
+    const float* full_gpu_ptr = full_gpu_tensor.GetTensorData<float>();
+    float* inference_gpu_ptr = inference_gpu_input_tensor.GetTensorMutableData<float>();
+    for (int c = 0; c < 3; ++c) {
+      const float* channel_src_start = full_gpu_ptr + c * (LOADED_IMAGE_DIM * LOADED_IMAGE_DIM);
+      float* channel_dst_start = inference_gpu_ptr + c * (INFERENCE_IMAGE_DIM * INFERENCE_IMAGE_DIM);
+      const float* slice_src_start = channel_src_start + (LOADED_IMAGE_DIM - INFERENCE_IMAGE_DIM) * LOADED_IMAGE_DIM +
+                                     (LOADED_IMAGE_DIM - INFERENCE_IMAGE_DIM);
+      CHECK_CUDA(cudaMemcpy2DAsync(channel_dst_start, INFERENCE_IMAGE_DIM * sizeof(float), slice_src_start,
+                                   LOADED_IMAGE_DIM * sizeof(float), INFERENCE_IMAGE_DIM * sizeof(float),
+                                   INFERENCE_IMAGE_DIM, cudaMemcpyDeviceToDevice,
+                                   static_cast<cudaStream_t>(cuda_compute_stream_handle)));
+    }
+
+    input_tensors.push_back(std::move(inference_gpu_input_tensor));
+    output_tensors.push_back(std::move(inference_gpu_output_tensor));
+
+    Ort::IoBinding iobinding(session);
+    iobinding.BindInput(InputTensorName.get(), input_tensors[0]);
+    iobinding.BindOutput(OutputTensorName.get(), output_tensors[0]);
+    session.Run(Ort::RunOptions{}, iobinding);
+
+    std::vector<const OrtValue*> output_src_tensor_ptrs = {output_tensors[0]};
+    std::vector<OrtValue*> output_dst_tensor_ptrs = {inference_cpu_output};
+    CHECK_ORT(ortApi.CopyTensors(ortEnvironment, output_src_tensor_ptrs.data(), output_dst_tensor_ptrs.data(),
+                                 upload_stream, 1));
+    // The DtoH copy is only enqueued on upload_stream; wait for it before the host reads cpuOutputFloat.
+    CHECK_CUDA(cudaStreamSynchronize(static_cast<cudaStream_t>(ortApi.SyncStream_GetHandle(upload_stream))));
+
+    std::string outputImagePath = (get_executable_parent_path() / "output.png").string();
+    saveOutputImage(cpuOutputFloat.data(), outputImagePath.data(), false);
+
+    uploadNotification->Release(uploadNotification);
+    ortApi.ReleaseMemoryInfo(input_memory_info_agnostic);
+  } catch (const Ort::Exception& e) {
+    printf("ONNX Runtime exception caught: %s\n", e.what());
+    return -1;
+  } catch (const std::exception& e) {
+    printf("Runtime exception caught: %s\n", e.what());
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/utils.cpp b/c_cxx/ort_tutorial/30_syncstreams-cuda/utils.cpp
new file mode 100644
index 000000000..df9e82fae
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/utils.cpp
@@ -0,0 +1,130 @@
+#include "utils.h"
+#include "half.hpp"
+#include "lodepng/lodepng.h"
+
+#ifdef _WIN32
+#include <windows.h>  // For GetModuleFileNameW
+#elif __APPLE__
+#include <limits.h>       // For PATH_MAX or similar
+#include <mach-o/dyld.h>  // For _NSGetExecutablePath
+#elif __linux__
+#include <limits.h>  // For PATH_MAX
+#include <unistd.h>  // For readlink
+#endif
+
+// Returns the directory that contains the running executable.
+std::filesystem::path get_executable_parent_path() { return get_executable_path().parent_path(); }
+
+// Returns the path of the running executable. Yields an empty path when GetModuleFileNameW fails or the platform
+// is not one of Windows, macOS or Linux; the std::filesystem calls on macOS/Linux may throw.
+std::filesystem::path get_executable_path() {
+#ifdef _WIN32
+  // Windows: GetModuleFileNameW signals truncation by returning the buffer size, so grow the buffer until it fits.
+  std::vector<wchar_t> pathBuf(MAX_PATH);
+  DWORD length = GetModuleFileNameW(nullptr, pathBuf.data(), static_cast<DWORD>(pathBuf.size()));
+  while (length == pathBuf.size()) {
+    pathBuf.resize(pathBuf.size() * 2);
+    length = GetModuleFileNameW(nullptr, pathBuf.data(), static_cast<DWORD>(pathBuf.size()));
+  }
+
+  if (length == 0) {
+    std::cerr << "Error: GetModuleFileNameW failed with error " << GetLastError() << std::endl;
+    return {};
+  }
+  return std::filesystem::path(pathBuf.data());
+
+#elif __APPLE__
+  // macOS: _NSGetExecutablePath writes the required size into `length` when the buffer is too small.
+  std::vector<char> pathBuf(PATH_MAX);
+  uint32_t length = static_cast<uint32_t>(pathBuf.size());
+  if (_NSGetExecutablePath(pathBuf.data(), &length) != 0) {
+    pathBuf.resize(length + 1);  // +1 for null terminator
+    _NSGetExecutablePath(pathBuf.data(), &length);
+  }
+  return std::filesystem::canonical(pathBuf.data());  // canonical to resolve symlinks
+
+#elif __linux__
+  // Linux: /proc/self/exe is a symlink to the running binary.
+  return std::filesystem::canonical(std::filesystem::read_symlink("/proc/self/exe"));
+#else
+  return {};  // unsupported platform: return an empty path rather than flow off the end of the function
+#endif
+}
+
+using half_float::half;
+constexpr int image_dim = 1080;  // square image side in pixels, enforced by loadInputImage, assumed by saveOutputImage
+
+// Decodes the PNG at imageFileName into pData as planar B, G, R channels (fp16 halves if fp16, else fp32 floats).
+// Exits with EXIT_FAILURE when the file cannot be decoded or is not image_dim x image_dim pixels.
+void loadInputImage(void* pData, char* imageFileName, bool fp16) {
+  half* hData = static_cast<half*>(pData);
+  float* fData = static_cast<float*>(pData);
+  unsigned char* image = nullptr;
+  unsigned int width = 0, height = 0;
+  const unsigned int error = lodepng_decode32_file(&image, &width, &height, imageFileName);
+  if (error) {
+    printf("\nFailed to load the input image. Exiting\n");
+    exit(EXIT_FAILURE);  // a failed load must not be reported to the shell as success
+  }
+  if (width != image_dim || height != image_dim) {
+    printf("\nImage not of right size. Exiting\n");
+    exit(EXIT_FAILURE);
+  }
+
+  const size_t plane = static_cast<size_t>(width) * height;  // number of elements in one channel plane
+  for (uint32_t y = 0; y < height; y++)
+    for (uint32_t x = 0; x < width; x++) {
+      const size_t px = static_cast<size_t>(y) * width + x;
+      const unsigned char r = image[px * 4 + 0];
+      const unsigned char g = image[px * 4 + 1];
+      const unsigned char b = image[px * 4 + 2];
+      if (fp16) {
+        hData[0 * plane + px] = (half)b;
+        hData[1 * plane + px] = (half)g;
+        hData[2 * plane + px] = (half)r;
+      } else {
+        fData[0 * plane + px] = (float)b;
+        fData[1 * plane + px] = (float)g;
+        fData[2 * plane + px] = (float)r;
+      }
+    }
+
+  free(image);
+}
+
+// Converts a float channel value to 8 bits: clamps to [0, 255] and truncates the fraction; NaN maps to 0.
+unsigned char clampAndConvert(float val) {
+  if (!(val > 0.0f)) {  // also true for NaN, whose direct cast to an integer type would be undefined behavior
+    return 0;
+  }
+  return val > 255.0f ? 255 : static_cast<unsigned char>(val);
+}
+
+// Encodes planar B, G, R data in pData (fp16 if fp16, else fp32; image_dim x image_dim per plane) as an opaque
+// RGBA PNG at imageFileName. An encoder failure is reported on stderr instead of being silently ignored.
+void saveOutputImage(void* pData, char* imageFileName, bool fp16) {
+  half* hData = static_cast<half*>(pData);
+  float* fData = static_cast<float*>(pData);
+  unsigned int width = image_dim, height = image_dim;  // hardcoded in the model
+  std::vector<unsigned char> image(width * height * 4);
+  for (uint32_t y = 0; y < height; y++)
+    for (uint32_t x = 0; x < width; x++) {
+      float b, g, r;
+      if (fp16) {
+        b = (float)hData[0 * width * height + y * width + x];
+        g = (float)hData[1 * width * height + y * width + x];
+        r = (float)hData[2 * width * height + y * width + x];
+      } else {
+        b = fData[0 * width * height + y * width + x];
+        g = fData[1 * width * height + y * width + x];
+        r = fData[2 * width * height + y * width + x];
+      }
+      image[(y * width + x) * 4 + 0] = clampAndConvert(r);
+      image[(y * width + x) * 4 + 1] = clampAndConvert(g);
+      image[(y * width + x) * 4 + 2] = clampAndConvert(b);
+      image[(y * width + x) * 4 + 3] = 255;
+    }
+
+  if (const unsigned int error = lodepng_encode32_file(imageFileName, image.data(), width, height))
+    std::cerr << "Failed to save the output image: " << lodepng_error_text(error) << "\n";
+}
diff --git a/c_cxx/ort_tutorial/30_syncstreams-cuda/utils.h b/c_cxx/ort_tutorial/30_syncstreams-cuda/utils.h
new file mode 100644
index 000000000..f71d1fdf1
--- /dev/null
+++ b/c_cxx/ort_tutorial/30_syncstreams-cuda/utils.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <filesystem>
+#include <format>
+#include <iostream>
+void loadInputImage(void* pData, char* imageFileName, bool fp16);
+void saveOutputImage(void* pData, char* imageFileName, bool fp16);
+
+#define DLL_NAME(name) (DLL_PREFIX name DLL_SUFFIX)  // platform file name of a library; `name` must be a string literal
+#if _WIN32
+#define DLL_PREFIX ""
+#define DLL_SUFFIX ".dll"
+#else
+#define DLL_PREFIX "lib"
+#define DLL_SUFFIX ".so"
+#endif
+#define LOG(...) std::cout << std::format(__VA_ARGS__) << "\n"  // prints one std::format-ted line to stdout
+#define THROW_ERROR(...) \
+  LOG(__VA_ARGS__);      \
+  throw std::runtime_error(std::format(__VA_ARGS__));  // logs, then throws; two statements, so use inside braces
+#define CHECK_ORT(call)                                                                       \
+  {                                                                                           \
+    Ort::Status status{(call)}; /* owns the OrtStatus* and releases it when the scope ends */ \
+    if (!status.IsOK()) {       /* a null OrtStatus* means success */                         \
+      THROW_ERROR("{}", status.GetErrorMessage());                                            \
+    }                                                                                         \
+  }
+
+#define STRINGFY(s) _STRINGFY(s)
+#define _STRINGFY(s) #s
+#define CHECK_CUDA(call)                                                                         \
+  {                                                                                              \
+    auto status = (call);                                                                        \
+    if (status != cudaSuccess) { /* report both the failing expression and its error code */     \
+      THROW_ERROR("Failed to execute CUDA call {}. Error code {}", STRINGFY(call), int(status)); \
+    }                                                                                            \
+  }
+
+std::filesystem::path get_executable_path();
+std::filesystem::path get_executable_parent_path();
diff --git a/c_cxx/ort_tutorial/40_ep-context/CMakeLists.txt b/c_cxx/ort_tutorial/40_ep-context/CMakeLists.txt
new file mode 100644
index 000000000..ff8829482
--- /dev/null
+++ b/c_cxx/ort_tutorial/40_ep-context/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.16)
+project(ONNXRuntimeSample)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")  # helper modules kept next to the samples
+include(onnxruntimesetup)  # presumably provides the onnxruntime_interface target linked below - confirm in ../cmake
+
+# Build both samples as C++17 and fail at configure time if the compiler cannot provide it
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Add executable for sample.cpp (file-based)
+add_executable(sample sample.cpp)
+target_link_libraries(sample PRIVATE onnxruntime_interface)
+
+# Add executable for sample_buffer.cpp (buffer-based)
+add_executable(sample_buffer sample_buffer.cpp)
+target_link_libraries(sample_buffer PRIVATE onnxruntime_interface)
diff --git a/c_cxx/ort_tutorial/40_ep-context/README.md b/c_cxx/ort_tutorial/40_ep-context/README.md
new file mode 100644
index 000000000..2228ad966
--- /dev/null
+++ b/c_cxx/ort_tutorial/40_ep-context/README.md
@@ -0,0 +1,101 @@
+# ONNX Runtime EP Context Samples
+
+This repository provides **sample programs** (C++ and Python) that demonstrate how to use the **ONNX Runtime Execution Provider (EP) Context API** for NVIDIA TensorRT RTX Execution Provider (`NvTensorRTRTXExecutionProvider`).
+
+## What is EP Context?
+
+ONNX Runtime introduced the concept of **EP Context** to allow:
+
+* **Pre-compilation of models** with a specific Execution Provider (EP) (e.g., NVIDIA TensorRT RTX).
+* Faster **loading of compiled models** by reusing previously generated execution engines.
+* Two storage modes:
+  * **Embedded Mode** : Compiled binary is embedded inside the ONNX file.
+  * **External Mode** : Compiled binary is stored as an external file alongside the ONNX.
+* Two ways of loading the models:
+  * **Disk Load** : Loads the model files directly from disk.
+  * **Buffer Load** : Loads the models from in-memory buffers.
+
+This makes it easier to deploy optimized models across multiple environments.
+
+Compilation is currently only supported by execution providers that compile subgraphs.
+More traditional execution providers like CUDA or CPU currently do not support this feature.
+
+---
+
+## Samples Included
+
+### C++ sample.cpp (File-based Model)
+
+* Loads an ONNX model from disk.
+* Compiles it with the selected EP.
+* Saves compiled ONNX file.
+* Loads the compiled model and measures load times.
+
+### C++ sample_buffer.cpp (Buffer-based Model with External Initializers)
+
+* Loads model and weights directly into memory buffers.
+* Registers **external initializers** (for `.onnx.data` files).
+* Compiles the model to an in-memory buffer.
+* Loads the compiled EP Context model from memory.
+
+---
+
+### Python sample.py (File-based Model)
+
+An equivalent Python sample is located at [../../python/EP_Context/](../../python/EP_Context/)
+
+* Uses ONNX Runtime  **ModelCompiler API**.
+* Demonstrates file-based compilation (with embedded/external modes).
+* Measures load time for normal vs. compiled models.
+
+---
+
+## How to Run
+
+### C++ sample.cpp
+
+./sample.exe `<input model path my_model.onnx> <output model path model_ctx.onnx> <embed mode 0> <EP selection NvTensorRTRTXExecutionProvider> `
+
+* `embed_mode`: `0` = external, `1` = embedded.
+* `provider`: execution provider name (default: `NvTensorRTRTXExecutionProvider`).
+
+This sample currently assumes fixed input and output shapes for the ONNX file used.
+
+---
+
+### C++ sample_buffer.cpp
+
+./sample_buffer.exe `<input model path my_model.onnx> <input path to model data model.onnx.data> <output model path model_ctx.onnx> <external data file name model.onnx.data> <embed mode 0> <EP selection NvTensorRTRTXExecutionProvider>`
+
+* `embed_mode`: `0` = external, `1` = embedded.
+* `provider`: execution provider name (default: `NvTensorRTRTXExecutionProvider`).
+
+---
+
+### Python
+
+See [../../python/40_ep-context/](../../python/40_ep-context/)
+
+python sample_compile.py -i `<input model path my_model.onnx>` -o `<output context model path model_ctx.onnx>` -p `<EP selection NvTensorRTRTXExecutionProvider>` -e `<embed mode False>`
+
+---
+
+## Performance Improvement (RTX 5090)
+
+|Model                                 | Normal Load Time (sec) | Compile Time (sec) | EP Context Load (sec) | EP Context + Cache (sec) |
+|--------------------------------------|------------------------|--------------------|-----------------------|--------------------------|
+| Deepseek qwen 14B - INT4             | 31.2312                | 34.9162            | 4.95345               | 3.7258                   |
+| Llama-3.1-8B-Instruct - FP16         | 28.264                 | 30.8706            | 6.77561               | 6.0288                   |
+| Stable Diffusion 3.5 - transformer   | 107.296                | 121.263            | 24.8112               | 9.07548                  |
+
+---
+
+## Build steps
+
+### Prerequisites
+
+- CMake 3.16 or higher
+- Visual Studio 2019/2022 (Windows) or GCC/Clang (Linux)
+- ONNX Runtime with NV TensorRT RTX support
+- CUDA and NV TensorRT RTX (for NV TensorRT RTX execution provider)
+- Build TRT RTX EP from this doc [Build TRT RTX EP](https://onnxruntime.ai/docs/execution-providers/TensorRTRTX-ExecutionProvider.html#build-from-source)
diff --git a/c_cxx/ort_tutorial/40_ep-context/sample.cpp b/c_cxx/ort_tutorial/40_ep-context/sample.cpp
new file mode 100644
index 000000000..9b3a370d0
--- /dev/null
+++ b/c_cxx/ort_tutorial/40_ep-context/sample.cpp
@@ -0,0 +1,170 @@
+#include <iostream>
+#include <string>
+#include <chrono>
+#include <filesystem>
+#include <unordered_map>
+#include <onnxruntime/core/graph/constants.h>
+#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
+#include <onnxruntime/core/session/onnxruntime_run_options_config_keys.h>
+#include <onnxruntime/core/session/onnxruntime_session_options_config_keys.h>
+
+/**
+ * @brief Runs a callable once and reports how long that call took.
+ *
+ * The callable is invoked exactly once with no arguments and any value it returns is discarded.
+ * Timing uses std::chrono::high_resolution_clock; an exception thrown by func propagates to the caller.
+ *
+ * @tparam Func Type of the callable (e.g. a lambda); it must be invocable with no arguments.
+ * @param func The callable to execute and time.
+ * @return Elapsed time of the call in seconds, as a double.
+ */
+template <typename Func>
+double MeasureTime(Func&& func) {
+    auto start = std::chrono::high_resolution_clock::now();
+    func();
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;  // duration<double> with the default period counts seconds
+    return diff.count();
+}
+
+int main(int argc, char* argv[]) {
+    // Entry point: times a normal load, an AOT compile, an EP-context load and an EP-context load with runtime cache.
+    if (argc < 3) {
+        std::cerr << "Usage: " << argv[0] << " <input_model.onnx> <compiled_model_output.onnx> [embed_mode] [provider_name]" << std::endl;
+        std::cerr << "  embed_mode: 0 = external engine (default), 1 = embed engine in EP context" << std::endl;
+        std::cerr << "  provider_name: The name of the execution provider (e.g., 'NvTensorRTRTXExecutionProvider')" << std::endl;
+        return 1;
+    }
+
+    // Parse command-line arguments.
+    std::filesystem::path input_model_path = argv[1];
+    std::filesystem::path output_model_path = argv[2];
+    int embed_mode = 0; // Default: external engine (matches the "External" label printed below)
+    std::string provider = onnxruntime::kNvTensorRTRTXExecutionProvider; // Default provider
+
+    // Directory handed to the EP as its runtime cache location for this demonstration.
+    const std::filesystem::path runtime_cache_dir = "ort_runtime_cache";
+
+    // Delete existing runtime cache directory to ensure clean performance metrics
+    try {
+        if (std::filesystem::exists(runtime_cache_dir)) {
+            std::filesystem::remove_all(runtime_cache_dir);
+        }
+    }
+    catch (const std::filesystem::filesystem_error& ex) {
+        std::cerr << "WARNING: Failed to delete runtime cache directory: " << ex.what() << std::endl;
+        std::cerr << "Performance metrics may not be accurate due to existing cache." << std::endl;
+    }
+
+    if (argc >= 4) {
+        try {
+            embed_mode = std::stoi(argv[3]);
+        }
+        catch (const std::exception&) {  // std::stoi throws std::invalid_argument or std::out_of_range
+            std::cerr << "ERROR: Invalid embed_mode value. Must be an integer." << std::endl;
+            return 1;
+        }
+        if (embed_mode != 0 && embed_mode != 1) {
+            std::cerr << "ERROR: Invalid embed_mode value. Must be 0 or 1." << std::endl;
+            return 1;
+        }
+    }
+
+    if (argc >= 5) {
+        provider = argv[4];
+    }
+
+    std::cout << "-----------------------------------------------" << std::endl;
+    std::cout << "ONNX Runtime TensorRT Compilation Example" << std::endl;
+    std::cout << "-----------------------------------------------" << std::endl;
+    std::cout << "> Input Model Path:  " << input_model_path << std::endl;
+    std::cout << "> Output Model Path: " << output_model_path << std::endl;
+    std::cout << "> Embed Mode:        " << (embed_mode == 1 ? "Embedded" : "External") << std::endl;
+    std::cout << "> Execution Provider: " << provider << std::endl;
+    std::cout << "-----------------------------------------------" << std::endl;
+
+    try {
+        // Create an ONNX Runtime environment.
+        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ONNXRuntime-EP-Context-Example");
+
+        // Block for normal onnx load and time it
+        {
+            // Create session options
+            Ort::SessionOptions session_options;
+            session_options.AppendExecutionProvider(provider.c_str());
+
+            std::cout << "> Loading original ONNX model from disk..." << std::endl;
+            double load_time_normal = MeasureTime([&]() {
+                Ort::Session session(env, input_model_path.c_str(), session_options);
+            });
+            std::cout << "> Original session load time: " << load_time_normal << " sec" << std::endl;
+        }
+
+        // Block for AOT compilation and time it
+        {
+            // Create session options
+            Ort::SessionOptions session_options;
+            std::unordered_map<std::string, std::string> provider_options;
+            provider_options["nv_runtime_cache_path"] = runtime_cache_dir.string();
+            session_options.AppendExecutionProvider(provider.c_str(), provider_options);
+
+            std::cout << "> Compiling model with " << provider << "..." << std::endl;
+
+            // Setup compilation options
+            Ort::ModelCompilationOptions compile_options(env, session_options);
+            compile_options.SetEpContextEmbedMode(embed_mode);
+            compile_options.SetInputModelPath(input_model_path.c_str());
+            compile_options.SetOutputModelPath(output_model_path.c_str());
+
+            double compile_time = MeasureTime([&]() {
+                Ort::Status status = Ort::CompileModel(env, compile_options);
+                if (!status.IsOK()) {
+                    throw Ort::Exception(status.GetErrorMessage(), ORT_FAIL);
+                }
+            });
+            std::cout << "> Model compiled successfully!" << std::endl;
+            std::cout << "> Compile time: " << compile_time << " sec" << std::endl;
+            std::cout << "> Compiled model saved at " << output_model_path << std::endl;
+        }
+
+        // Block for loading compiled model and time it
+        {
+            // Create session options
+            Ort::SessionOptions session_options;
+            session_options.AppendExecutionProvider(provider.c_str());
+
+            // No runtime cache option is passed here, so this measures the EP context load on its own.
+            std::cout << "> Loading compiled model from disk..." << std::endl;
+            double load_time_compiled = MeasureTime([&]() {
+                Ort::Session session(env, output_model_path.c_str(), session_options);
+            });
+            std::cout << "> Context model session load time: " << load_time_compiled << " sec" << std::endl;
+        }
+
+        // Block for JIT compilation and time it
+        {
+            // Create session options
+            Ort::SessionOptions session_options;
+            std::unordered_map<std::string, std::string> provider_options;
+            provider_options["nv_runtime_cache_path"] = runtime_cache_dir.string();
+            session_options.AppendExecutionProvider(provider.c_str(), provider_options);
+
+            double jit_time = MeasureTime([&]() {
+                Ort::Session session(env, output_model_path.c_str(), session_options);
+            });
+            std::cout << "> Context model session load time with runtime cache: " << jit_time << " sec" << std::endl;
+            std::cout << "> Runtime cache has been populated at: " << runtime_cache_dir << std::endl;
+        }
+    }
+    catch (const Ort::Exception& ex) {
+        std::cerr << "\nONNX Runtime Exception: " << ex.what() << std::endl;
+        return 1;
+    }
+    catch (const std::exception& ex) {
+        std::cerr << "\nStandard Exception: " << ex.what() << std::endl;
+        return 1;
+    }
+
+    std::cout << "\nProgram finished successfully." << std::endl;
+    return 0;
+}
diff --git a/c_cxx/ort_tutorial/40_ep-context/sample_buffer.cpp b/c_cxx/ort_tutorial/40_ep-context/sample_buffer.cpp
new file mode 100644
index 000000000..030cd5fbd
--- /dev/null
+++ b/c_cxx/ort_tutorial/40_ep-context/sample_buffer.cpp
@@ -0,0 +1,174 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <unordered_map>
+#include <filesystem>
+#include <onnxruntime/core/graph/constants.h>
+#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
+#include <onnxruntime/core/session/onnxruntime_run_options_config_keys.h>
+#include <onnxruntime/core/session/onnxruntime_session_options_config_keys.h>
+#include <onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h>
+
+namespace fs = std::filesystem;
+
+// Utility: read a whole file into a byte buffer. Throws std::runtime_error if the file cannot be opened, sized or read.
+std::vector<uint8_t> LoadFileToBuffer(const fs::path& filename) {
+    std::ifstream file(filename, std::ios::binary | std::ios::ate);  // ios::ate: start at the end so tellg() yields the size
+    if (!file) throw std::runtime_error("Failed to open file: " + filename.string());
+
+    const std::streamsize size = file.tellg();
+    if (size < 0) throw std::runtime_error("Failed to determine size of file: " + filename.string());  // tellg() reports failure as -1
+    file.seekg(0, std::ios::beg);
+    std::vector<uint8_t> buffer(static_cast<size_t>(size));
+    if (!file.read(reinterpret_cast<char*>(buffer.data()), size)) {
+        throw std::runtime_error("Failed to read file: " + filename.string());
+    }
+    return buffer;
+}
+
+// Utility for timing: invokes func once (no arguments) and returns the elapsed time of that call in seconds.
+template <typename Func>
+double MeasureTime(Func&& func) {
+    auto start = std::chrono::high_resolution_clock::now();
+    func();  // exceptions thrown by func propagate to the caller; no time is returned in that case
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+    return diff.count();  // seconds
+}
+
+int main(int argc, char* argv[]) {
+    // Buffer-based variant: loads, compiles and reloads the model entirely from memory, timing each step.
+    if (argc < 5) {
+        std::cerr << "Usage: " << argv[0]
+            << " <input_model.onnx> <weights_file.onnx.data> <compiled_model_output.onnx> <external_data_filename> [embed_mode] [provider]"
+            << std::endl;
+        std::cerr << "  external_data_filename: The name used for external data in the model (e.g., 'model.onnx.data')" << std::endl;
+        return 1;
+    }
+
+    fs::path input_model_path = argv[1];
+    fs::path weights_model_path = argv[2];
+    fs::path output_model_path = argv[3];  // NOTE(review): parsed but currently unused; the compiled model only lives in memory
+    fs::path external_data_filename_path = argv[4];
+    int embed_mode = 0;
+    std::string provider = onnxruntime::kNvTensorRTRTXExecutionProvider;
+
+    if (argc >= 6) {
+        const std::string embed_arg = argv[5];  // compared as text: std::stoi would throw here, outside any try block
+        if (embed_arg != "0" && embed_arg != "1") {
+            std::cerr << "Invalid embed_mode value. Must be 0 or 1." << std::endl;
+            return 1;
+        }
+        embed_mode = (embed_arg == "1") ? 1 : 0;
+    }
+
+    if (argc >= 7) {
+        provider = std::string(argv[6]);
+        try {
+            Ort::SessionOptions test_options;
+            // A simple pre-check to see if the provider is available
+            test_options.AppendExecutionProvider(provider.c_str());
+        }
+        catch (const Ort::Exception& ex) {
+            std::cerr << "ERROR: Provider '" << provider
+                << "' is not available or invalid: "
+                << ex.what() << std::endl;
+            return 1; // EXIT IMMEDIATELY, no fallback
+        }
+    }
+
+    std::cout << "> Embed mode set to: " << embed_mode << std::endl;
+    std::cout << "> Provider set to: " << provider << std::endl;
+    std::cout << "> External data filename: " << external_data_filename_path << std::endl;
+
+    try {
+        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "EPContextExample");
+        Ort::SessionOptions session_options;
+        // Configure execution provider based on provider type
+        if (provider == onnxruntime::kNvTensorRTRTXExecutionProvider) {
+            // Using the new provider option for this specific EP
+            std::unordered_map<std::string, std::string> option_map{
+                {onnxruntime::nv::provider_option_names::kUseExternalDataInitializer, "1"}
+            };
+            session_options.AppendExecutionProvider(provider.c_str(), option_map);
+            std::cout << "> Using Execution Provider: " << provider << " with options." << std::endl;
+        }
+        else {
+            // Fallback for other providers that might not have or need options
+            session_options.AppendExecutionProvider(provider.c_str());
+            std::cout << "> Using Execution Provider: " << provider << " (no options)." << std::endl;
+        }
+
+        // --- Step 1: Load model and weights into buffers ---
+        std::vector<uint8_t> model_buffer = LoadFileToBuffer(input_model_path);
+        std::vector<uint8_t> weights_buffer = LoadFileToBuffer(weights_model_path);
+
+        // Register external weights explicitly (needed in buffer mode)
+        std::vector<std::basic_string<ORTCHAR_T>> file_names = { external_data_filename_path.native() };
+        std::vector<char*> file_buffers_data = { static_cast<char*>(static_cast<void*>(weights_buffer.data())) };
+        std::vector<size_t> file_buffers_size = { weights_buffer.size() };
+
+        session_options.AddExternalInitializersFromFilesInMemory(
+            file_names,
+            file_buffers_data,
+            file_buffers_size
+        );
+
+        // --- Step 2: Regular ONNX load (buffer mode) ---
+        std::cout << "> Loading regular onnx (buffer)..." << std::endl;
+        double load_time_normal = MeasureTime([&]() {
+            Ort::Session session(env, model_buffer.data(), model_buffer.size(), session_options);
+            });
+        std::cout << "> Session load time: " << load_time_normal << " sec" << std::endl;
+
+        // --- Step 3: Compile model from buffer ---
+        std::cout << "> Compiling model (buffer)..." << std::endl;
+        void* output_buffer_data = nullptr;
+        size_t output_buffer_size = 0;
+
+        // Setup compilation options
+        Ort::ModelCompilationOptions compile_options(env, session_options);
+        compile_options.SetEpContextEmbedMode(embed_mode);
+        compile_options.SetInputModelFromBuffer(model_buffer.data(), model_buffer.size());
+        compile_options.SetOutputModelBuffer(
+            Ort::AllocatorWithDefaultOptions(),
+            &output_buffer_data,
+            &output_buffer_size
+        );
+
+        // Actual compilation
+        double compile_time = MeasureTime([&]() {
+            Ort::Status status = Ort::CompileModel(env, compile_options);
+            if (!status.IsOK()) {
+                throw Ort::Exception(status.GetErrorMessage(), ORT_FAIL);
+            }
+            });
+        std::cout << "> Compiled successfully!" << std::endl;
+        std::cout << "> Compile time: " << compile_time << " sec" << std::endl;
+        std::cout << "> Compiled model buffer size: " << output_buffer_size << " bytes" << std::endl;
+
+        // --- Step 4: Load compiled model from buffer ---
+        std::cout << "> Loading EP context model (buffer)..." << std::endl;
+        double load_time_compiled = MeasureTime([&]() {
+            Ort::Session compiled_session(env,
+                reinterpret_cast<uint8_t*>(output_buffer_data),
+                output_buffer_size,
+                session_options);
+            });
+        std::cout << "> Session load time: " << load_time_compiled << " sec" << std::endl;
+
+        Ort::AllocatorWithDefaultOptions().Free(output_buffer_data);  // release the buffer with the allocator given to SetOutputModelBuffer
+    }
+    catch (const Ort::Exception& ex) {
+        std::cerr << "ONNX Runtime error: " << ex.what() << std::endl;
+        return 1;
+    }
+    catch (const std::exception& ex) {
+        std::cerr << "Standard exception: " << ex.what() << std::endl;
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/c_cxx/ort_tutorial/CMakeLists.txt b/c_cxx/ort_tutorial/CMakeLists.txt
new file mode 100644
index 000000000..271504931
--- /dev/null
+++ b/c_cxx/ort_tutorial/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.20)
+project(onnxruntime_c_cxx_samples)
+
+# Compiler warning flags set here apply to the sample subdirectories added further down.
+if (MSVC)
+    # MSVC: warning level 4
+    add_compile_options(/W4)
+else()
+    # Other compilers: enable the common, extra and pedantic warning groups
+    add_compile_options(-Wall -Wextra -Wpedantic)
+endif()
+
+add_subdirectory(10_ep-device-selection)
+add_subdirectory(20_devicetensors-datatransfer)
+add_subdirectory(30_syncstreams-cuda)
+add_subdirectory(40_ep-context)
diff --git a/c_cxx/ort_tutorial/README.md b/c_cxx/ort_tutorial/README.md
new file mode 100644
index 000000000..9eaf0b519
--- /dev/null
+++ b/c_cxx/ort_tutorial/README.md
@@ -0,0 +1,16 @@
+# C/C++ ORT Tutorial
+
+These C/C++ samples are intended to be built and run independently.
+Please see the READMEs of the respective samples to get started with the API.
+
+## Building the samples
+
+To build the samples, you will need to have CMake installed and a functional C++ compiler.
+To compile the samples, with support for ONNX Runtime and TensorRT RTX EP, you can use the following commands:
+```
+cmake -B build -S . -DONNX_RUNTIME_PATH=<path/to/onnxruntime> -DTRTRTX_RUNTIME_PATH=<path/to/TRTRTX/libs>
+cmake --build build --config Release
+```
+
+- `ONNX_RUNTIME_PATH` should be set to the directory containing the ONNX Runtime headers inside `include/` and libraries inside `lib/`.
+- (optional) `TRTRTX_RUNTIME_PATH` makes it easier to run the TensorRT RTX EP, since it ensures all libraries are copied to the executable directory. If it is not set, those libraries must be on the system path. In addition, the TensorRT RTX EP requires the CUDA Runtime to be on the system path as well.
diff --git a/c_cxx/ort_tutorial/cmake/onnxruntimesetup.cmake b/c_cxx/ort_tutorial/cmake/onnxruntimesetup.cmake
new file mode 100644
index 000000000..a83125eb4
--- /dev/null
+++ b/c_cxx/ort_tutorial/cmake/onnxruntimesetup.cmake
@@ -0,0 +1,75 @@
+# Locate ONNX Runtime once and expose it as the onnxruntime_interface target; skipped if the target already exists.
+if(NOT TARGET onnxruntime_interface)
+  set(ONNX_RUNTIME_PATH "$ENV{ONNX_RUNTIME_PATH}" CACHE PATH "Where to find ONNX runtime")
+  if("${ONNX_RUNTIME_PATH}" STREQUAL "")
+    message(FATAL_ERROR "Please specify cmake variable ONNX_RUNTIME_PATH! E.g. via -DONNX_RUNTIME_PATH=/path/to/onnxruntime")
+  endif()
+  set(TRTRTX_RUNTIME_PATH "$ENV{TRTRTX_RUNTIME_PATH}" CACHE PATH "Where to find TensorRT RTX")
+  if("${TRTRTX_RUNTIME_PATH}" STREQUAL "")
+    message(WARNING "Please specify cmake variable TRTRTX_RUNTIME_PATH! E.g. via -DTRTRTX_RUNTIME_PATH=/path/to/tensorrt_rtx. This will ensure all libraries are copied to the execution directory.")
+  endif()
+
+  find_library(ONNXRUNTIME_LIB onnxruntime HINTS ${ONNX_RUNTIME_PATH} ${ONNX_RUNTIME_PATH}/lib REQUIRED)
+  find_library(ONNXRUNTIME_PROVIDERS_SHARED_LIB onnxruntime_providers_shared HINTS ${ONNX_RUNTIME_PATH} ${ONNX_RUNTIME_PATH}/lib REQUIRED)
+  find_library(ONNXRUNTIME_TRT_EP_LIB onnxruntime_providers_nv_tensorrt_rtx HINTS ${ONNX_RUNTIME_PATH} ${ONNX_RUNTIME_PATH}/lib REQUIRED)
+  find_library(ONNXRUNTIME_CUDA_EP_LIB onnxruntime_providers_cuda HINTS ${ONNX_RUNTIME_PATH} ${ONNX_RUNTIME_PATH}/lib)
+  if(WIN32)
+    find_file(ONNXRUNTIME_DLL onnxruntime.dll HINTS ${ONNX_RUNTIME_PATH} ${ONNX_RUNTIME_PATH}/lib REQUIRED)
+    find_file(ONNXRUNTIME_PROVIDERS_SHARED_DLL onnxruntime_providers_shared.dll HINTS ${ONNX_RUNTIME_PATH} ${ONNX_RUNTIME_PATH}/lib REQUIRED)
+    find_file(ONNXRUNTIME_TRT_EP_DLL onnxruntime_providers_nv_tensorrt_rtx.dll HINTS ${ONNX_RUNTIME_PATH} ${ONNX_RUNTIME_PATH}/lib REQUIRED)
+    find_file(ONNXRUNTIME_CUDA_EP_DLL onnxruntime_providers_cuda.dll HINTS ${ONNX_RUNTIME_PATH} ${ONNX_RUNTIME_PATH}/lib)
+
+    find_file(TRTRTX_DLL NAMES tensorrt_rtx_1_1.dll tensorrt_rtx_1_2.dll tensorrt_rtx_1_3.dll HINTS ${TRTRTX_RUNTIME_PATH} ${TRTRTX_RUNTIME_PATH}/lib)
+  else()
+    find_library(TRTRTX_LIB NAMES tensorrt_rtx tensorrt_rtx_1_1 tensorrt_rtx_1_2 tensorrt_rtx_1_3 HINTS ${TRTRTX_RUNTIME_PATH} ${TRTRTX_RUNTIME_PATH}/lib)
+  endif()
+
+  find_path(ONNXRUNTIME_INCLUDE
+      onnxruntime/core/session/onnxruntime_cxx_api.h
+      HINTS ${ONNX_RUNTIME_PATH}/include
+      REQUIRED)
+  add_library(onnxruntime_interface INTERFACE)
+  target_include_directories(onnxruntime_interface SYSTEM INTERFACE ${ONNXRUNTIME_INCLUDE})
+  target_link_libraries(onnxruntime_interface INTERFACE ${ONNXRUNTIME_LIB} ${ONNXRUNTIME_PROVIDERS_SHARED_LIB})
+
+  message(STATUS "ONNX runtime include \"${ONNXRUNTIME_INCLUDE}\"")
+  message(STATUS "ONNX runtime lib \"${ONNXRUNTIME_LIB}\"")
+endif()
+
+set(RUNTIME_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+# copy_file_to_bin_dir(<file>): copies <file> unchanged into RUNTIME_DIRECTORY, or into each per-configuration subdirectory.
+macro (copy_file_to_bin_dir file)
+    get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+    if(${is_multi_config})
+      foreach(config IN LISTS CMAKE_CONFIGURATION_TYPES)
+          set(OUTPUT_DIR ${RUNTIME_DIRECTORY}/${config})
+          # Create the configuration-specific directory first so configure_file places the file inside it
+          execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${OUTPUT_DIR}")
+          configure_file(${file} ${OUTPUT_DIR} COPYONLY)
+      endforeach()
+  else()
+      configure_file(${file} ${RUNTIME_DIRECTORY} COPYONLY)
+  endif()
+endmacro()
+
+if(WIN32)  # copy the runtime libraries found above next to the sample binaries
+    copy_file_to_bin_dir(${ONNXRUNTIME_DLL})
+    copy_file_to_bin_dir(${ONNXRUNTIME_PROVIDERS_SHARED_DLL})
+    copy_file_to_bin_dir(${ONNXRUNTIME_TRT_EP_DLL})
+    if (TRTRTX_DLL)
+        copy_file_to_bin_dir(${TRTRTX_DLL})
+    endif()
+    if(ONNXRUNTIME_CUDA_EP_DLL)
+        copy_file_to_bin_dir(${ONNXRUNTIME_CUDA_EP_DLL})
+    endif()
+else()
+    copy_file_to_bin_dir(${ONNXRUNTIME_LIB})
+    copy_file_to_bin_dir(${ONNXRUNTIME_PROVIDERS_SHARED_LIB})
+    copy_file_to_bin_dir(${ONNXRUNTIME_TRT_EP_LIB})
+    if (TRTRTX_LIB)  # TRTRTX_DLL is only searched for on Windows; this branch must test the library it copies
+        copy_file_to_bin_dir(${TRTRTX_LIB})
+    endif()
+    if(ONNXRUNTIME_CUDA_EP_LIB)
+        copy_file_to_bin_dir(${ONNXRUNTIME_CUDA_EP_LIB})
+    endif()
+endif()
diff --git a/python/README.md b/python/README.md
index b4e653f50..4e5897e72 100644
--- a/python/README.md
+++ b/python/README.md
@@ -2,7 +2,8 @@
 
 ## API 
 
-[Run the ONNX Runtime session creation and inference API](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/python/api)
+The [api directory](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/python/api) contains samples that demonstrate how to use the ONNX Runtime Python API. 
+These samples show very minimal API usage that is not execution provider specific.
 
 ## OpenVINO Execution Provider
 
diff --git a/python/api/README.md b/python/api/README.md
new file mode 100644
index 000000000..6df25a930
--- /dev/null
+++ b/python/api/README.md
@@ -0,0 +1,25 @@
+# Python API Samples
+
+This directory contains sample scripts demonstrating various ONNX Runtime Python API features:
+
+- `getting_started.py`  
+  Introduces the basics of exporting a simple PyTorch model to ONNX, running inference with ONNX Runtime, and handling inputs/outputs as NumPy arrays.
+
+- `compile_api.py`  
+  Shows how to programmatically compile an ONNX model for a specific execution provider (e.g., TensorRT RTX) into an [EP context](https://onnxruntime.ai/docs/execution-providers/EP-Context-Design.html) ONNX model. The sample measures model load and compile times to demonstrate the performance improvement, and optionally accepts an input model path.
+  - For `NvTensorRTRTXExecutionProvider` try adding the provider option for a runtime cache (`-p NvTensorRTRTXExecutionProvider -popt "nv_runtime_cache_path=./cache"`) which will further increase the load speed of a compiled model.
+
+- `device_bindings.py`  
+  Demonstrates advanced device bindings, including running ONNX models on CPU or GPU, using ONNX Runtime's `OrtValue` for device memory, and direct inference with PyTorch tensors on the selected device. It also demonstrates how to interact with ORT using dlpack.
+
+Each sample is self-contained and includes comments explaining the main concepts.
+
+### Setup 
+
+Besides the ONNX Runtime package itself, the samples need a few other dependencies to work correctly. 
+Please choose and install your preferred [onnxruntime package](https://onnxruntime.ai/docs/get-started/with-python.html#install-onnx-runtime) manually.
+```
+pip install -r requirements.txt
+# to install ORT GPU with required cuda dependencies
+pip install onnxruntime-gpu[cuda,cudnn]
+```
\ No newline at end of file
diff --git a/python/api/compile_api.py b/python/api/compile_api.py
new file mode 100644
index 000000000..408fafd0f
--- /dev/null
+++ b/python/api/compile_api.py
@@ -0,0 +1,117 @@
+import argparse
+import os
+import time
+import onnxruntime as ort
+
+# Set logger severity to warning level to reduce console output.
+ort.set_default_logger_severity(3)
+
+# Default Execution Provider for NVIDIA GPUs as requested.
+TRT_RTX_EP = "NvTensorRTRTXExecutionProvider"
+
+
+def compile(input_path, output_path, provider, ep_options, embed_mode=False):
+    """
+    Compiles an ONNX model for a specified execution provider and saves it.
+    
+    Args:
+        input_path (str): Path to the original ONNX model.
+        output_path (str): Path to save the compiled model; an existing file there is removed first.
+        provider (str): The name of the execution provider.
+        ep_options (dict): Options passed to the execution provider when it is added to the session options.
+        embed_mode (bool): If True, embeds the compiled binary data into the ONNX file.
+    """
+    # Remove the output file if it already exists to ensure a clean compilation.
+    if os.path.exists(output_path):
+        os.remove(output_path)
+        print(f"> Previous compiled model at {output_path} removed.")
+
+    # Create session options and add the provider together with its options.
+    session_options = ort.SessionOptions()
+    session_options.add_provider(provider, ep_options)
+
+    # Create a ModelCompiler for the input model; the embed setting is passed by keyword.
+    model_compiler = ort.ModelCompiler(
+        session_options,
+        input_path,
+        embed_compiled_data_into_model=embed_mode
+    )
+
+    print(f"\n> Compiling model with '{provider}'...")
+    start = time.perf_counter()
+    model_compiler.compile_to_file(output_path)  # only this call is timed
+    stop = time.perf_counter()
+
+    if os.path.exists(output_path):
+        print("> Compiled successfully!")
+        print(f"> Compile time: {stop - start:.3f} sec")
+        print(f"> Compiled model saved at {output_path}")
+
+
+def load_session(model_path, provider, ep_options):
+    """
+    Loads an ONNX model into an InferenceSession and prints the loading time.
+
+    Args:
+        model_path (str): Path to the ONNX model file.
+        provider (str): The name of the execution provider.
+        ep_options (dict): The execution provider options.
+    """
+    # Only session construction is timed; the session is not used afterwards and nothing is returned.
+
+    start = time.perf_counter()
+    # Load the model using the specified provider.
+    # The provider and its options are passed as one (name, options) tuple via `providers`.
+    # Alternative kept for reference: SessionOptions.add_provider(provider, ep_options)
+    # followed by InferenceSession(model_path, sess_options=session_options).
+    session = ort.InferenceSession(model_path, providers=[(provider, ep_options)])
+    stop = time.perf_counter()
+
+    print(f"> Session load time: {stop - start:.3f} sec")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Compile ONNX model with ONNX Runtime")
+    parser.add_argument("-i", "--model_path", type=str, default=None, help="Path to the ONNX model file")
+    parser.add_argument("-o", "--output_path", type=str, default="model_ctx.onnx",
+                        help="Path to save the compiled EP context model")
+    parser.add_argument("-p", "--provider", default=TRT_RTX_EP, type=str, help="Execution Provider")
+    parser.add_argument("-popt", "--provider_options", default=[], type=str, nargs="+",
+                        help="Execution Provider options as key=value pairs")
+    # Using a type=bool for the embed flag.
+    parser.add_argument("--embed", action=argparse.BooleanOptionalAction, help="Binary data embedded within EP context node")
+    args = parser.parse_args()
+
+    if args.model_path is None:
+        from getting_started import create_model
+
+        args.model_path = create_model()
+    ep_options = {}
+    for kv_pair in args.provider_options:
+        key, value = kv_pair.split("=")
+        ep_options[key] = value
+
+    print(f"""
+    -----------------------------------------------
+    ONNX Runtime Model Compilation Script
+    -----------------------------------------------
+    "> Using Execution Provider: {args.provider}
+    "> Using Execution Provider options: {ep_options}
+    "> Embed Mode: {'Embedded' if args.embed else 'External'}
+    -----------------------------------------------
+    Available execution provider(s) {ort.get_available_providers()}
+    """)
+
+    # Load and time the original model.
+    print("\n> Loading regular onnx...")
+    load_session(args.model_path, args.provider, ep_options=ep_options)
+
+    # Compile the model.
+    compile(args.model_path, args.output_path, args.provider,
+            ep_options=ep_options, embed_mode=args.embed)
+
+    # Load and time the compiled model.
+    print("\n> Loading EP context model...")
+    load_session(args.output_path, args.provider, ep_options=ep_options)
+
+    print("\nProgram finished successfully.")
diff --git a/python/api/onnxruntime-python-api.py b/python/api/device_bindings.py
similarity index 51%
rename from python/api/onnxruntime-python-api.py
rename to python/api/device_bindings.py
index 4c90755fd..302095f99 100644
--- a/python/api/onnxruntime-python-api.py
+++ b/python/api/device_bindings.py
@@ -4,12 +4,15 @@
 
 import numpy as np
 import torch
+import os
+import re
 import onnxruntime
 
 MODEL_FILE = '.model.onnx'
 DEVICE_NAME = 'cuda' if torch.cuda.is_available() else 'cpu'
 DEVICE_INDEX = 0     # Replace this with the index of the device you want to run on
 DEVICE=f'{DEVICE_NAME}:{DEVICE_INDEX}'
+LIB_EXT = 'so' if os.name != 'nt' else 'dll'
 
 # A simple model to calculate addition of two tensors
 def model():
@@ -32,30 +35,28 @@ def create_model(type: torch.dtype = torch.float32):
  
 # Create an ONNX Runtime session with the provided model
 def create_session(model: str) -> onnxruntime.InferenceSession:
+    available_providers = {device.ep_name for device in  onnxruntime.get_ep_devices()}
     providers = ['CPUExecutionProvider']
     if torch.cuda.is_available():
-        providers.insert(0, 'CUDAExecutionProvider')
+        if 'CUDAExecutionProvider' in available_providers:
+            providers.insert(0, 'CUDAExecutionProvider')
+        if 'NvTensorRTRTXExecutionProvider' in available_providers:
+            providers.insert(0, 'NvTensorRTRTXExecutionProvider')
     return onnxruntime.InferenceSession(model, providers=providers)
 
-# Run the model on CPU consuming and producing numpy arrays 
-def run(x: np.array, y: np.array) -> np.array:
-    session = create_session(MODEL_FILE)
-
-    z = session.run(["z"], {"x": x, "y": y})
-    
-    return z[0]   
 
 # Run the model on device consuming and producing ORTValues
 def run_with_data_on_device(x: np.array, y: np.array) -> onnxruntime.OrtValue:
     session = create_session(MODEL_FILE)
+    mem_info = session.get_input_memory_infos()[0]
 
-    x_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(x, DEVICE_NAME, DEVICE_INDEX)
-    y_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(y, DEVICE_NAME, DEVICE_INDEX)
+    x_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(x, 'gpu', device_id=mem_info.device_id, vendor_id=mem_info.device_vendor_id)
+    y_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(y, 'gpu', device_id=mem_info.device_id, vendor_id=mem_info.device_vendor_id)
 
     io_binding = session.io_binding()
-    io_binding.bind_input(name='x', device_type=x_ortvalue.device_name(), device_id=0, element_type=x.dtype, shape=x_ortvalue.shape(), buffer_ptr=x_ortvalue.data_ptr())
-    io_binding.bind_input(name='y', device_type=y_ortvalue.device_name(), device_id=0, element_type=y.dtype, shape=y_ortvalue.shape(), buffer_ptr=y_ortvalue.data_ptr())
-    io_binding.bind_output(name='z', device_type=DEVICE_NAME, device_id=DEVICE_INDEX, element_type=x.dtype, shape=x_ortvalue.shape())
+    io_binding.bind_input(name='x', device_type=x_ortvalue.device_name(), device_id=mem_info.device_id, element_type=x.dtype, shape=x_ortvalue.shape(), buffer_ptr=x_ortvalue.data_ptr())
+    io_binding.bind_input(name='y', device_type=y_ortvalue.device_name(), device_id=mem_info.device_id, element_type=y.dtype, shape=y_ortvalue.shape(), buffer_ptr=y_ortvalue.data_ptr())
+    io_binding.bind_output(name='z', device_type=x_ortvalue.device_name(), device_id=mem_info.device_id, element_type=x.dtype, shape=x_ortvalue.shape())
     session.run_with_iobinding(io_binding)
 
     z = io_binding.get_outputs()
@@ -63,8 +64,9 @@ def run_with_data_on_device(x: np.array, y: np.array) -> onnxruntime.OrtValue:
     return z[0]
 
 # Run the model on device consuming and producing native PyTorch tensors
-def run_with_torch_tensors_on_device(x: torch.Tensor, y: torch.Tensor, np_type: np.dtype = np.float32, torch_type: torch.dtype = torch.float32) -> torch.Tensor:
+def run_with_torch_tensors_on_device(x: torch.Tensor, y: torch.Tensor, np_type: np.dtype = np.float32, torch_type: torch.dtype = torch.float32, dlpack=False) -> torch.Tensor:
     session = create_session(MODEL_FILE)
+    mem_info = session.get_input_memory_infos()[0]
 
     binding = session.io_binding()
 
@@ -73,8 +75,8 @@ def run_with_torch_tensors_on_device(x: torch.Tensor, y: torch.Tensor, np_type:
 
     binding.bind_input(
         name='x',
-        device_type=DEVICE_NAME,
-        device_id=DEVICE_INDEX,
+        device_type="gpu",
+        device_id=mem_info.device_id,
         element_type=np_type,
         shape=tuple(x_tensor.shape),
         buffer_ptr=x_tensor.data_ptr(),
@@ -82,39 +84,60 @@ def run_with_torch_tensors_on_device(x: torch.Tensor, y: torch.Tensor, np_type:
 
     binding.bind_input(
         name='y',
-        device_type=DEVICE_NAME,
-        device_id=DEVICE_INDEX,
+        device_type="gpu",
+        device_id=mem_info.device_id,
         element_type=np_type,
         shape=tuple(y_tensor.shape),
         buffer_ptr=y_tensor.data_ptr(),
         )
-
-    ## Allocate the PyTorch tensor for the model output
-    z_tensor = torch.empty(x_tensor.shape, dtype=torch_type, device=DEVICE).contiguous()
-    binding.bind_output(
-        name='z',
-        device_type=DEVICE_NAME,
-        device_id=DEVICE_INDEX,
-        element_type=np_type,
-        shape=tuple(z_tensor.shape),
-        buffer_ptr=z_tensor.data_ptr(),
-    )
+    if dlpack:
+        binding.bind_output(
+            name='z',
+            device_type="gpu",
+        )
+    else:
+        ## Allocate the PyTorch tensor for the model output
+        z_tensor = torch.empty(x_tensor.shape, dtype=torch_type, device=DEVICE).contiguous()
+        binding.bind_output(
+            name='z',
+            device_type="gpu",
+            device_id=mem_info.device_id,
+            element_type=np_type,
+            shape=tuple(z_tensor.shape),
+            buffer_ptr=z_tensor.data_ptr(),
+        )
 
     session.run_with_iobinding(binding)
-
-    return z_tensor
+    if dlpack:
+        from onnxruntime.capi import _pybind_state as C
+        outputs = binding.get_outputs()
+        return torch.tensor(C.OrtValue.from_dlpack(outputs[0]._ortvalue.to_dlpack(), False))
+    else:
+        return z_tensor
 
 
 def main():
-    create_model()
+    # check if plugin based providers are available and register them
+    ort_capi_dir = os.path.dirname(onnxruntime.capi.__file__)
+    for p in  os.listdir(ort_capi_dir):
+        match = re.match(r".*onnxruntime_providers_(.*)\."+LIB_EXT, p)
+        if match is not None:
+            ep_name = match.group(1)
+            if ep_name == 'shared': continue
+            onnxruntime.register_execution_provider_library(ep_name, os.path.join(ort_capi_dir, p))
+            print(f"Registered execution provider {ep_name} with library: {p}")
 
-    print(run(x=np.float32([1.0, 2.0, 3.0]),y=np.float32([4.0, 5.0, 6.0])))
-    # [array([5., 7., 9.], dtype=float32)]
+    create_model()
 
     print(run_with_data_on_device(x=np.float32([1.0, 2.0, 3.0, 4.0, 5.0]), y=np.float32([1.0, 2.0, 3.0, 4.0, 5.0])).numpy())
     # [ 2.  4.  6.  8. 10.]
 
-    print(run_with_torch_tensors_on_device(torch.rand(5).to(DEVICE), torch.rand(5).to(DEVICE)))
+    x = torch.rand(5).to(DEVICE)
+    y = torch.rand(5).to(DEVICE)
+    print(run_with_torch_tensors_on_device(x, y, dlpack=True))
+    # tensor([0.7023, 1.3127, 1.7289, 0.3982, 0.8386])
+
+    print(run_with_torch_tensors_on_device(x, y, dlpack=False))
     # tensor([0.7023, 1.3127, 1.7289, 0.3982, 0.8386])
 
     create_model(torch.int64)
diff --git a/python/api/getting_started.py b/python/api/getting_started.py
new file mode 100644
index 000000000..66b201240
--- /dev/null
+++ b/python/api/getting_started.py
@@ -0,0 +1,54 @@
+# A set of code samples showing different usage of the ONNX Runtime Python API
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import numpy as np
+import torch
+import onnxruntime
+
+MODEL_FILE = '.model.onnx'
+DEVICE_NAME = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+# Build a tiny torch module whose forward pass adds its two inputs
+def model():
+    class Model(torch.nn.Module):
+        """Element-wise addition of two tensors."""
+
+        def forward(self, x, y):
+            # torch.add(x, y) is the functional spelling of x.add(y)
+            return torch.add(x, y)
+
+    return Model()
+
+# Export the addition model to ONNX at MODEL_FILE; axis 0 of both inputs is left dynamic
+def create_model(type: torch.dtype = torch.float32):
+    # Example inputs are only needed to trace the graph; axis 0 is declared dynamic below.
+    example_inputs = (torch.ones(3, dtype=type), torch.zeros(3, dtype=type))
+    dynamic_axes = {"x": {0: "array_length_x"}, "y": {0: "array_length_y"}}
+    torch.onnx.export(model(), example_inputs, MODEL_FILE, input_names=["x", "y"],
+                      output_names=["z"], dynamic_axes=dynamic_axes)
+    return MODEL_FILE
+
+# Open an inference session for `model`; CUDA is listed ahead of CPU when torch sees a GPU
+def create_session(model: str) -> onnxruntime.InferenceSession:
+    use_cuda = torch.cuda.is_available()
+    # The CPU provider is always present and always last in the list.
+    providers = (['CUDAExecutionProvider'] if use_cuda else []) + ['CPUExecutionProvider']
+    return onnxruntime.InferenceSession(model, providers=providers)
+
+# Run the exported model on numpy inputs and return the "z" output
+def run(x: np.array, y: np.array) -> np.array:
+    session = create_session(MODEL_FILE)
+    feeds = {"x": x, "y": y}
+    # session.run returns one entry per requested output name; only "z" is requested.
+    (z,) = session.run(["z"], feeds)
+    return z
+
+def main():
+    create_model()  # writes MODEL_FILE, which run() then loads
+
+    print(run(x=np.float32([1.0, 2.0, 3.0]),y=np.float32([4.0, 5.0, 6.0])))
+    # [5. 7. 9.]
+
+if __name__ == "__main__":
+    main()
diff --git a/python/api/requirements.txt b/python/api/requirements.txt
new file mode 100644
index 000000000..0d985a607
--- /dev/null
+++ b/python/api/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+torch
+onnx
+--extra-index-url https://download.pytorch.org/whl/cu128
diff --git a/python/models/stable_difusion/README.md b/python/models/stable_difusion/README.md
new file mode 100644
index 000000000..cbcc3910b
--- /dev/null
+++ b/python/models/stable_difusion/README.md
@@ -0,0 +1,77 @@
+# Stable Diffusion 3 Medium ONNX Export Guide
+
+This guide provides the steps to convert the `stabilityai/stable-diffusion-3-medium` model to the ONNX format for use with the CUDA execution provider. It also includes a step to address an issue with mixed-precision nodes that may occur during the conversion process.
+
+## 1. Prerequisites and Installation
+
+Install the required Python packages using the following `requirements.txt` content:
+
+```
+numpy
+torch --index-url https://download.pytorch.org/whl/cu121
+optimum[onnxruntime]
+onnxruntime-gpu
+diffusers
+sentencepiece
+transformers
+```
+
+You can save this to a `requirements.txt` file and install it with:
+```bash
+pip install -r requirements.txt
+```
+This will install `onnxruntime-gpu` with the CUDA execution provider, which is necessary for model conversion.
+
+## 2. Model Conversion
+
+Run the following command to export the model to ONNX format. This command uses `optimum-cli` to convert the model to half-precision (`fp16`) on a CUDA device.
+
+```bash
+optimum-cli export onnx --model stabilityai/stable-diffusion-3-medium --dtype fp16 --device cuda fp16_optimum
+```
+
+This will download the model and convert it into multiple ONNX files in the `fp16_optimum` directory.
+
+## 3. Correcting FP64 Nodes
+
+The PyTorch model may contain some `fp64` nodes, which are exported as-is during the conversion. If you encounter issues with these nodes, you can use the provided `replace_fp64.py` script to replace them with `fp32` nodes. This script will process all `.onnx` files in the input directory and save the corrected files to the output directory.
+
+```bash
+python replace_fp64.py fp16_optimum corrected_model
+```
+This will create a `corrected_model` directory with the FP64 nodes converted to FP32.
+
+## 4. Using a Custom ONNX Runtime
+
+If you have a locally built ONNX Runtime wheel with specific optimizations (e.g., for NvTensorRTRTXExecutionProvider), ensure that you install it in your environment before running inference. Additionally, be sure to uninstall the default `onnxruntime` package installed via `requirements.txt` to avoid any conflicts.
+
+## 5. Running Inference
+
+To run inference with the converted ONNX model, use the provided `run_sd.py` script. This script loads the ONNX model and generates an image based on a prompt.
+
+Here is an example command to run the script:
+```bash
+python run_sd.py --model_path corrected_model --prompt "A beautiful landscape painting of a waterfall in a lush forest" --output_dir generated_images
+```
+
+### Command-line Arguments
+
+The `run_sd.py` script accepts several arguments to customize the image generation process:
+
+*   `--model_path`: Path to the directory containing the ONNX models (e.g., `corrected_model`). (Required)
+*   `--prompt`: The text prompt to generate the image from.
+*   `--negative_prompt`: The prompt not to guide the image generation.
+*   `--height`: The height of the generated image (default: 512).
+*   `--width`: The width of the generated image (default: 512).
+*   `--steps`: The number of inference steps (default: 50).
+*   `--guidance_scale`: Guidance scale for the prompt (default: 7.5).
+*   `--seed`: A seed for reproducibility.
+*   `--output_dir`: The directory to save the generated images (default: `generated_images`).
+*   `--execution_provider`: The ONNX Runtime execution provider to use (default: `NvTensorRTRTXExecutionProvider`).
+
+For a full list of arguments, you can run:
+```bash
+python run_sd.py --help
+```
+
+The generated image will be saved in the specified output directory. 
diff --git a/python/models/stable_difusion/replace_fp64.py b/python/models/stable_difusion/replace_fp64.py
new file mode 100644
index 000000000..dde7d70ed
--- /dev/null
+++ b/python/models/stable_difusion/replace_fp64.py
@@ -0,0 +1,117 @@
+import onnx
+from onnx import numpy_helper
+import numpy as np
+import argparse
+import os
+import shutil
+import logging
+
+# Setup logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def convert_fp64_to_fp32(model_path: str, output_path: str):
+    """
+    Load an ONNX model, rewrite its float64 data as float32, and save the result.
+
+    Rewritten in the top-level graph: DOUBLE initializers, DOUBLE `value` tensors of
+    Constant nodes, Cast nodes targeting DOUBLE, and DOUBLE graph inputs, outputs and
+    value_info entries. Subgraphs nested in node attributes are not visited.
+
+    Args:
+        model_path: Path of the .onnx file to read.
+        output_path: Path of the .onnx file to write (saved with external data enabled).
+    """
+    logging.info(f"Loading model from: {model_path}")
+    model = onnx.load(model_path)
+
+    # 1. Convert all initializers from float64 to float32 (in place)
+    converted_initializers = 0
+    for initializer in model.graph.initializer:
+        if initializer.data_type == onnx.TensorProto.DOUBLE:
+            initializer_fp32 = numpy_helper.to_array(initializer).astype(np.float32)
+            initializer.CopyFrom(numpy_helper.from_array(initializer_fp32, name=initializer.name))
+            converted_initializers += 1
+
+    if converted_initializers > 0:
+        logging.info(f"Converted {converted_initializers} initializers from FP64 to FP32.")
+
+    # 2. Convert nodes
+    converted_casts = 0
+    converted_constants = 0
+    for node in model.graph.node:
+        if node.op_type == 'Constant':
+            for attr in node.attribute:
+                if attr.name == 'value' and attr.t.data_type == onnx.TensorProto.DOUBLE:
+                    # to_array reads raw_data as well as double_data and keeps the
+                    # shape; decoding raw_data alone would drop double_data values.
+                    constant_fp32 = numpy_helper.to_array(attr.t).astype(np.float32)
+                    attr.t.CopyFrom(numpy_helper.from_array(constant_fp32, name=attr.t.name))
+                    converted_constants += 1
+        elif node.op_type == 'Cast':
+            for attr in node.attribute:
+                if attr.name == 'to' and attr.i == onnx.TensorProto.DOUBLE:
+                    attr.i = onnx.TensorProto.FLOAT
+                    converted_casts += 1
+
+    if converted_casts > 0:
+        logging.info(f"Modified {converted_casts} Cast operators from FP64 to FP32.")
+    if converted_constants > 0:
+        logging.info(f"Modified {converted_constants} Constant operators from FP64 to FP32.")
+
+    # 3. Convert all graph inputs, outputs, and value_info from float64 to float32
+    converted_tensors = 0
+    for tensor in list(model.graph.value_info) + list(model.graph.input) + list(model.graph.output):
+        if tensor.type.tensor_type.elem_type == onnx.TensorProto.DOUBLE:
+            tensor.type.tensor_type.elem_type = onnx.TensorProto.FLOAT
+            converted_tensors += 1
+
+    if converted_tensors > 0:
+        logging.info(f"Converted {converted_tensors} tensor definitions from FP64 to FP32.")
+
+    # 4. Save the modified model
+    # External data is re-created here, which is why the driver skips old .onnx_data files.
+    logging.info(f"Saving modified model to: {output_path}")
+    onnx.save(model, output_path, save_as_external_data=True)
+    logging.info("Conversion complete.")
+
+
+if __name__ == '__main__':
+    # CLI entry point: mirror input_dir into output_dir, converting every .onnx file.
+    parser = argparse.ArgumentParser(
+        description="Convert ONNX models in a directory from float64 to float32 precision."
+    )
+    parser.add_argument("input_dir", type=str, help="Directory containing the input ONNX models.")
+    parser.add_argument("output_dir", type=str, help="Directory where the converted models will be saved.")
+    args = parser.parse_args()
+    input_dir, output_dir = args.input_dir, args.output_dir
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        logging.info(f"Created output directory: {output_dir}")
+
+    separator = "-" * 50
+    for root, _, files in os.walk(input_dir):
+        # Replicate directory structure in the output directory
+        output_subdir = os.path.join(output_dir, os.path.relpath(root, input_dir))
+        if not os.path.exists(output_subdir):
+            os.makedirs(output_subdir)
+
+        for filename in files:
+            # Skip .onnx_data files: new ones will be created when the model is saved.
+            if filename.endswith(".onnx_data"):
+                continue
+
+            input_path = os.path.join(root, filename)
+            output_path = os.path.join(output_subdir, filename)
+            if not filename.endswith(".onnx"):
+                logging.info(f"Copying file: {input_path} to {output_path}")
+                shutil.copy2(input_path, output_path)
+                continue
+
+            logging.info(separator)
+            logging.info(f"Processing ONNX file: {input_path}")
+            try:
+                convert_fp64_to_fp32(input_path, output_path)
+            except Exception as e:
+                logging.error(f"Failed to convert {input_path}: {e}")
+            logging.info(separator)
\ No newline at end of file
diff --git a/python/models/stable_difusion/requirements.txt b/python/models/stable_difusion/requirements.txt
new file mode 100644
index 000000000..7909b7ac4
--- /dev/null
+++ b/python/models/stable_difusion/requirements.txt
@@ -0,0 +1,7 @@
+numpy
+torch
+--extra-index-url https://download.pytorch.org/whl/cu129
+optimum[onnxruntime]
+onnxruntime-gpu
+diffusers
+sentencepiece 
\ No newline at end of file
diff --git a/python/models/stable_difusion/run_sd.py b/python/models/stable_difusion/run_sd.py
new file mode 100644
index 000000000..92ff27210
--- /dev/null
+++ b/python/models/stable_difusion/run_sd.py
@@ -0,0 +1,213 @@
+"""
+This script runs the Stable Diffusion 3 ONNX pipeline for image generation.
+
+For detailed setup instructions, test procedures, and verification steps, 
+please refer to the QA_Test_Plan.md file included in the project.
+
+Command-line arguments:
+  --model_path: Path to the ONNX model directory. (Required)
+  --prompt: The prompt for image generation.
+  --height: The height of the generated image.
+  --width: The width of the generated image.
+  --steps: Number of inference steps.
+  --num_iterations: Number of times to run the inference loop.
+  --output_dir: Directory to save generated images.
+  --negative_prompt: The prompt not to guide the image generation.
+  --guidance_scale: Higher guidance scale encourages to generate images that are closely linked to the text prompt.
+  --seed: The seed for reproducibility.
+  --execution_provider: The execution provider to use for ONNX Runtime.
+"""
+from pathlib import Path
+import onnxruntime as ort
+import torch
+import numpy as np
+import argparse
+from optimum.onnxruntime import ORTStableDiffusion3Pipeline
+
+
+
+class OrtWrapper(ort.InferenceSession):
+    """InferenceSession bound to one provider that also exposes an output-name -> index map."""
+    def __init__(self, onnx_path, session_options, provider, provider_options=None):
+        # None instead of a `{}` default: a mutable default would be shared by every call.
+        session_options.add_session_config_entry("session.use_env_allocators", "1")
+        super().__init__(onnx_path, sess_options=session_options, providers=[provider],
+                         provider_options=[provider_options if provider_options is not None else {}])
+        # Map each output name to its position in get_outputs().
+        self.output_names = {output.name: idx for idx, output in enumerate(self.get_outputs())}
+
+
+def get_transformer(model_root, batch_size, height, width, provider):
+    """Create the session for <model_root>/transformer/model.onnx with its free dimensions pinned."""
+    # NOTE(review): batch is doubled and height/width are divided by 8 -- presumably
+    # guidance pairs and latent resolution; confirm against the exported model.
+    overrides = {"batch_size": 2*batch_size, "height": height//8, "width": width//8}
+    opts = ort.SessionOptions()
+    for dim_name, dim_value in overrides.items():
+        opts.add_free_dimension_override_by_name(dim_name, dim_value)
+    model_file = Path(model_root) / "transformer" / "model.onnx"
+    return OrtWrapper(onnx_path=str(model_file),
+                      session_options=opts,
+                      provider=provider)
+
+def get_text_encoder(model_root, batch_size, provider):
+    """Create the session for <model_root>/text_encoder/model.onnx with fixed input dimensions."""
+    model_file = Path(model_root) / "text_encoder" / "model.onnx"
+    opts = ort.SessionOptions()
+    # Pin the free dimensions: batch to batch_size, sequence length to 77.
+    for dim_name, dim_value in (("batch_size", batch_size), ("sequence_length", 77)):
+        opts.add_free_dimension_override_by_name(dim_name, dim_value)
+    return OrtWrapper(onnx_path=str(model_file),
+                      session_options=opts,
+                      provider=provider)
+
+def get_vae_encoder(model_root, batch_size, height, width, provider):
+    """Create the session for <model_root>/vae_encoder/model.onnx with sample dimensions pinned."""
+    opts = ort.SessionOptions()
+    # The sample dimensions are pinned to height and width exactly as given (no division).
+    overrides = (("batch_size", batch_size), ("sample_height", height), ("sample_width", width))
+    for dim_name, dim_value in overrides:
+        opts.add_free_dimension_override_by_name(dim_name, dim_value)
+    model_file = Path(model_root) / "vae_encoder" / "model.onnx"
+    return OrtWrapper(onnx_path=str(model_file),
+                      session_options=opts,
+                      provider=provider)
+
+def get_vae_decoder(model_root, batch_size, height, width, provider):
+    """Create the session for <model_root>/vae_decoder/model.onnx with latent dimensions pinned."""
+    # Latent height/width are fixed to the requested image size integer-divided by 8.
+    latent_height, latent_width = height//8, width//8
+    opts = ort.SessionOptions()
+    opts.add_free_dimension_override_by_name("batch_size", batch_size)
+    opts.add_free_dimension_override_by_name("latent_height", latent_height)
+    opts.add_free_dimension_override_by_name("latent_width", latent_width)
+    model_file = Path(model_root, "vae_decoder", "model.onnx")
+    return OrtWrapper(onnx_path=str(model_file), session_options=opts,
+                      provider=provider)
+
+def get_text_encoder_2(model_root, batch_size, provider):
+    """Create the session for <model_root>/text_encoder_2/model.onnx with fixed input dimensions."""
+    model_file = Path(model_root, "text_encoder_2", "model.onnx")
+    opts = ort.SessionOptions()
+    # Pin the model's free dimensions by name.
+    opts.add_free_dimension_override_by_name("batch_size", batch_size)
+    opts.add_free_dimension_override_by_name("sequence_length", 77)
+    return OrtWrapper(onnx_path=str(model_file),
+                      session_options=opts,
+                      provider=provider)
+
+def get_text_encoder_3(model_root, batch_size, provider):
+    """Create the session for <model_root>/text_encoder_3/model.onnx with fixed input dimensions."""
+    fixed_dims = {"batch_size": batch_size, "sequence_length": 77}
+    opts = ort.SessionOptions()
+    for dim_name, dim_value in fixed_dims.items():
+        opts.add_free_dimension_override_by_name(dim_name, dim_value)
+    # The model file is looked up at <model_root>/text_encoder_3/model.onnx.
+    model_file = Path(model_root) / "text_encoder_3" / "model.onnx"
+    return OrtWrapper(onnx_path=str(model_file), session_options=opts,
+                      provider=provider)
+
+
+
+def main():
+    """Parse the CLI options, build the ONNX sessions and pipeline, then generate, time and save images."""
+    parser = argparse.ArgumentParser(description="Run Stable Diffusion 3 ONNX pipeline.")
+    parser.add_argument("--model_path", type=str, required=True, help="Path to the ONNX model directory.")
+    parser.add_argument("--prompt", type=str, default="close up view of colorful chameleon", help="The prompt for image generation.")
+    parser.add_argument("--height", type=int, default=512, help="The height of the generated image.")
+    parser.add_argument("--width", type=int, default=512, help="The width of the generated image.")
+    parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
+    parser.add_argument("--num_iterations", type=int, default=1, help="Number of times to run the inference loop.")
+    parser.add_argument("--output_dir", type=str, default="generated_images", help="Directory to save generated images.")
+    parser.add_argument("--negative_prompt", type=str, default=None, help="The prompt not to guide the image generation.")
+    parser.add_argument("--guidance_scale", type=float, default=7.5, help="Higher guidance scale encourages to generate images that are closely linked to the text prompt.")
+    parser.add_argument("--seed", type=int, default=None, help="The seed for reproducibility.")
+    parser.add_argument("--execution_provider", type=str, default="NvTensorRTRTXExecutionProvider", help="The execution provider to use for ONNX Runtime.")
+    args = parser.parse_args()
+
+    if args.seed is not None:
+        torch.manual_seed(args.seed)
+
+    print("Loading models...")
+    vae_decoder = get_vae_decoder(args.model_path, 1, args.height, args.width, args.execution_provider)
+    transformer = get_transformer(args.model_path, 1, args.height, args.width, args.execution_provider)
+    vae_encoder = get_vae_encoder(args.model_path, 1, args.height, args.width, args.execution_provider)
+    text_encoder = get_text_encoder(args.model_path, 1, args.execution_provider)
+    text_encoder_2 = get_text_encoder_2(args.model_path, 1, args.execution_provider)
+    text_encoder_3 = get_text_encoder_3(args.model_path, 1, args.execution_provider)
+    print("Models loaded.")
+
+    # The sessions created above are passed to the pipeline as keyword arguments.
+    print("Creating pipeline...")
+    pipeline = ORTStableDiffusion3Pipeline.from_pretrained(
+                    args.model_path,
+                    use_io_binding=True,  # Not supported by Optimum version 1.17.1 at the time of verification.
+                    transformer_session=transformer,
+                    text_encoder_session=text_encoder,
+                    text_encoder_2_session=text_encoder_2,
+                    text_encoder_3_session=text_encoder_3,
+                    vae_encoder_session=vae_encoder,
+                    vae_decoder_session=vae_decoder,
+                )
+    print("Pipeline created.")
+
+    # One short (10-step) run outside the timed loop; its images are discarded.
+    print("Warmup iteration...")
+    pipeline(
+        prompt=[args.prompt]*1,
+        height=args.height,
+        width=args.width,
+        num_inference_steps=10,
+        negative_prompt=args.negative_prompt,
+        guidance_scale=args.guidance_scale,
+        max_sequence_length=77
+    )
+    print("Warmup finished.")
+
+    # Each iteration is bracketed by CUDA events recorded on torch's default stream;
+    # Event.elapsed_time reports the interval between them in milliseconds.
+    inference_times = []
+    all_images = []
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    print(f"Running inference for {args.num_iterations} iterations...")
+    for i in range(args.num_iterations):
+        print(f"Iteration {i+1}/{args.num_iterations}")
+        start_event.record(stream=torch.cuda.default_stream())
+        result = pipeline(
+            prompt=[args.prompt]*1,
+            height=args.height,
+            width=args.width,
+            num_inference_steps=args.steps,
+            negative_prompt=args.negative_prompt,
+            guidance_scale=args.guidance_scale,
+            max_sequence_length=77
+        )
+        end_event.record(stream=torch.cuda.default_stream())
+        end_event.synchronize()
+        elapsed_time = start_event.elapsed_time(end_event)
+        inference_times.append(elapsed_time)
+        all_images.extend(result.images)
+    print("Inference finished.")
+
+    # Save generated images to disk (parents=True so a nested --output_dir works)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    print(f"Saving images to {output_dir}...")
+
+    for idx, img in enumerate(all_images):
+        img_path = output_dir / f"generated_image_{idx+1}.png"
+        img.save(img_path)
+        print(f"Saved image {idx+1} to {img_path}")
+    print("Images saved.")
+
+    total_time = sum(inference_times)
+    print("\n--- Performance ---")
+    print(f"Total pipeline execution for {args.num_iterations} inferences took {total_time:.2f} ms")
+    if args.num_iterations > 0:
+        print(f"Average time per inference: {total_time / args.num_iterations:.2f} ms")
+        print(f"Median time per inference: {np.median(inference_times):.2f} ms")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file