diff --git a/MY_BUILD/ZERO_CHECK.proj b/MY_BUILD/ZERO_CHECK.proj
new file mode 100644
index 00000000..eeaea731
--- /dev/null
+++ b/MY_BUILD/ZERO_CHECK.proj
@@ -0,0 +1,11 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="16.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Target Name="Build" Inputs="C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\MY_BUILD\CMakeFiles\7038008a80b7806c9e4bb49bc892b370\generate.stamp.rule;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeCInformation.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeCXXInformation.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeCommonLanguageInclude.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeGenericSystem.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeInitializeConfigs.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeLanguageInformation.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeRCInformation.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeSystemSpecificInformation.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\CMakeSystemSpecificInitialize.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\Compiler\CMakeCommonCompilerMacros.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\Compiler\MSVC-C.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\Compiler\MSVC-CXX.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\Compiler\MSVC.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\FindCUDA.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\FindCUDA\run_nvcc.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\FindCUDA\select_compute_arch.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\FindOpenGL.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\FindPackageHandleStandardArgs.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\FindPackageMessage.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\Platform\Windows-MSVC-C.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\Platform\Windows-MSVC-CXX.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\Platform\Windows-MSVC.cmake;C:\Program Files\CMake\share\cmake-3.24\Modules\Platform\Windows.cmake;C:\Program 
Files\CMake\share\cmake-3.24\Modules\Platform\WindowsPaths.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\CMakeLists.txt;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\MY_BUILD\CMakeFiles\3.24.0-rc3\CMakeCCompiler.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\MY_BUILD\CMakeFiles\3.24.0-rc3\CMakeCXXCompiler.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\MY_BUILD\CMakeFiles\3.24.0-rc3\CMakeRCCompiler.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\MY_BUILD\CMakeFiles\3.24.0-rc3\CMakeSystem.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\MY_BUILD\CMakeFiles\cis565_path_tracer.dir\src\cis565_path_tracer_generated_pathtrace.cu.obj.depend;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\cmake\CUDAComputesList.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\cmake\FindGLEW.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\cmake\FindGLFW.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\cmake\FindGLM.cmake;C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\cmake\cuda_compute_capability.cpp" Outputs="C:\Users\ursai\Desktop\565\Project3-CUDA-Path-Tracer\MY_BUILD\CMakeFiles\generate.stamp">
+    <Message Text="Checking Build System" />
+    <Exec Command="setlocal&#10;&quot;C:\Program Files\CMake\bin\cmake.exe&quot; -SC:/Users/ursai/Desktop/565/Project3-CUDA-Path-Tracer -BC:/Users/ursai/Desktop/565/Project3-CUDA-Path-Tracer/MY_BUILD --check-stamp-list CMakeFiles/generate.stamp.list --vs-solution-file C:/Users/ursai/Desktop/565/Project3-CUDA-Path-Tracer/MY_BUILD/cis565_path_tracer.sln&#10;if %errorlevel% neq 0 goto :cmEnd&#10;:cmEnd&#10;endlocal &amp; call :cmErrorLevel %errorlevel% &amp; goto :cmDone&#10;:cmErrorLevel&#10;exit /b %1&#10;:cmDone&#10;if %errorlevel% neq 0 goto :VCEnd&#10;:VCEnd" />
+  </Target>
+  <Target Name="Clean" />
+  <Target Name="GetTargetPath" />
+  <Target Name="GetNativeManifest" />
+  <Target Name="GetCopyToOutputDirectoryItems" />
+</Project>
\ No newline at end of file
diff --git a/MY_BUILD/cornell.2022-12-02_13-04-25z.46samp.png b/MY_BUILD/cornell.2022-12-02_13-04-25z.46samp.png
new file mode 100644
index 00000000..35c0864b
Binary files /dev/null and b/MY_BUILD/cornell.2022-12-02_13-04-25z.46samp.png differ
diff --git a/MY_BUILD/imgui.ini b/MY_BUILD/imgui.ini
new file mode 100644
index 00000000..b67f522b
--- /dev/null
+++ b/MY_BUILD/imgui.ini
@@ -0,0 +1,10 @@
+[Window][Debug##Default]
+Pos=60,60
+Size=400,400
+Collapsed=0
+
+[Window][Path Tracer Analytics]
+Pos=60,60
+Size=339,65
+Collapsed=0
+
diff --git a/README.md b/README.md
index 110697ce..23606549 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,130 @@ CUDA Path Tracer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shixuan Fang  
+  - [LinkedIn](https://www.linkedin.com/in/shixuan-fang-4aba78222/)
+* Tested on: Windows 11, i7-12700kf, RTX3080Ti (Personal)
 
-### (TODO: Your README)
+<p align="center">
+  <img src="/img/final_picture.png" width="400" height="400">
+</p>
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+## Project Overview
 
+In this project, I implemented a CUDA-based path tracer, which is very different from a CPU path tracer. Path tracing on the CPU is usually recursive, but since CUDA either doesn't support recursion or, on newer GPUs, runs it very slowly, this path tracer uses an iterative approach.
+<p align="center">
+  <img style="float: right;" src="https://user-images.githubusercontent.com/54868517/194418589-a882f9be-abda-4ae0-afe7-5b39bb03791e.png" width="500" height="200">
+</p>
+
+Also, traditional CPU path tracing is done pixel by pixel, but as the figure shows, running the same algorithm (parallelized by pixel) on the GPU causes different threads to finish at different times and therefore causes a lot of divergence. The solution, then, is to parallelize by rays. We launch a kernel that traces **one** bounce for every ray from a ray pool, update the result, and then terminate rays with stream compaction.
+
+
+## Core Features
+
+### 1. Basic shading
+
+There are three basic shadings implemented --- pure diffuse, pure reflection, and both reflection and refraction. Diffuse is done by generating a new direction randomly across the hemisphere, reflection is done by reflecting the incident ray direction by the surface normal, and refraction is done by using Schlick's approximation to create Fresnel effect.
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194421652-410c82f6-60c2-4ca4-8200-2039355fc622.png" width="300" height="300">
+</p>
+
+### 2. Acceleration
+
+- Stream Compaction
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194587050-ec6e6061-d54c-47e0-a152-7444923fd8e4.png" width="550" height="300"/>
+</p>
+
+The first acceleration is to remove unnecessary rays that are terminated(remaining bounce = 0) by using Stream Compaction. In this project I used ```thrust::stable_partition``` to compact. As seen in the chart, stream compaction will give noticeable performance improvement, especially with higher object number. This is because there will be many rays shooting outside the scene and then be terminated.
+
+- Sort rays by material type
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194594883-13248d7a-6478-4733-9ef8-01ec234ad0ea.png" width="550" height="300"/>
+</p>
+
+The second acceleration is to sort all rays by their material types, which can result in a good memory access pattern for CUDA. In this project I used ```thrust::sort_by_key``` to sort. However, the result isn't what I expected, as sorting by material actually slows down the performance. I assume that sorting the materials costs more time than is gained through memory coalescing.
+
+
+- Cache the first bounce for re-use
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194600817-b4a46c4e-c241-4f3e-a6eb-920904a6a3b2.png" width="550" height="300"/>
+  
+</p>
+
+The third acceleration is to cache the first-bounce intersection data so that all other iterations can reuse the same data and save some work. As seen in the graph, across different max ray depths there is always an approximately 4-second performance improvement.
+
+### 3. Microfacet shading model
+
+The microfacet shading model is a very popular PBR model, which works by statistically modeling the scattering of light from a large collection of microfacets. This model is based on the Cook-Torrance reflection model, which is shown in the following equation.
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194615898-b098f124-5801-405e-81b0-9f7c3949de12.png">
+</p>
+
+In this equation, D represents the distribution of the microfacets' normals, G is the geometry term which describes the shadowing and masking effects of the surface, and F is the Fresnel term which describes how much light is reflected. There are many ways to describe these three terms, and in this project I chose the GGX approximation for the normal distribution function, Smith's shadowing and masking term as the approximation for the G function, and Schlick's approximation for the F function. In these functions, there are **2** important factors, **Metalness** and **Roughness**. The following two figures demonstrate how these two factors affect the result. In the left figure, all three bunnies have the same roughness but different metalness, and from right to left the bunny looks more like metal. In the right figure, all three bunnies have the same metalness but different roughness, and from left to right the bunny reflects less light and therefore looks darker.
+
+<p align="center">
+  <img src="img/diff_metalness.png" width="300" height="300"/>
+  <img src="img/diff_roughness.png" width="300" height="300"/> 
+</p>
+
+However, energy is not conserved in this model because it doesn't account for multiple bounces, as shown in the following figure (from [Kulla](https://blog.selfshadow.com/publications/s2017-shading-course/)).
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194618245-64e9fdaf-9901-4304-b361-8a455df197d2.png"  width="600" height="250"/>
+</p>
+
+This can be solved using the Kulla-Conty approximation, but since it's a little complicated and needs pre-computation, I didn't implement it in this project.
+
+### 4. Mesh loading with tinyObj
+
+In order to test my microfacet shading model, I needed to add more complex meshes and decided to use tinyObj to load ```OBJ``` format meshes. In order to speed up ray-mesh intersection, I created a toggleable AABB bounding box for intersection culling.
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194611878-93621d64-c496-41a8-a1d8-5f08f3eb3ef0.png" width="550" height="300"/>
+</p>
+
+As seen in the chart, with AABB mesh culling there is some performance improvement, but it's not as large as I expected. I think it's because the bounding box is very small anyway, so there isn't too much improvement.
+
+### 5. Stochastic Sampled Antialiasing
+
+According to [Stochastic Sampling](https://web.cs.wpi.edu/~matt/courses/cs563/talks/antialiasing/stochas.html), in stochastic sampling, every point has a finite probability of being hit. Stochastic sampling is extremely powerful as the visual system is much more sensitive to aliases than noise. The variant that I chose to implement is jittered sampling, which is done by perturbing the position through which the camera shoots each ray.
+
+**Left: no antialiasing**                  
+**right: with antialiasing**
+
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194441398-835325e3-cfa5-4f2b-9d6f-3d99b6948547.png" width="300" height="300"/>
+  <img src="https://user-images.githubusercontent.com/54868517/194441429-c3a3363c-28a6-4d64-b558-8e3af246329d.png" width="300" height="300"/> 
+</p>
+
+### 6. Physically-based Depth-of-field
+
+In regular path tracing, we assume that the camera is an ideal pinhole camera, which is not physically possible. In order to create a physically-based camera, a classic way is the **thin lens approximation**. The algorithm I chose to implement is adapted from [PBRT[6.2.3]](https://www.pbr-book.org/3ed-2018/Camera_Models/Projective_Camera_Models), and the idea is quite simple. We set another 2 parameters for the camera, which are **lensRadius** and **focalDistance**. We sample on a concentric disk based on the lensRadius, and then jitter the camera ray based on that sample position and focalDistance. This simple algorithm can actually create very beautiful results.
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/54868517/194445204-bee8a91c-3f3f-4e00-b104-3a88ff2d1b5e.png" width="300" height="300"/>
+  <img src="https://user-images.githubusercontent.com/54868517/194445206-c30230a2-3a20-41a0-856f-3ac8a5a502ae.png" width="300" height="300"/> 
+  <img src="https://user-images.githubusercontent.com/54868517/194445205-6e306d7c-6ef7-45c5-a879-7495620d6a27.png" width="300" height="300"/> 
+</p>
+
+### 7. Direct Lighting
+
+This is done by directing the final ray (remaining bounce = 1) toward a random point on an emissive object.
+It's a little hard to tell the difference, but the left one has no direct lighting and the right one has direct lighting.
+The right one is slightly brighter than the left one, and it also converges faster.
+
+<p align="center">
+  <img src="img/no_anti_alias.png" width="300" height="300"/>
+  <img src="img/direct_lighting.png" width="300" height="300"/> 
+</p>
+
+## References
+
+- https://www.pbr-book.org/3ed-2018/Reflection_Models/Microfacet_Models
+- https://blog.selfshadow.com/publications/s2017-shading-course/
+- https://www.pbr-book.org/3ed-2018/Camera_Models/Environment_Camera
+- https://web.cs.wpi.edu/~matt/courses/cs563/talks/antialiasing/stochas.html
diff --git a/external/include/cornell_box.mtl b/external/include/cornell_box.mtl
new file mode 100644
index 00000000..d3a1c7a6
--- /dev/null
+++ b/external/include/cornell_box.mtl
@@ -0,0 +1,24 @@
+newmtl white
+Ka 0 0 0
+Kd 1 1 1
+Ks 0 0 0
+
+newmtl red
+Ka 0 0 0
+Kd 1 0 0
+Ks 0 0 0
+
+newmtl green
+Ka 0 0 0
+Kd 0 1 0
+Ks 0 0 0
+
+newmtl blue
+Ka 0 0 0
+Kd 0 0 1
+Ks 0 0 0
+
+newmtl light
+Ka 20 20 20
+Kd 1 1 1
+Ks 0 0 0
diff --git a/external/include/stb_image.h b/external/include/stb_image.h
index b9b265fa..8b710609 100644
--- a/external/include/stb_image.h
+++ b/external/include/stb_image.h
@@ -1,5 +1,5 @@
-/* stb_image - v2.06 - public domain image loader - http://nothings.org/stb_image.h
-                                     no warranty implied; use at your own risk
+/* stb_image - v2.21 - public domain image loader - http://nothings.org/stb
+                                  no warranty implied; use at your own risk
 
    Do this:
       #define STB_IMAGE_IMPLEMENTATION
@@ -21,17 +21,20 @@
           avoid problematic images and only need the trivial interface
 
       JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
-      PNG 1/2/4/8-bit-per-channel (16 bpc not supported)
+      PNG 1/2/4/8/16-bit-per-channel
 
       TGA (not sure what subset, if a subset)
       BMP non-1bpp, non-RLE
-      PSD (composited view only, no extra channels)
+      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
 
       GIF (*comp always reports as 4-channel)
       HDR (radiance rgbE format)
       PIC (Softimage PIC)
       PNM (PPM and PGM binary only)
 
+      Animated GIF still needs a proper API, but here's one way to do it:
+          http://gist.github.com/urraka/685d9a6340b26b830d49
+
       - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
       - decode from arbitrary I/O callbacks
       - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
@@ -39,177 +42,68 @@
    Full documentation under "DOCUMENTATION" below.
 
 
-   Revision 2.00 release notes:
-
-      - Progressive JPEG is now supported.
-
-      - PPM and PGM binary formats are now supported, thanks to Ken Miller.
-
-      - x86 platforms now make use of SSE2 SIMD instructions for
-        JPEG decoding, and ARM platforms can use NEON SIMD if requested.
-        This work was done by Fabian "ryg" Giesen. SSE2 is used by
-        default, but NEON must be enabled explicitly; see docs.
-
-        With other JPEG optimizations included in this version, we see
-        2x speedup on a JPEG on an x86 machine, and a 1.5x speedup
-        on a JPEG on an ARM machine, relative to previous versions of this
-        library. The same results will not obtain for all JPGs and for all
-        x86/ARM machines. (Note that progressive JPEGs are significantly
-        slower to decode than regular JPEGs.) This doesn't mean that this
-        is the fastest JPEG decoder in the land; rather, it brings it
-        closer to parity with standard libraries. If you want the fastest
-        decode, look elsewhere. (See "Philosophy" section of docs below.)
-
-        See final bullet items below for more info on SIMD.
-
-      - Added STBI_MALLOC, STBI_REALLOC, and STBI_FREE macros for replacing
-        the memory allocator. Unlike other STBI libraries, these macros don't
-        support a context parameter, so if you need to pass a context in to
-        the allocator, you'll have to store it in a global or a thread-local
-        variable.
-
-      - Split existing STBI_NO_HDR flag into two flags, STBI_NO_HDR and
-        STBI_NO_LINEAR.
-            STBI_NO_HDR:     suppress implementation of .hdr reader format
-            STBI_NO_LINEAR:  suppress high-dynamic-range light-linear float API
-
-      - You can suppress implementation of any of the decoders to reduce
-        your code footprint by #defining one or more of the following
-        symbols before creating the implementation.
-
-            STBI_NO_JPEG
-            STBI_NO_PNG
-            STBI_NO_BMP
-            STBI_NO_PSD
-            STBI_NO_TGA
-            STBI_NO_GIF
-            STBI_NO_HDR
-            STBI_NO_PIC
-            STBI_NO_PNM   (.ppm and .pgm)
-
-      - You can request *only* certain decoders and suppress all other ones
-        (this will be more forward-compatible, as addition of new decoders
-        doesn't require you to disable them explicitly):
-
-            STBI_ONLY_JPEG
-            STBI_ONLY_PNG
-            STBI_ONLY_BMP
-            STBI_ONLY_PSD
-            STBI_ONLY_TGA
-            STBI_ONLY_GIF
-            STBI_ONLY_HDR
-            STBI_ONLY_PIC
-            STBI_ONLY_PNM   (.ppm and .pgm)
-
-         Note that you can define multiples of these, and you will get all
-         of them ("only x" and "only y" is interpreted to mean "only x&y").
-
-       - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
-         want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
-
-      - Compilation of all SIMD code can be suppressed with
-            #define STBI_NO_SIMD
-        It should not be necessary to disable SIMD unless you have issues
-        compiling (e.g. using an x86 compiler which doesn't support SSE
-        intrinsics or that doesn't support the method used to detect
-        SSE2 support at run-time), and even those can be reported as
-        bugs so I can refine the built-in compile-time checking to be
-        smarter.
-
-      - The old STBI_SIMD system which allowed installing a user-defined
-        IDCT etc. has been removed. If you need this, don't upgrade. My
-        assumption is that almost nobody was doing this, and those who
-        were will find the built-in SIMD more satisfactory anyway.
-
-      - RGB values computed for JPEG images are slightly different from
-        previous versions of stb_image. (This is due to using less
-        integer precision in SIMD.) The C code has been adjusted so
-        that the same RGB values will be computed regardless of whether
-        SIMD support is available, so your app should always produce
-        consistent results. But these results are slightly different from
-        previous versions. (Specifically, about 3% of available YCbCr values
-        will compute different RGB results from pre-1.49 versions by +-1;
-        most of the deviating values are one smaller in the G channel.)
-
-      - If you must produce consistent results with previous versions of
-        stb_image, #define STBI_JPEG_OLD and you will get the same results
-        you used to; however, you will not get the SIMD speedups for
-        the YCbCr-to-RGB conversion step (although you should still see
-        significant JPEG speedup from the other changes).
-
-        Please note that STBI_JPEG_OLD is a temporary feature; it will be
-        removed in future versions of the library. It is only intended for
-        near-term back-compatibility use.
-
-
-   Latest revision history:
-      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
-      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
-      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
-      2.03  (2015-04-12) additional corruption checking
-                         stbi_set_flip_vertically_on_load
-                         fix NEON support; fix mingw support
-      2.02  (2015-01-19) fix incorrect assert, fix warning
-      2.01  (2015-01-17) fix various warnings
-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
-      2.00  (2014-12-25) optimize JPEG, including x86 SSE2 & ARM NEON SIMD
-                         progressive JPEG
-                         PGM/PPM support
-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
-                         STBI_NO_*, STBI_ONLY_*
-                         GIF bugfix
-      1.48  (2014-12-14) fix incorrectly-named assert()
-      1.47  (2014-12-14) 1/2/4-bit PNG support (both grayscale and paletted)
-                         optimize PNG
-                         fix bug in interlaced PNG with user-specified channel count
+LICENSE
+
+  See end of file for license information.
+
+RECENT REVISION HISTORY:
+
+      2.21  (2019-02-25) fix typo in comment
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs 
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
+      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
+      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack;
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
 
    See end of file for full revision history.
 
 
  ============================    Contributors    =========================
 
- Image formats                                Bug fixes & warning fixes
-    Sean Barrett (jpeg, png, bmp)                Marc LeBlanc
-    Nicolas Schulz (hdr, psd)                    Christpher Lloyd
-    Jonathan Dummer (tga)                        Dave Moore
-    Jean-Marc Lienher (gif)                      Won Chun
-    Tom Seddon (pic)                             the Horde3D community
-    Thatcher Ulrich (psd)                        Janez Zemva
-    Ken Miller (pgm, ppm)                        Jonathan Blow
-                                                 Laurent Gomila
-                                                 Aruelien Pocheville
- Extensions, features                            Ryamond Barbiero
-    Jetro Lauha (stbi_info)                      David Woo
-    Martin "SpartanJ" Golini (stbi_info)         Martin Golini
-    James "moose2000" Brown (iPhone PNG)         Roy Eltham
-    Ben "Disch" Wenger (io callbacks)            Luke Graham
-    Omar Cornut (1/2/4-bit PNG)                  Thomas Ruf
-    Nicolas Guillemot (vertical flip)            John Bartholomew
-                                                 Ken Hamada
- Optimizations & bugfixes                        Cort Stratton
-    Fabian "ryg" Giesen                          Blazej Dariusz Roszkowski
-    Arseny Kapoulkine                            Thibault Reuille
-                                                 Paul Du Bois
-                                                 Guillaume George
-  If your name should be here but                Jerry Jansson
-  isn't, let Sean know.                          Hayaki Saito
-                                                 Johan Duparc
-                                                 Ronny Chevalier
-                                                 Michal Cichon
-                                                 Tero Hanninen
-                                                 Sergio Gonzalez
-                                                 Cass Everitt
-                                                 Engin Manap
-                                                 Martins Mozeiko
-                                                 Joseph Thomson
-                                                 Phil Jordan
-
-LICENSE
-
-This software is in the public domain. Where that dedication is not
-recognized, you are granted a perpetual, irrevocable license to copy,
-distribute, and modify this file as you see fit.
-
+ Image formats                          Extensions, features
+    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
+    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
+    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
+    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
+    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
+    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
+    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
+    github:urraka (animated gif)           Junggon Kim (PNM comments)
+    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
+                                           socks-the-fox (16-bit PNG)
+                                           Jeremy Sawicki (handle all ImageNet JPGs)
+ Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
+    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
+    Arseny Kapoulkine
+    John-Mark Allen
+    Carmelo J Fdez-Aguera
+
+ Bug & warning fixes
+    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
+    Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
+    Won Chun                Luke Graham        Johan Duparc       Nick Verigakis
+    the Horde3D community   Thomas Ruf         Ronny Chevalier    github:rlyeh
+    Janez Zemva             John Bartholomew   Michal Cichon      github:romigrou
+    Jonathan Blow           Ken Hamada         Tero Hanninen      github:svdijk
+    Laurent Gomila          Cort Stratton      Sergio Gonzalez    github:snagar
+    Aruelien Pocheville     Thibault Reuille   Cass Everitt       github:Zelex
+    Ryamond Barbiero        Paul Du Bois       Engin Manap        github:grim210
+    Aldo Culquicondor       Philipp Wiesemann  Dale Weiler        github:sammyhw
+    Oriol Ferrer Mesia      Josh Tobin         Matthew Gregan     github:phprus
+    Julian Raschke          Gregory Mullen     Baldur Karlsson    github:poppolopoppo
+    Christian Floisand      Kevin Schmidt      JR Smith           github:darealshinji
+    Blazej Dariusz Roszkowski                                     github:Michaelangel007
 */
 
 #ifndef STBI_INCLUDE_STB_IMAGE_H
@@ -218,10 +112,8 @@ distribute, and modify this file as you see fit.
 // DOCUMENTATION
 //
 // Limitations:
-//    - no 16-bit-per-channel PNG
 //    - no 12-bit-per-channel JPEG
 //    - no JPEGs with arithmetic coding
-//    - no 1-bit BMP
 //    - GIF always returns *comp=4
 //
 // Basic usage (see HDR discussion below for HDR usage):
@@ -234,10 +126,10 @@ distribute, and modify this file as you see fit.
 //    stbi_image_free(data)
 //
 // Standard parameters:
-//    int *x       -- outputs image width in pixels
-//    int *y       -- outputs image height in pixels
-//    int *comp    -- outputs # of image components in image file
-//    int req_comp -- if non-zero, # of image components requested in result
+//    int *x                 -- outputs image width in pixels
+//    int *y                 -- outputs image height in pixels
+//    int *channels_in_file  -- outputs # of image components in image file
+//    int desired_channels   -- if non-zero, # of image components requested in result
 //
 // The return value from an image loader is an 'unsigned char *' which points
 // to the pixel data, or NULL on an allocation failure or if the image is
@@ -245,11 +137,12 @@ distribute, and modify this file as you see fit.
 // with each pixel consisting of N interleaved 8-bit components; the first
 // pixel pointed to is top-left-most in the image. There is no padding between
 // image scanlines or between pixels, regardless of format. The number of
-// components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
-// If req_comp is non-zero, *comp has the number of components that _would_
-// have been output otherwise. E.g. if you set req_comp to 4, you will always
-// get RGBA output, but you can check *comp to see if it's trivially opaque
-// because e.g. there were only 3 channels in the source image.
+// components N is 'desired_channels' if desired_channels is non-zero, or
+// *channels_in_file otherwise. If desired_channels is non-zero,
+// *channels_in_file has the number of components that _would_ have been
+// output otherwise. E.g. if you set desired_channels to 4, you will always
+// get RGBA output, but you can check *channels_in_file to see if it's trivially
+// opaque because e.g. there were only 3 channels in the source image.
 //
 // An output image with N components has the following components interleaved
 // in this order in each pixel:
@@ -261,16 +154,26 @@ distribute, and modify this file as you see fit.
 //       4           red, green, blue, alpha
 //
 // If image loading fails for any reason, the return value will be NULL,
-// and *x, *y, *comp will be unchanged. The function stbi_failure_reason()
-// can be queried for an extremely brief, end-user unfriendly explanation
-// of why the load failed. Define STBI_NO_FAILURE_STRINGS to avoid
-// compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// and *x, *y, *channels_in_file will be unchanged. The function
+// stbi_failure_reason() can be queried for an extremely brief, end-user
+// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
+// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
 // more user-friendly ones.
 //
 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
 //
 // ===========================================================================
 //
+// UNICODE:
+//
+//   If compiling for Windows and you wish to use Unicode filenames, compile
+//   with
+//       #define STBI_WINDOWS_UTF8
+//   and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert
+//   Windows wchar_t filenames to utf8.
+//
+// ===========================================================================
+//
 // Philosophy
 //
 // stb libraries are designed with the following priorities:
@@ -281,15 +184,15 @@ distribute, and modify this file as you see fit.
 //
 // Sometimes I let "good performance" creep up in priority over "easy to maintain",
 // and for best performance I may provide less-easy-to-use APIs that give higher
-// performance, in addition to the easy to use ones. Nevertheless, it's important
+// performance, in addition to the easy-to-use ones. Nevertheless, it's important
 // to keep in mind that from the standpoint of you, a client of this library,
-// all you care about is #1 and #3, and stb libraries do not emphasize #3 above all.
+// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
 //
 // Some secondary priorities arise directly from the first two, some of which
-// make more explicit reasons why performance can't be emphasized.
+// provide more explicit reasons why performance can't be emphasized.
 //
 //    - Portable ("ease of use")
-//    - Small footprint ("easy to maintain")
+//    - Small source code footprint ("easy to maintain")
 //    - No dependencies ("ease of use")
 //
 // ===========================================================================
@@ -321,13 +224,6 @@ distribute, and modify this file as you see fit.
 // (at least this is true for iOS and Android). Therefore, the NEON support is
 // toggled by a build flag: define STBI_NEON to get NEON loops.
 //
-// The output of the JPEG decoder is slightly different from versions where
-// SIMD support was introduced (that is, for versions before 1.49). The
-// difference is only +-1 in the 8-bit RGB channels, and only on a small
-// fraction of pixels. You can force the pre-1.49 behavior by defining
-// STBI_JPEG_OLD, but this will disable some of the SIMD decoding path
-// and hence cost some performance.
-//
 // If for some reason you do not want to use any of SIMD code, or if
 // you have issues compiling it, you can disable it entirely by
 // defining STBI_NO_SIMD.
@@ -336,11 +232,10 @@ distribute, and modify this file as you see fit.
 //
 // HDR image support   (disable by defining STBI_NO_HDR)
 //
-// stb_image now supports loading HDR images in general, and currently
-// the Radiance .HDR file format, although the support is provided
-// generically. You can still load any file through the existing interface;
-// if you attempt to load an HDR file, it will be automatically remapped to
-// LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
+// stb_image supports loading HDR images in general, and currently the Radiance
+// .HDR file format specifically. You can still load any file through the existing
+// interface; if you attempt to load an HDR file, it will be automatically remapped
+// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
 // both of these constants can be reconfigured through this interface:
 //
 //     stbi_hdr_to_ldr_gamma(2.2f);
@@ -374,7 +269,7 @@ distribute, and modify this file as you see fit.
 //
 // By default we convert iphone-formatted PNGs back to RGB, even though
 // they are internally encoded differently. You can disable this conversion
-// by by calling stbi_convert_iphone_png_to_rgb(0), in which case
+// by calling stbi_convert_iphone_png_to_rgb(0), in which case
 // you will always just get the native iphone "format" through (which
 // is BGR stored in RGB).
 //
@@ -383,6 +278,41 @@ distribute, and modify this file as you see fit.
 // says there's premultiplied data (currently only happens in iPhone images,
 // and only if iPhone convert-to-rgb processing is on).
 //
+// ===========================================================================
+//
+// ADDITIONAL CONFIGURATION
+//
+//  - You can suppress implementation of any of the decoders to reduce
+//    your code footprint by #defining one or more of the following
+//    symbols before creating the implementation.
+//
+//        STBI_NO_JPEG
+//        STBI_NO_PNG
+//        STBI_NO_BMP
+//        STBI_NO_PSD
+//        STBI_NO_TGA
+//        STBI_NO_GIF
+//        STBI_NO_HDR
+//        STBI_NO_PIC
+//        STBI_NO_PNM   (.ppm and .pgm)
+//
+//  - You can request *only* certain decoders and suppress all other ones
+//    (this will be more forward-compatible, as addition of new decoders
+//    doesn't require you to disable them explicitly):
+//
+//        STBI_ONLY_JPEG
+//        STBI_ONLY_PNG
+//        STBI_ONLY_BMP
+//        STBI_ONLY_PSD
+//        STBI_ONLY_TGA
+//        STBI_ONLY_GIF
+//        STBI_ONLY_HDR
+//        STBI_ONLY_PIC
+//        STBI_ONLY_PNM   (.ppm and .pgm)
+//
+//  - If you use STBI_NO_PNG (or a set of STBI_ONLY_* that omits PNG), and you
+//    still want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+//
 
 
 #ifndef STBI_NO_STDIO
@@ -393,7 +323,7 @@ distribute, and modify this file as you see fit.
 
 enum
 {
-   STBI_default = 0, // only used for req_comp
+   STBI_default = 0, // only used for desired_channels
 
    STBI_grey       = 1,
    STBI_grey_alpha = 2,
@@ -401,7 +331,9 @@ enum
    STBI_rgb_alpha  = 4
 };
 
+#include <stdlib.h>
 typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
 
 #ifdef __cplusplus
 extern "C" {
@@ -429,34 +361,64 @@ typedef struct
    int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
 
-STBIDEF stbi_uc *stbi_load               (char              const *filename,           int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *comp, int req_comp);
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
 
 #ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f,                  int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
 // for stbi_load_from_file, file pointer is left pointing immediately after image
 #endif
 
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
+#ifdef STBI_WINDOWS_UTF8
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
 #ifndef STBI_NO_LINEAR
-   STBIDEF float *stbi_loadf                 (char const *filename,           int *x, int *y, int *comp, int req_comp);
-   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
 
    #ifndef STBI_NO_STDIO
-   STBIDEF float *stbi_loadf_from_file  (FILE *f,                int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
    #endif
 #endif
 
 #ifndef STBI_NO_HDR
    STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
    STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
-#endif
+#endif // STBI_NO_HDR
 
 #ifndef STBI_NO_LINEAR
    STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
    STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
-#endif // STBI_NO_HDR
+#endif // STBI_NO_LINEAR
 
 // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
 STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
@@ -477,11 +439,14 @@ STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
 // get image dimensions & components without fully decoding
 STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
 STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
 
 #ifndef STBI_NO_STDIO
-STBIDEF int      stbi_info            (char const *filename,     int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
-
+STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit          (char const *filename);
+STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
 #endif
 
 
@@ -562,9 +527,10 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 #include <stddef.h> // ptrdiff_t on osx
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>  // ldexp
+#include <math.h>  // ldexp, pow
 #endif
 
 #ifndef STBI_NO_STDIO
@@ -576,6 +542,12 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 #define STBI_ASSERT(x) assert(x)
 #endif
 
+#ifdef __cplusplus
+#define STBI_EXTERN extern "C"
+#else
+#define STBI_EXTERN extern
+#endif
+
 
 #ifndef _MSC_VER
    #ifdef __cplusplus
@@ -620,18 +592,22 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
    #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
 #endif
 
-#if defined(STBI_MALLOC) && defined(STBI_FREE) && defined(STBI_REALLOC)
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
 // ok
-#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC)
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
 // ok
 #else
-#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC."
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
 #endif
 
 #ifndef STBI_MALLOC
-#define STBI_MALLOC(sz)    malloc(sz)
-#define STBI_REALLOC(p,sz) realloc(p,sz)
-#define STBI_FREE(p)       free(p)
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
 #endif
 
 // x86/x64 detection
@@ -641,12 +617,14 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI__X86_TARGET
 #endif
 
-#if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
-// NOTE: not clear do we actually need this for the 64-bit path?
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
 // gcc doesn't support sse2 intrinsics unless you compile with -msse2,
-// (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
-// this is just broken and gcc are jerks for not fixing it properly
-// http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
 #define STBI_NO_SIMD
 #endif
 
@@ -665,7 +643,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI_NO_SIMD
 #endif
 
-#if !defined(STBI_NO_SIMD) && defined(STBI__X86_TARGET)
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
 #define STBI_SSE2
 #include <emmintrin.h>
 
@@ -694,25 +672,27 @@ static int stbi__cpuid3(void)
 
 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
 
-static int stbi__sse2_available()
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
 {
    int info3 = stbi__cpuid3();
    return ((info3 >> 26) & 1) != 0;
 }
+#endif
+
 #else // assume GCC-style if not VC++
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
-static int stbi__sse2_available()
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
 {
-#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 // GCC 4.8 or later
-   // GCC 4.8+ has a nice way to do this
-   return __builtin_cpu_supports("sse2");
-#else
-   // portable way to do this, preferably without using GCC inline ASM?
-   // just bail for now.
-   return 0;
-#endif
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
+   return 1;
 }
+#endif
+
 #endif
 #endif
 
@@ -750,7 +730,7 @@ typedef struct
    stbi_uc buffer_start[128];
 
    stbi_uc *img_buffer, *img_buffer_end;
-   stbi_uc *img_buffer_original;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
 } stbi__context;
 
 
@@ -762,7 +742,7 @@ static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
    s->io.read = NULL;
    s->read_from_callbacks = 0;
    s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
-   s->img_buffer_end = (stbi_uc *) buffer+len;
+   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
 }
 
 // initialize a callback-based context
@@ -774,6 +754,7 @@ static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *
    s->read_from_callbacks = 1;
    s->img_buffer_original = s->buffer_start;
    stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
 }
 
 #ifndef STBI_NO_STDIO
@@ -815,59 +796,76 @@ static void stbi__rewind(stbi__context *s)
    // but we just rewind to the beginning of the initial buffer, because
    // we only use it after doing 'test', which only ever looks at at most 92 bytes
    s->img_buffer = s->img_buffer_original;
+   s->img_buffer_end = s->img_buffer_original_end;
 }
 
+enum // channel-order tags stored in stbi__result_info.channel_order
+{
+   STBI_ORDER_RGB, // value 0; the order stbi__load_main sets by default
+   STBI_ORDER_BGR  // value 1; no code in this part of the file assigns it
+};
+
+typedef struct // per-load metadata: initialised by stbi__load_main, then passed on to the format decoder
+{
+   int bits_per_channel; // the postprocess helpers expect 8 or 16 here and convert when it differs from the requested depth
+   int num_channels;     // zeroed by stbi__load_main; no reader is visible in this part of the file
+   int channel_order;    // STBI_ORDER_RGB or STBI_ORDER_BGR
+} stbi__result_info;
+
 #ifndef STBI_NO_JPEG
 static int      stbi__jpeg_test(stbi__context *s);
-static stbi_uc *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNG
 static int      stbi__png_test(stbi__context *s);
-static stbi_uc *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__png_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_BMP
 static int      stbi__bmp_test(stbi__context *s);
-static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_TGA
 static int      stbi__tga_test(stbi__context *s);
-static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PSD
 static int      stbi__psd_test(stbi__context *s);
-static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
 static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__psd_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_HDR
 static int      stbi__hdr_test(stbi__context *s);
-static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PIC
 static int      stbi__pic_test(stbi__context *s);
-static stbi_uc *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_GIF
 static int      stbi__gif_test(stbi__context *s);
-static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
 static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNM
 static int      stbi__pnm_test(stbi__context *s);
-static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
@@ -890,6 +888,81 @@ static void *stbi__malloc(size_t size)
     return STBI_MALLOC(size);
 }
 
+// stb_image uses ints pervasively, including for offset calculations.
+// therefore the largest decoded image size we can support with the
+// current code, even on 64-bit targets, is INT_MAX. this is not a
+// significant limitation for the intended use case.
+//
+// we do, however, need to make sure our size calculations don't
+// overflow. hence a few helper functions for size calculations that
+// multiply integers together, making sure that they're non-negative
+// and no overflow occurs.
+
+// return 1 if the sum a+b fits in an int, 0 if it would overflow.
+// a negative b is rejected; a itself is not sign-checked here.
+static int stbi__addsizes_valid(int a, int b)
+{
+   if (b < 0) return 0; // reject a negative addend up front
+   // now 0 <= b <= INT_MAX, hence also
+   // 0 <= INT_MAX - b <= INT_MAX.
+   // And "a + b <= INT_MAX" (which might overflow) is the
+   // same as a <= INT_MAX - b (no overflow)
+   return a <= INT_MAX - b;
+}
+
+// returns 1 if the product a*b fits in an int, 0 on overflow.
+// negative factors are considered invalid.
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (a < 0 || b < 0) return 0; // either factor negative: invalid
+   if (b == 0) return 1; // mul-by-0 is always safe, and returning here keeps the division below away from zero
+   // portable overflow check: with b > 0, a*b <= INT_MAX exactly when a <= INT_MAX/b
+   return a <= INT_MAX/b;
+}
+
+// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add); // && short-circuits, so a*b is only evaluated once it is known not to overflow
+}
+
+// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && // each partial product is validated before it is used as an operand
+      stbi__addsizes_valid(a*b*c, add);
+}
+
+// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) // compiled out only when both STBI_NO_LINEAR and STBI_NO_HDR are defined
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && // same chain as the 3-factor check, with one more factor
+      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
+}
+#endif
+
+// mallocs with size overflow checking: NULL is returned, without allocating, when the size computation is invalid
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   if (!stbi__mad2sizes_valid(a, b, add)) return NULL; // negative input or int overflow in a*b + add
+   return stbi__malloc(a*b + add); // safe to evaluate: validated just above
+}
+
+static void *stbi__malloc_mad3(int a, int b, int c, int add) // allocates a*b*c + add bytes, or returns NULL if that size is invalid
+{
+   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; // negative input or int overflow
+   return stbi__malloc(a*b*c + add); // safe to evaluate: validated just above
+}
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) // same guard as stbi__mad4sizes_valid, which this relies on
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) // allocates a*b*c*d + add bytes, or returns NULL if that size is invalid
+{
+   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; // negative input or int overflow
+   return stbi__malloc(a*b*c*d + add); // safe to evaluate: validated just above
+}
+#endif
+
 // stbi__err - error
 // stbi__errpf - error returning pointer to float
 // stbi__errpuc - error returning pointer to unsigned char
@@ -902,8 +975,8 @@ static void *stbi__malloc(size_t size)
    #define stbi__err(x,y)  stbi__err(x)
 #endif
 
-#define stbi__errpf(x,y)   ((float *) (stbi__err(x,y)?NULL:NULL))
-#define stbi__errpuc(x,y)  ((unsigned char *) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
 
 STBIDEF void stbi_image_free(void *retval_from_stbi_load)
 {
@@ -925,33 +998,38 @@ STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
     stbi__vertically_flip_on_load = flag_true_if_should_flip;
 }
 
-static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
 {
+   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+   ri->num_channels = 0;
+
    #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp);
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PNG
-   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp);
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_BMP
-   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp);
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_GIF
-   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp);
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PSD
-   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp);
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
    #endif
    #ifndef STBI_NO_PIC
-   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp);
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PNM
-   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp);
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
    #endif
 
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
-      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp);
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
       return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
    }
    #endif
@@ -959,65 +1037,175 @@ static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *com
    #ifndef STBI_NO_TGA
    // test tga last because it's a crappy test!
    if (stbi__tga_test(s))
-      return stbi__tga_load(s,x,y,comp,req_comp);
+      return stbi__tga_load(s,x,y,comp,req_comp, ri);
    #endif
 
    return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
 }
 
-static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels) // returns a new 8-bit copy of 'orig' (w*h*channels samples); 'orig' is always released
 {
-   unsigned char *result = stbi__load_main(s, x, y, comp, req_comp);
+   int i;
+   int img_len = w * h * channels; // number of samples, not bytes
+   stbi_uc *reduced;
 
-   if (stbi__vertically_flip_on_load && result != NULL) {
-      int w = *x, h = *y;
-      int depth = req_comp ? req_comp : *comp;
-      int row,col,z;
-      stbi_uc temp;
-
-      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
-      for (row = 0; row < (h>>1); row++) {
-         for (col = 0; col < w; col++) {
-            for (z = 0; z < depth; z++) {
-               temp = result[(row * w + col) * depth + z];
-               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
-               result[((h - row - 1) * w + col) * depth + z] = temp;
-            }
-         }
+   reduced = (stbi_uc *) stbi__malloc(img_len);
+   if (reduced == NULL) { STBI_FREE(orig); return stbi__errpuc("outofmem", "Out of memory"); } // we own 'orig': free it on failure too, since the caller overwrites its only pointer to it
+
+   for (i = 0; i < img_len; ++i)
+      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // keeping the high byte of each 16-bit sample is a sufficient approximation of 16->8 bit scaling
+
+   STBI_FREE(orig);
+   return reduced;
+}
+
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels) // returns a new 16-bit copy of 'orig' (w*h*channels samples); 'orig' is always released
+{
+   int i;
+   int img_len = w * h * channels; // number of samples, not bytes
+   stbi__uint16 *enlarged;
+
+   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2); // two bytes per output sample
+   if (enlarged == NULL) { STBI_FREE(orig); return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); } // we own 'orig': free it on failure too, since the caller overwrites its only pointer to it
+
+   for (i = 0; i < img_len; ++i)
+      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+
+   STBI_FREE(orig);
+   return enlarged;
+}
+
+static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) // flips the rows of 'image' in place: row r is exchanged with row h-1-r
+{
+   int row;
+   size_t bytes_per_row = (size_t)w * bytes_per_pixel; // w is widened to size_t before the multiply
+   stbi_uc temp[2048]; // fixed-size scratch buffer; longer rows are swapped in several chunks
+   stbi_uc *bytes = (stbi_uc *)image;
+
+   for (row = 0; row < (h>>1); row++) { // visits the top h/2 rows; for odd h the middle row is left alone
+      stbi_uc *row0 = bytes + row*bytes_per_row;
+      stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; // the row mirrored about the horizontal centre
+      // swap row0 with row1
+      size_t bytes_left = bytes_per_row;
+      while (bytes_left) {
+         size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp); // min(remaining, sizeof(temp))
+         memcpy(temp, row0, bytes_copy);
+         memcpy(row0, row1, bytes_copy);
+         memcpy(row1, temp, bytes_copy);
+         row0 += bytes_copy; // advance both cursors past the chunk just swapped
+         row1 += bytes_copy;
+         bytes_left -= bytes_copy;
       }
    }
+}
 
-   return result;
+#ifndef STBI_NO_GIF // only built when the GIF decoder is enabled
+static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel) // flips each of z consecutive w*h images stored back to back in 'image'
+{
+   int slice;
+   int slice_size = w * h * bytes_per_pixel; // bytes occupied by one slice
+
+   stbi_uc *bytes = (stbi_uc *)image;
+   for (slice = 0; slice < z; ++slice) {
+      stbi__vertical_flip(bytes, w, h, bytes_per_pixel); // flip this slice in place
+      bytes += slice_size; // step to the start of the next slice
+   }
+}
+#endif
+
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) // decodes via stbi__load_main, forces 8 bits per channel, applies the optional vertical flip; NULL on failure
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+   if (result == NULL)
+      return NULL;
+
+   if (ri.bits_per_channel != 8) { // the decoder produced something other than 8-bit data
+      STBI_ASSERT(ri.bits_per_channel == 16);
+      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); // may return NULL if its allocation fails
+      ri.bits_per_channel = 8;
+   }
+
+   // @TODO: move stbi__convert_format to here
+
+   if (stbi__vertically_flip_on_load && result != NULL) { // skip the flip when the conversion above failed: it would memcpy through a null pointer
+      int channels = req_comp ? req_comp : *comp; // channel count of the returned buffer
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
+   }
+
+   return (unsigned char *) result;
+}
+
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) // decodes via stbi__load_main, forces 16 bits per channel, applies the optional vertical flip; NULL on failure
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+
+   if (result == NULL)
+      return NULL;
+
+   if (ri.bits_per_channel != 16) { // the decoder produced something other than 16-bit data
+      STBI_ASSERT(ri.bits_per_channel == 8);
+      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp); // may return NULL if its allocation fails
+      ri.bits_per_channel = 16;
+   }
+
+   // @TODO: move stbi__convert_format16 to here
+   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+
+   if (stbi__vertically_flip_on_load && result != NULL) { // skip the flip when the conversion above failed: it would memcpy through a null pointer
+      int channels = req_comp ? req_comp : *comp; // channel count of the returned buffer
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
+   }
+
+   return (stbi__uint16 *) result;
 }
 
+#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) // built only when both HDR and linear-float support are enabled
 static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
 {
    if (stbi__vertically_flip_on_load && result != NULL) {
-      int w = *x, h = *y;
-      int depth = req_comp ? req_comp : *comp;
-      int row,col,z;
-      float temp;
-
-      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
-      for (row = 0; row < (h>>1); row++) {
-         for (col = 0; col < w; col++) {
-            for (z = 0; z < depth; z++) {
-               temp = result[(row * w + col) * depth + z];
-               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
-               result[((h - row - 1) * w + col) * depth + z] = temp;
-            }
-         }
-      }
+      int channels = req_comp ? req_comp : *comp; // channel count of the returned buffer
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); // shared row-swap helper; pixel size is given in bytes
    }
 }
-
+#endif
 
 #ifndef STBI_NO_STDIO
 
+#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) // local prototypes for the two Win32 conversion calls used below (presumably so <windows.h> need not be included)
+STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); // used by stbi__fopen to widen UTF-8 strings
+STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); // used by stbi_convert_wchar_to_utf8
+#endif
+
+#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) // only available on MSVC builds that opt in to UTF-8 filenames
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) // converts the NUL-terminated wide string 'input' to UTF-8 in 'buffer'; returns WideCharToMultiByte's result
+{
+	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (bufferlen > (size_t) INT_MAX) ? INT_MAX : (int) bufferlen, NULL, NULL); // the size parameter is an int: clamp instead of letting a huge size_t wrap to a negative value
+}
+#endif
+
 static FILE *stbi__fopen(char const *filename, char const *mode)
 {
    FILE *f;
-#if defined(_MSC_VER) && _MSC_VER >= 1400
+#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) // UTF-8 path: widen filename and mode, then open with the wide-character CRT call
+   wchar_t wMode[64];
+   wchar_t wFilename[1024]; // a name that does not fit, terminator included, makes the conversion below fail
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) // the last argument is a count of wchar_t, not bytes; passing sizeof() alone overstated the buffer
+      return 0;
+	
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) // element count again, for the same reason
+      return 0;
+
+#if _MSC_VER >= 1400 // use the _s variant of the open call on these compiler versions
+	if (0 != _wfopen_s(&f, wFilename, wMode))
+		f = 0; // report failure as a null FILE*, like the other branches
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
    if (0 != fopen_s(&f, filename, mode))
       f=0;
 #else
@@ -1042,28 +1230,83 @@ STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req
    unsigned char *result;
    stbi__context s;
    stbi__start_file(&s,f);
-   result = stbi__load_flip(&s,x,y,comp,req_comp);
+   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+   if (result) {
+      // need to 'unget' all the characters in the IO buffer
+      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
+   }
+   return result;
+}
+
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__uint16 *result;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
    if (result) {
       // need to 'unget' all the characters in the IO buffer
       fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
    }
    return result;
 }
+
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
+{  // Opens filename for binary reading and decodes it through stbi_load_from_file_16.
+   FILE *f = stbi__fopen(filename, "rb");
+   stbi__uint16 *result;
+   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); // open failed: report the error, nothing to close
+   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
+   fclose(f); // the handle was opened here, so it is closed here whether or not decoding succeeded
+   return result;
+}
+
+
 #endif //!STBI_NO_STDIO
 
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
+{  // 16-bit loader that reads from the caller's buffer of len bytes.
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len); // point the context at the in-memory data
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); // result is returned to the caller unchanged
+}
+
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
+{  // 16-bit loader that reads through the caller-supplied I/O callbacks; user is handed to stbi__start_callbacks alongside them.
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); // the cast drops const because stbi__start_callbacks is passed a non-const pointer
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); // result is returned to the caller unchanged
+}
+
 STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_mem(&s,buffer,len);
-   return stbi__load_flip(&s,x,y,comp,req_comp);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 }
 
 STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__load_flip(&s,x,y,comp,req_comp);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   // Decode the GIF held in buffer via stbi__load_gif_main; a failed decode is expected to yield NULL.
+   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+   if (result && stbi__vertically_flip_on_load) {
+      // Flip only a real buffer, and size each pixel by the channel count actually returned (req_comp when non-zero).
+      stbi__vertical_flip_slices( result, *x, *y, *z, req_comp ? req_comp : *comp );
+   }
+   return result;
+}
+#endif
 
 #ifndef STBI_NO_LINEAR
 static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
@@ -1071,13 +1314,14 @@ static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int
    unsigned char *data;
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
-      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp);
+      stbi__result_info ri;
+      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
       if (hdr_data)
          stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
       return hdr_data;
    }
    #endif
-   data = stbi__load_flip(s, x, y, comp, req_comp);
+   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
    if (data)
       return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
    return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
@@ -1147,13 +1391,18 @@ STBIDEF int      stbi_is_hdr          (char const *filename)
    return result;
 }
 
-STBIDEF int      stbi_is_hdr_from_file(FILE *f)
+STBIDEF int stbi_is_hdr_from_file(FILE *f)
 {
    #ifndef STBI_NO_HDR
+   long pos = ftell(f);
+   int res;
    stbi__context s;
    stbi__start_file(&s,f);
-   return stbi__hdr_test(&s);
+   res = stbi__hdr_test(&s);
+   fseek(f, pos, SEEK_SET);
+   return res;
    #else
+   STBI_NOTUSED(f);
    return 0;
    #endif
 }
@@ -1166,18 +1415,21 @@ STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
    return stbi__hdr_test(&s);
    #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
    return 0;
    #endif
 }
 
-static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+#ifndef STBI_NO_LINEAR
 static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
 
-#ifndef STBI_NO_LINEAR
 STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
 STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
 #endif
 
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+
 STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
 STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
 
@@ -1286,17 +1538,23 @@ static stbi__uint32 stbi__get32be(stbi__context *s)
    return (z << 16) + stbi__get16be(s);
 }
 
+#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
+// nothing
+#else
 static int stbi__get16le(stbi__context *s)
 {
    int z = stbi__get8(s);
    return z + (stbi__get8(s) << 8);
 }
+#endif
 
+#ifndef STBI_NO_BMP
 static stbi__uint32 stbi__get32le(stbi__context *s)
 {
    stbi__uint32 z = stbi__get16le(s);
    return z + (stbi__get16le(s) << 16);
 }
+#endif
 
 #define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
 
@@ -1325,7 +1583,7 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
    if (req_comp == img_n) return data;
    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-   good = (unsigned char *) stbi__malloc(req_comp * x * y);
+   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
    if (good == NULL) {
       STBI_FREE(data);
       return stbi__errpuc("outofmem", "Out of memory");
@@ -1335,26 +1593,75 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
       unsigned char *src  = data + j * x * img_n   ;
       unsigned char *dest = good + j * x * req_comp;
 
-      #define COMBO(a,b)  ((a)*8+(b))
-      #define CASE(a,b)   case COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
+         default: STBI_ASSERT(0);
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
+{  // Weighted sum of r, g and b reduced to a single 16-bit value.
+   return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8); // weights 77+150+29 = 256, so >>8 normalizes the sum
+}
+
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   stbi__uint16 *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   good = (stbi__uint16 *) stbi__malloc_mad3(req_comp * 2, x, y, 0); // overflow-checked like the 8-bit converter; 2 bytes per sample, req_comp is 1..4
+   if (good == NULL) { // allocation failed, or the requested size was rejected
+      STBI_FREE(data);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      stbi__uint16 *src  = data + j * x * img_n   ;
+      stbi__uint16 *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
       // convert source image with img_n components to one with req_comp components;
       // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (COMBO(img_n, req_comp)) {
-         CASE(1,2) dest[0]=src[0], dest[1]=255; break;
-         CASE(1,3) dest[0]=dest[1]=dest[2]=src[0]; break;
-         CASE(1,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; break;
-         CASE(2,1) dest[0]=src[0]; break;
-         CASE(2,3) dest[0]=dest[1]=dest[2]=src[0]; break;
-         CASE(2,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; break;
-         CASE(3,4) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; break;
-         CASE(3,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
-         CASE(3,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255; break;
-         CASE(4,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
-         CASE(4,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; break;
-         CASE(4,3) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; break;
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
          default: STBI_ASSERT(0);
       }
-      #undef CASE
+      #undef STBI__CASE
    }
 
    STBI_FREE(data);
@@ -1365,7 +1672,9 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 {
    int i,k,n;
-   float *output = (float *) stbi__malloc(x * y * comp * sizeof(float));
+   float *output;
+   if (!data) return NULL;
+   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
    if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
@@ -1373,7 +1682,11 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
       for (k=0; k < n; ++k) {
          output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
       }
-      if (k < comp) output[i*comp + k] = data[i*comp+k]/255.0f;
+   }
+   if (n < comp) {
+      for (i=0; i < x*y; ++i) {
+         output[i*comp + n] = data[i*comp + n]/255.0f;
+      }
    }
    STBI_FREE(data);
    return output;
@@ -1385,7 +1698,9 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 {
    int i,k,n;
-   stbi_uc *output = (stbi_uc *) stbi__malloc(x * y * comp);
+   stbi_uc *output;
+   if (!data) return NULL;
+   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
    if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
@@ -1450,7 +1765,7 @@ typedef struct
    stbi__context *s;
    stbi__huffman huff_dc[4];
    stbi__huffman huff_ac[4];
-   stbi_uc dequant[4][64];
+   stbi__uint16 dequant[4][64];
    stbi__int16 fast_ac[4][1 << FAST_BITS];
 
 // sizes for components, interleaved MCUs
@@ -1486,6 +1801,9 @@ typedef struct
    int            succ_high;
    int            succ_low;
    int            eob_run;
+   int            jfif;
+   int            app14_color_transform; // Adobe APP14 tag
+   int            rgb;
 
    int scan_n, order[4];
    int restart_interval, todo;
@@ -1498,7 +1816,8 @@ typedef struct
 
 static int stbi__build_huffman(stbi__huffman *h, int *count)
 {
-   int i,j,k=0,code;
+   int i,j,k=0;
+   unsigned int code;
    // build size list for each symbol (from JPEG spec)
    for (i=0; i < 16; ++i)
       for (j=0; j < count[i]; ++j)
@@ -1514,7 +1833,7 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
       if (h->size[k] == j) {
          while (h->size[k] == j)
             h->code[k++] = (stbi__uint16) (code++);
-         if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG");
+         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
       }
       // compute largest code + 1 for this size, preshifted as needed later
       h->maxcode[j] = code << (16-j);
@@ -1555,10 +1874,10 @@ static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
             // magnitude code followed by receive_extend code
             int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
             int m = 1 << (magbits - 1);
-            if (k < m) k += (-1 << magbits) + 1;
+            if (k < m) k += (~0U << magbits) + 1;
             // if the result is small enough, we can fit it in fast_ac table
             if (k >= -128 && k <= 127)
-               fast_ac[i] = (stbi__int16) ((k << 8) + (run << 4) + (len + magbits));
+               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
          }
       }
    }
@@ -1567,9 +1886,10 @@ static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
 static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 {
    do {
-      int b = j->nomore ? 0 : stbi__get8(j->s);
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
       if (b == 0xff) {
          int c = stbi__get8(j->s);
+         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
          if (c != 0) {
             j->marker = (unsigned char) c;
             j->nomore = 1;
@@ -1582,7 +1902,7 @@ static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 }
 
 // (1 << n) - 1
-static stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
 
 // decode a jpeg huffman value from the bitstream
 stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
@@ -1635,7 +1955,7 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
 }
 
 // bias[n] = (-1<<n) + 1
-static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
 
 // combined JPEG 'receive' and JPEG 'extend', since baseline
 // always extends everything it receives.
@@ -1678,7 +1998,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
 
 // given a value that's at position X in the zigzag stream,
 // where does it appear in the 8x8 matrix coded as row-major?
-static stbi_uc stbi__jpeg_dezigzag[64+15] =
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
 {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
@@ -1694,7 +2014,7 @@ static stbi_uc stbi__jpeg_dezigzag[64+15] =
 };
 
 // decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi_uc *dequant)
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
 {
    int diff,dc,k;
    int t;
@@ -1904,7 +2224,7 @@ stbi_inline static stbi_uc stbi__clamp(int x)
 }
 
 #define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
-#define stbi__fsh(x)  ((x) << 12)
+#define stbi__fsh(x)  ((x) * 4096)
 
 // derived from jidctint -- DCT_ISLOW
 #define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -1959,7 +2279,7 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
          //    (1|2|3|4|5|6|7)==0          0     seconds
          //    all separate               -0.047 seconds
          //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-         int dcterm = d[0] << 2;
+         int dcterm = d[0]*4;
          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
       } else {
          STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
@@ -2403,7 +2723,7 @@ static stbi_uc stbi__get_marker(stbi__jpeg *j)
    x = stbi__get8(j->s);
    if (x != 0xff) return STBI__MARKER_none;
    while (x == 0xff)
-      x = stbi__get8(j->s);
+      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
    return x;
 }
 
@@ -2418,7 +2738,7 @@ static void stbi__jpeg_reset(stbi__jpeg *j)
    j->code_bits = 0;
    j->code_buffer = 0;
    j->nomore = 0;
-   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0;
+   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
    j->marker = STBI__MARKER_none;
    j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
    j->eob_run = 0;
@@ -2550,7 +2870,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
    }
 }
 
-static void stbi__jpeg_dequantize(short *data, stbi_uc *dequant)
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
 {
    int i;
    for (i=0; i < 64; ++i)
@@ -2592,13 +2912,14 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
          L = stbi__get16be(z->s)-2;
          while (L > 0) {
             int q = stbi__get8(z->s);
-            int p = q >> 4;
+            int p = q >> 4, sixteen = (p != 0);
             int t = q & 15,i;
-            if (p != 0) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
             if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+
             for (i=0; i < 64; ++i)
-               z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s);
-            L -= 65;
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            L -= (sixteen ? 129 : 65);
          }
          return L==0;
 
@@ -2631,12 +2952,50 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
          }
          return L==0;
    }
+
    // check for comment block or APP blocks
    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-      stbi__skip(z->s, stbi__get16be(z->s)-2);
+      L = stbi__get16be(z->s);
+      if (L < 2) {
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      L -= 2;
+
+      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+         static const unsigned char tag[5] = {'J','F','I','F','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 5; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 5;
+         if (ok)
+            z->jfif = 1;
+      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 6; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 6;
+         if (ok) {
+            stbi__get8(z->s); // version
+            stbi__get16be(z->s); // flags0
+            stbi__get16be(z->s); // flags1
+            z->app14_color_transform = stbi__get8(z->s); // color transform
+            L -= 6;
+         }
+      }
+
+      stbi__skip(z->s, L);
       return 1;
    }
-   return 0;
+
+   return stbi__err("unknown marker","Corrupt JPEG");
 }
 
 // after we see SOS
@@ -2679,6 +3038,28 @@ static int stbi__process_scan_header(stbi__jpeg *z)
    return 1;
 }
 
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) // frees the buffers of the first ncomp components; returns why unchanged
+{
+   int i;
+   for (i=0; i < ncomp; ++i) {
+      if (z->img_comp[i].raw_data) {
+         STBI_FREE(z->img_comp[i].raw_data);
+         z->img_comp[i].raw_data = NULL;
+         z->img_comp[i].data = NULL; // cleared together with raw_data so no stale pointer is left behind
+      }
+      if (z->img_comp[i].raw_coeff) {
+         STBI_FREE(z->img_comp[i].raw_coeff);
+         z->img_comp[i].raw_coeff = 0;
+         z->img_comp[i].coeff = 0; // cleared together with raw_coeff
+      }
+      if (z->img_comp[i].linebuf) {
+         STBI_FREE(z->img_comp[i].linebuf);
+         z->img_comp[i].linebuf = NULL;
+      }
+   }
+   return why; // passed through so a caller can free and return its status in one statement
+}
+
 static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 {
    stbi__context *s = z->s;
@@ -2688,7 +3069,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
    s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
    c = stbi__get8(s);
-   if (c != 3 && c != 1) return stbi__err("bad component count","Corrupt JPEG");    // JFIF requires
+   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
    s->img_n = c;
    for (i=0; i < c; ++i) {
       z->img_comp[i].data = NULL;
@@ -2697,11 +3078,12 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 
    if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
 
+   z->rgb = 0;
    for (i=0; i < s->img_n; ++i) {
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
       z->img_comp[i].id = stbi__get8(s);
-      if (z->img_comp[i].id != i+1)   // JFIF requires
-         if (z->img_comp[i].id != i)  // some version of jpegtran outputs non-JFIF-compliant files!
-            return stbi__err("bad component ID","Corrupt JPEG");
+      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+         ++z->rgb;
       q = stbi__get8(s);
       z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
       z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
@@ -2710,7 +3092,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 
    if (scan != STBI__SCAN_load) return 1;
 
-   if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
 
    for (i=0; i < s->img_n; ++i) {
       if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
@@ -2722,6 +3104,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    z->img_v_max = v_max;
    z->img_mcu_w = h_max * 8;
    z->img_mcu_h = v_max * 8;
+   // these sizes can't be more than 17 bits
    z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
 
@@ -2733,28 +3116,27 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
       // the bogus oversized data from using interleaved MCUs and their
       // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
       // discard the extra data until colorspace conversion
+      //
+      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+      // so these muls can't overflow with 32-bit ints (which we require)
       z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
       z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-      z->img_comp[i].raw_data = stbi__malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
-
-      if (z->img_comp[i].raw_data == NULL) {
-         for(--i; i >= 0; --i) {
-            STBI_FREE(z->img_comp[i].raw_data);
-            z->img_comp[i].data = NULL;
-         }
-         return stbi__err("outofmem", "Out of memory");
-      }
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].linebuf = NULL;
+      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+      if (z->img_comp[i].raw_data == NULL)
+         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
       // align blocks for idct using mmx/sse
       z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
-      z->img_comp[i].linebuf = NULL;
       if (z->progressive) {
-         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
-         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
-         z->img_comp[i].raw_coeff = STBI_MALLOC(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
+         // w2, h2 are multiples of 8 (see above)
+         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+         if (z->img_comp[i].raw_coeff == NULL)
+            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
          z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
-      } else {
-         z->img_comp[i].coeff = 0;
-         z->img_comp[i].raw_coeff = 0;
       }
    }
 
@@ -2773,6 +3155,8 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
 {
    int m;
+   z->jfif = 0;
+   z->app14_color_transform = -1; // valid values are 0,1,2
    z->marker = STBI__MARKER_none; // initialize cached marker to empty
    m = stbi__get_marker(z);
    if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
@@ -2814,12 +3198,15 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
                if (x == 255) {
                   j->marker = stbi__get8(j->s);
                   break;
-               } else if (x != 0) {
-                  return stbi__err("junk before marker", "Corrupt JPEG");
                }
             }
             // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
          }
+      } else if (stbi__DNL(m)) {
+         int Ld = stbi__get16be(j->s);
+         stbi__uint32 NL = stbi__get16be(j->s);
+         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
+         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
       } else {
          if (!stbi__process_marker(j, m)) return 0;
       }
@@ -3038,38 +3425,9 @@ static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_
    return out;
 }
 
-#ifdef STBI_JPEG_OLD
-// this is the same YCbCr-to-RGB calculation that stb_image has used
-// historically before the algorithm changes in 1.49
-#define float2fixed(x)  ((int) ((x) * 65536 + 0.5))
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
-{
-   int i;
-   for (i=0; i < count; ++i) {
-      int y_fixed = (y[i] << 16) + 32768; // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed + cr*float2fixed(1.40200f);
-      g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f);
-      b = y_fixed                            + cb*float2fixed(1.77200f);
-      r >>= 16;
-      g >>= 16;
-      b >>= 16;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
-}
-#else
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
-#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
 static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
 {
    int i;
@@ -3078,9 +3436,9 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       int r,g,b;
       int cr = pcr[i] - 128;
       int cb = pcb[i] - 128;
-      r = y_fixed +  cr* float2fixed(1.40200f);
-      g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                               +   cb* float2fixed(1.77200f);
+      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
       r >>= 20;
       g >>= 20;
       b >>= 20;
@@ -3094,7 +3452,6 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       out += step;
    }
 }
-#endif
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
 static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
@@ -3213,9 +3570,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
       int r,g,b;
       int cr = pcr[i] - 128;
       int cb = pcb[i] - 128;
-      r = y_fixed + cr* float2fixed(1.40200f);
-      g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                             +   cb* float2fixed(1.77200f);
+      r = y_fixed + cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
       r >>= 20;
       g >>= 20;
       b >>= 20;
@@ -3241,18 +3598,14 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 #ifdef STBI_SSE2
    if (stbi__sse2_available()) {
       j->idct_block_kernel = stbi__idct_simd;
-      #ifndef STBI_JPEG_OLD
       j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-      #endif
       j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
    }
 #endif
 
 #ifdef STBI_NEON
    j->idct_block_kernel = stbi__idct_simd;
-   #ifndef STBI_JPEG_OLD
    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-   #endif
    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 #endif
 }
@@ -3260,23 +3613,7 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 // clean up the temporary component buffers
 static void stbi__cleanup_jpeg(stbi__jpeg *j)
 {
-   int i;
-   for (i=0; i < j->s->img_n; ++i) {
-      if (j->img_comp[i].raw_data) {
-         STBI_FREE(j->img_comp[i].raw_data);
-         j->img_comp[i].raw_data = NULL;
-         j->img_comp[i].data = NULL;
-      }
-      if (j->img_comp[i].raw_coeff) {
-         STBI_FREE(j->img_comp[i].raw_coeff);
-         j->img_comp[i].raw_coeff = 0;
-         j->img_comp[i].coeff = 0;
-      }
-      if (j->img_comp[i].linebuf) {
-         STBI_FREE(j->img_comp[i].linebuf);
-         j->img_comp[i].linebuf = NULL;
-      }
-   }
+   stbi__free_jpeg_components(j, j->s->img_n, 0);
 }
 
 typedef struct
@@ -3289,9 +3626,16 @@ typedef struct
    int ypos;    // which pre-expansion row we're on
 } stbi__resample;
 
+// fast 0..255 * 0..255 => 0..255 rounded multiplication
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
+{
+   unsigned int t = x*y + 128;
+   return (stbi_uc) ((t + (t >>8)) >> 8);
+}
+
 static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
 {
-   int n, decode_n;
+   int n, decode_n, is_rgb;
    z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 
    // validate req_comp
@@ -3301,9 +3645,11 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
    if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
 
    // determine actual number of components to generate
-   n = req_comp ? req_comp : z->s->img_n;
+   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
 
-   if (z->s->img_n == 3 && n < 3)
+   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
+
+   if (z->s->img_n == 3 && n < 3 && !is_rgb)
       decode_n = 1;
    else
       decode_n = z->s->img_n;
@@ -3340,7 +3686,7 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
       }
 
       // can't error after this so, this is safe
-      output = (stbi_uc *) stbi__malloc(n * z->s->img_x * z->s->img_y + 1);
+      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
       if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
 
       // now go ahead and resample
@@ -3363,7 +3709,39 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
          if (n >= 3) {
             stbi_uc *y = coutput[0];
             if (z->s->img_n == 3) {
-               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               if (is_rgb) {
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else if (z->s->img_n == 4) {
+               if (z->app14_color_transform == 0) { // CMYK
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
+                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
+                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else if (z->app14_color_transform == 2) { // YCCK
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(255 - out[0], m);
+                     out[1] = stbi__blinn_8x8(255 - out[1], m);
+                     out[2] = stbi__blinn_8x8(255 - out[2], m);
+                     out += n;
+                  }
+               } else { // YCbCr + alpha?  Ignore the fourth channel for now
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
             } else
                for (i=0; i < z->s->img_x; ++i) {
                   out[0] = out[1] = out[2] = y[i];
@@ -3371,37 +3749,70 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
                   out += n;
                }
          } else {
-            stbi_uc *y = coutput[0];
-            if (n == 1)
-               for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
-            else
-               for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+            if (is_rgb) {
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i)
+                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+               else {
+                  for (i=0; i < z->s->img_x; ++i, out += 2) {
+                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                     out[1] = 255;
+                  }
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  stbi_uc m = coutput[3][i];
+                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
+                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
+                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
+                  out[0] = stbi__compute_y(r, g, b);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else {
+               stbi_uc *y = coutput[0];
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+               else
+                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
+            }
          }
       }
       stbi__cleanup_jpeg(z);
       *out_x = z->s->img_x;
       *out_y = z->s->img_y;
-      if (comp) *comp  = z->s->img_n; // report original components, not output
+      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
       return output;
    }
 }
 
-static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
-   stbi__jpeg j;
-   j.s = s;
-   stbi__setup_jpeg(&j);
-   return load_jpeg_image(&j, x,y,comp,req_comp);
+   unsigned char* result;
+   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   STBI_NOTUSED(ri);
+   j->s = s;
+   stbi__setup_jpeg(j);
+   result = load_jpeg_image(j, x,y,comp,req_comp);
+   STBI_FREE(j);
+   return result;
 }
 
 static int stbi__jpeg_test(stbi__context *s)
 {
    int r;
-   stbi__jpeg j;
-   j.s = s;
-   stbi__setup_jpeg(&j);
-   r = stbi__decode_jpeg_header(&j, STBI__SCAN_type);
+   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   j->s = s;
+   stbi__setup_jpeg(j);
+   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
    stbi__rewind(s);
+   STBI_FREE(j);
    return r;
 }
 
@@ -3413,15 +3824,18 @@ static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
    }
    if (x) *x = j->s->img_x;
    if (y) *y = j->s->img_y;
-   if (comp) *comp = j->s->img_n;
+   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
    return 1;
 }
 
 static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   stbi__jpeg j;
-   j.s = s;
-   return stbi__jpeg_info_raw(&j, x, y, comp);
+   int result;
+   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   j->s = s;
+   result = stbi__jpeg_info_raw(j, x, y, comp);
+   STBI_FREE(j);
+   return result;
 }
 #endif
 
@@ -3467,7 +3881,7 @@ stbi_inline static int stbi__bit_reverse(int v, int bits)
    return stbi__bitreverse16(v) >> (16-bits);
 }
 
-static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
 {
    int i,k=0;
    int code, next_code[16], sizes[17];
@@ -3502,10 +3916,10 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
          z->size [c] = (stbi_uc     ) s;
          z->value[c] = (stbi__uint16) i;
          if (s <= STBI__ZFAST_BITS) {
-            int k = stbi__bit_reverse(next_code[s],s);
-            while (k < (1 << STBI__ZFAST_BITS)) {
-               z->fast[k] = fastv;
-               k += (1 << s);
+            int j = stbi__bit_reverse(next_code[s],s);
+            while (j < (1 << STBI__ZFAST_BITS)) {
+               z->fast[j] = fastv;
+               j += (1 << s);
             }
          }
          ++next_code[s];
@@ -3594,14 +4008,15 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
 {
    char *q;
-   int cur, limit;
+   int cur, limit, old_limit;
    z->zout = zout;
    if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
    cur   = (int) (z->zout     - z->zout_start);
-   limit = (int) (z->zout_end - z->zout_start);
+   limit = old_limit = (int) (z->zout_end - z->zout_start);
    while (cur + n > limit)
       limit *= 2;
-   q = (char *) STBI_REALLOC(z->zout_start, limit);
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
    if (q == NULL) return stbi__err("outofmem", "Out of memory");
    z->zout_start = q;
    z->zout       = q + cur;
@@ -3609,18 +4024,18 @@ static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room
    return 1;
 }
 
-static int stbi__zlength_base[31] = {
+static const int stbi__zlength_base[31] = {
    3,4,5,6,7,8,9,10,11,13,
    15,17,19,23,27,31,35,43,51,59,
    67,83,99,115,131,163,195,227,258,0,0 };
 
-static int stbi__zlength_extra[31]=
+static const int stbi__zlength_extra[31]=
 { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
 
-static int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
 
-static int stbi__zdist_extra[32] =
+static const int stbi__zdist_extra[32] =
 { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
 
 static int stbi__parse_huffman_block(stbi__zbuf *a)
@@ -3667,7 +4082,7 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
 
 static int stbi__compute_huffman_codes(stbi__zbuf *a)
 {
-   static stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
    stbi__zhuffman z_codelength;
    stbi_uc lencodes[286+32+137];//padding for maximum single op
    stbi_uc codelength_sizes[19];
@@ -3676,6 +4091,7 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
    int hlit  = stbi__zreceive(a,5) + 257;
    int hdist = stbi__zreceive(a,5) + 1;
    int hclen = stbi__zreceive(a,4) + 4;
+   int ntot  = hlit + hdist;
 
    memset(codelength_sizes, 0, sizeof(codelength_sizes));
    for (i=0; i < hclen; ++i) {
@@ -3685,33 +4101,35 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
    if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
 
    n = 0;
-   while (n < hlit + hdist) {
+   while (n < ntot) {
       int c = stbi__zhuffman_decode(a, &z_codelength);
       if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
       if (c < 16)
          lencodes[n++] = (stbi_uc) c;
-      else if (c == 16) {
-         c = stbi__zreceive(a,2)+3;
-         memset(lencodes+n, lencodes[n-1], c);
-         n += c;
-      } else if (c == 17) {
-         c = stbi__zreceive(a,3)+3;
-         memset(lencodes+n, 0, c);
-         n += c;
-      } else {
-         STBI_ASSERT(c == 18);
-         c = stbi__zreceive(a,7)+11;
-         memset(lencodes+n, 0, c);
+      else {
+         stbi_uc fill = 0;
+         if (c == 16) {
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17)
+            c = stbi__zreceive(a,3)+3;
+         else {
+            STBI_ASSERT(c == 18);
+            c = stbi__zreceive(a,7)+11;
+         }
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
+         memset(lencodes+n, fill, c);
          n += c;
       }
    }
-   if (n != hlit+hdist) return stbi__err("bad codelengths","Corrupt PNG");
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
    if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
    if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
    return 1;
 }
 
-static int stbi__parse_uncomperssed_block(stbi__zbuf *a)
+static int stbi__parse_uncompressed_block(stbi__zbuf *a)
 {
    stbi_uc header[4];
    int len,nlen,k;
@@ -3753,9 +4171,24 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
    return 1;
 }
 
-// @TODO: should statically initialize these for optimal thread safety
-static stbi_uc stbi__zdefault_length[288], stbi__zdefault_distance[32];
-static void stbi__init_zdefaults(void)
+static const stbi_uc stbi__zdefault_length[288] =
+{
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+};
+static const stbi_uc stbi__zdefault_distance[32] =
+{
+   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
+/*
+Init algorithm:
 {
    int i;   // use <= to match clearly with spec
    for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
@@ -3765,6 +4198,7 @@ static void stbi__init_zdefaults(void)
 
    for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
 }
+*/
 
 static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
 {
@@ -3777,13 +4211,12 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
       final = stbi__zreceive(a,1);
       type = stbi__zreceive(a,2);
       if (type == 0) {
-         if (!stbi__parse_uncomperssed_block(a)) return 0;
+         if (!stbi__parse_uncompressed_block(a)) return 0;
       } else if (type == 3) {
          return 0;
       } else {
          if (type == 1) {
             // use fixed code lengths
-            if (!stbi__zdefault_distance[31]) stbi__init_zdefaults();
             if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
             if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
          } else {
@@ -3908,7 +4341,7 @@ static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
 
 static int stbi__check_png_header(stbi__context *s)
 {
-   static stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
    int i;
    for (i=0; i < 8; ++i)
       if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
@@ -3919,6 +4352,7 @@ typedef struct
 {
    stbi__context *s;
    stbi_uc *idata, *expanded, *out;
+   int depth;
 } stbi__png;
 
 
@@ -3953,35 +4387,40 @@ static int stbi__paeth(int a, int b, int c)
    return c;
 }
 
-static stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
 
 // create the png data from post-deflated data
 static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
 {
+   int bytes = (depth == 16? 2 : 1);
    stbi__context *s = a->s;
-   stbi__uint32 i,j,stride = x*out_n;
+   stbi__uint32 i,j,stride = x*out_n*bytes;
    stbi__uint32 img_len, img_width_bytes;
    int k;
    int img_n = s->img_n; // copy it into a local for later
 
+   int output_bytes = out_n*bytes;
+   int filter_bytes = img_n*bytes;
+   int width = x;
+
    STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
-   a->out = (stbi_uc *) stbi__malloc(x * y * out_n); // extra bytes to write off the end into
+   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
    if (!a->out) return stbi__err("outofmem", "Out of memory");
 
+   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
    img_width_bytes = (((img_n * x * depth) + 7) >> 3);
    img_len = (img_width_bytes + 1) * y;
-   if (s->img_x == x && s->img_y == y) {
-      if (raw_len != img_len) return stbi__err("not enough pixels","Corrupt PNG");
-   } else { // interlaced:
-      if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
-   }
+
+   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+   // so just check for raw_len < img_len always.
+   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
 
    for (j=0; j < y; ++j) {
       stbi_uc *cur = a->out + stride*j;
-      stbi_uc *prior = cur - stride;
+      stbi_uc *prior;
       int filter = *raw++;
-      int filter_bytes = img_n;
-      int width = x;
+
       if (filter > 4)
          return stbi__err("invalid filter","Corrupt PNG");
 
@@ -3991,6 +4430,7 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
          filter_bytes = 1;
          width = img_width_bytes;
       }
+      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
 
       // if first row, use special filter that doesn't sample previous row
       if (j == 0) filter = first_row_filter[filter];
@@ -4014,6 +4454,14 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
          raw += img_n;
          cur += out_n;
          prior += out_n;
+      } else if (depth == 16) {
+         if (img_n != out_n) {
+            cur[filter_bytes]   = 255; // first pixel top byte
+            cur[filter_bytes+1] = 255; // first pixel bottom byte
+         }
+         raw += filter_bytes;
+         cur += output_bytes;
+         prior += output_bytes;
       } else {
          raw += 1;
          cur += 1;
@@ -4022,38 +4470,47 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
 
       // this is a little gross, so that we don't switch per-pixel or per-component
       if (depth < 8 || img_n == out_n) {
-         int nk = (width - 1)*img_n;
-         #define CASE(f) \
+         int nk = (width - 1)*filter_bytes;
+         #define STBI__CASE(f) \
              case f:     \
                 for (k=0; k < nk; ++k)
          switch (filter) {
             // "none" filter turns into a memcpy here; make that explicit.
             case STBI__F_none:         memcpy(cur, raw, nk); break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); break;
-            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
          }
-         #undef CASE
+         #undef STBI__CASE
          raw += nk;
       } else {
          STBI_ASSERT(img_n+1 == out_n);
-         #define CASE(f) \
+         #define STBI__CASE(f) \
              case f:     \
-                for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n) \
-                   for (k=0; k < img_n; ++k)
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
          switch (filter) {
-            CASE(STBI__F_none)         cur[k] = raw[k]; break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-out_n]); break;
-            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-out_n])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-out_n],prior[k],prior[k-out_n])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-out_n] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-out_n],0,0)); break;
+            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
+         }
+         #undef STBI__CASE
+
+         // the loop above sets the high byte of the pixels' alpha, but for
+         // 16 bit png files we also need the low byte set. we'll do that here.
+         if (depth == 16) {
+            cur = a->out + stride*j; // start at the beginning of the row again
+            for (i=0; i < x; ++i,cur+=output_bytes) {
+               cur[filter_bytes+1] = 255;
+            }
          }
-         #undef CASE
       }
    }
 
@@ -4110,25 +4567,36 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
             if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
          }
          if (img_n != out_n) {
+            int q;
             // insert alpha = 255
-            stbi_uc *cur = a->out + stride*j;
-            int i;
+            cur = a->out + stride*j;
             if (img_n == 1) {
-               for (i=x-1; i >= 0; --i) {
-                  cur[i*2+1] = 255;
-                  cur[i*2+0] = cur[i];
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*2+1] = 255;
+                  cur[q*2+0] = cur[q];
                }
             } else {
                STBI_ASSERT(img_n == 3);
-               for (i=x-1; i >= 0; --i) {
-                  cur[i*4+3] = 255;
-                  cur[i*4+2] = cur[i*3+2];
-                  cur[i*4+1] = cur[i*3+1];
-                  cur[i*4+0] = cur[i*3+0];
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*4+3] = 255;
+                  cur[q*4+2] = cur[q*3+2];
+                  cur[q*4+1] = cur[q*3+1];
+                  cur[q*4+0] = cur[q*3+0];
                }
             }
          }
       }
+   } else if (depth == 16) {
+      // force the image data from big-endian to platform-native.
+      // this is done in a separate pass due to the decoding relying
+      // on the data being untouched, but could probably be done
+      // per-line during decode if care is taken.
+      stbi_uc *cur = a->out;
+      stbi__uint16 *cur16 = (stbi__uint16*)cur;
+
+      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
+         *cur16 = (cur[0] << 8) | cur[1];
+      }
    }
 
    return 1;
@@ -4136,13 +4604,15 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
 
 static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
 {
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes;
    stbi_uc *final;
    int p;
    if (!interlaced)
       return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 
    // de-interlacing
-   final = (stbi_uc *) stbi__malloc(a->s->img_x * a->s->img_y * out_n);
+   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
    for (p=0; p < 7; ++p) {
       int xorig[] = { 0,4,0,2,0,1,0 };
       int yorig[] = { 0,0,4,0,2,0,1 };
@@ -4162,8 +4632,8 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3
             for (i=0; i < x; ++i) {
                int out_y = j*yspc[p]+yorig[p];
                int out_x = i*xspc[p]+xorig[p];
-               memcpy(final + out_y*a->s->img_x*out_n + out_x*out_n,
-                      a->out + (j*x+i)*out_n, out_n);
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
             }
          }
          STBI_FREE(a->out);
@@ -4201,12 +4671,37 @@ static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
    return 1;
 }
 
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
+   stbi__uint16 *p = (stbi__uint16*) z->out;
+
+   // compute color-based transparency, assuming we've
+   // already got 65535 as the alpha value in the output
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      for (i = 0; i < pixel_count; ++i) {
+         p[1] = (p[0] == tc[0] ? 0 : 65535);
+         p += 2;
+      }
+   } else {
+      for (i = 0; i < pixel_count; ++i) {
+         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            p[3] = 0;
+         p += 4;
+      }
+   }
+   return 1;
+}
+
 static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
 {
    stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
    stbi_uc *p, *temp_out, *orig = a->out;
 
-   p = (stbi_uc *) stbi__malloc(pixel_count * pal_img_n);
+   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
    if (p == NULL) return stbi__err("outofmem", "Out of memory");
 
    // between here and free(out) below, exitting would leak
@@ -4272,9 +4767,10 @@ static void stbi__de_iphone(stbi__png *z)
             stbi_uc a = p[3];
             stbi_uc t = p[0];
             if (a) {
-               p[0] = p[2] * 255 / a;
-               p[1] = p[1] * 255 / a;
-               p[2] =  t   * 255 / a;
+               stbi_uc half = a / 2;
+               p[0] = (p[2] * 255 + half) / a;
+               p[1] = (p[1] * 255 + half) / a;
+               p[2] = ( t   * 255 + half) / a;
             } else {
                p[0] = p[2];
                p[2] = t;
@@ -4293,14 +4789,15 @@ static void stbi__de_iphone(stbi__png *z)
    }
 }
 
-#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
+#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
 
 static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
 {
    stbi_uc palette[1024], pal_img_n=0;
-   stbi_uc has_trans=0, tc[3];
+   stbi_uc has_trans=0, tc[3]={0};
+   stbi__uint16 tc16[3];
    stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
-   int first=1,k,interlace=0, color=0, depth=0, is_iphone=0;
+   int first=1,k,interlace=0, color=0, is_iphone=0;
    stbi__context *s = z->s;
 
    z->expanded = NULL;
@@ -4325,8 +4822,9 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
             s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
             s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
-            depth = stbi__get8(s);  if (depth != 1 && depth != 2 && depth != 4 && depth != 8)  return stbi__err("1/2/4/8-bit only","PNG not supported: 1/2/4/8-bit only");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
             color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
             if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
             comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
             filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
@@ -4374,8 +4872,11 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
                if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
                has_trans = 1;
-               for (k=0; k < s->img_n; ++k)
-                  tc[k] = (stbi_uc) (stbi__get16be(s) & 255) * stbi__depth_scale_table[depth]; // non 8-bit images will be larger
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
             }
             break;
          }
@@ -4386,11 +4887,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
             if ((int)(ioff + c.length) < (int)ioff) return 0;
             if (ioff + c.length > idata_limit) {
+               stbi__uint32 idata_limit_old = idata_limit;
                stbi_uc *p;
                if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
                while (ioff + c.length > idata_limit)
                   idata_limit *= 2;
-               p = (stbi_uc *) STBI_REALLOC(z->idata, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
                z->idata = p;
             }
             if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
@@ -4404,7 +4907,7 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (scan != STBI__SCAN_load) return 1;
             if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
             // initial guess for decoded data size to avoid unnecessary reallocs
-            bpl = (s->img_x * depth + 7) / 8; // bytes per line, per component
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
             raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
             z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
             if (z->expanded == NULL) return 0; // zlib should set error
@@ -4413,9 +4916,14 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                s->img_out_n = s->img_n+1;
             else
                s->img_out_n = s->img_n;
-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, depth, color, interlace)) return 0;
-            if (has_trans)
-               if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+            if (has_trans) {
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
+            }
             if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
                stbi__de_iphone(z);
             if (pal_img_n) {
@@ -4425,6 +4933,9 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (req_comp >= 3) s->img_out_n = req_comp;
                if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
                   return 0;
+            } else if (has_trans) {
+               // non-paletted image with tRNS -> source image has (constant) alpha
+               ++s->img_n;
             }
             STBI_FREE(z->expanded); z->expanded = NULL;
             return 1;
@@ -4452,21 +4963,28 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
    }
 }
 
-static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp)
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
 {
-   unsigned char *result=NULL;
+   void *result=NULL;
    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      if (p->depth < 8)
+         ri->bits_per_channel = 8;
+      else
+         ri->bits_per_channel = p->depth;
       result = p->out;
       p->out = NULL;
       if (req_comp && req_comp != p->s->img_out_n) {
-         result = stbi__convert_format(result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         if (ri->bits_per_channel == 8)
+            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         else
+            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
          p->s->img_out_n = req_comp;
          if (result == NULL) return result;
       }
       *x = p->s->img_x;
       *y = p->s->img_y;
-      if (n) *n = p->s->img_out_n;
+      if (n) *n = p->s->img_n;
    }
    STBI_FREE(p->out);      p->out      = NULL;
    STBI_FREE(p->expanded); p->expanded = NULL;
@@ -4475,11 +4993,11 @@ static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req
    return result;
 }
 
-static unsigned char *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi__png p;
    p.s = s;
-   return stbi__do_png(&p, x,y,comp,req_comp);
+   return stbi__do_png(&p, x,y,comp,req_comp, ri);
 }
 
 static int stbi__png_test(stbi__context *s)
@@ -4508,6 +5026,19 @@ static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
    p.s = s;
    return stbi__png_info_raw(&p, x, y, comp);
 }
+
+static int stbi__png_is16(stbi__context *s) // returns 1 when the PNG read from s reports a depth of 16, otherwise 0
+{
+   stbi__png p;
+   p.s = s;
+   if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) // x/y/comp outputs are not requested; only p.depth is of interest
+	   return 0;
+   if (p.depth != 16) {
+      stbi__rewind(p.s); // not 16-bit: rewind the stream before reporting 0
+      return 0;
+   }
+   return 1; // NOTE(review): no rewind on this path in this function - confirm stbi__png_info_raw leaves the stream where callers expect
+}
 #endif
 
 // Microsoft/Windows BMP image
@@ -4541,11 +5072,11 @@ static int stbi__high_bit(unsigned int z)
 {
    int n=0;
    if (z == 0) return -1;
-   if (z >= 0x10000) n += 16, z >>= 16;
-   if (z >= 0x00100) n +=  8, z >>=  8;
-   if (z >= 0x00010) n +=  4, z >>=  4;
-   if (z >= 0x00004) n +=  2, z >>=  2;
-   if (z >= 0x00002) n +=  1, z >>=  1;
+   if (z >= 0x10000) { n += 16; z >>= 16; }
+   if (z >= 0x00100) { n +=  8; z >>=  8; }
+   if (z >= 0x00010) { n +=  4; z >>=  4; }
+   if (z >= 0x00004) { n +=  2; z >>=  2; }
+   if (z >= 0x00002) { n +=  1; z >>=  1; }
    return n;
 }
 
@@ -4559,36 +5090,46 @@ static int stbi__bitcount(unsigned int a)
    return a & 0xff;
 }
 
-static int stbi__shiftsigned(int v, int shift, int bits)
-{
-   int result;
-   int z=0;
-
-   if (shift < 0) v <<= -shift;
-   else v >>= shift;
-   result = v;
-
-   z = bits;
-   while (z < 8) {
-      result += v >> z;
-      z += bits;
-   }
-   return result;
+// extract an arbitrarily-aligned N-bit value (N=bits) from v, make it
+// 8 bits long, and fractionally extend it to the full 0..255 range by
+// replicating its bit pattern (multiply + shift using the tables below).
+static int stbi__shiftsigned(unsigned int v, int shift, int bits)
+{
+   static const unsigned int mul_table[9] = {
+      0,
+      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
+      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
+   };
+   static const unsigned int shift_table[9] = {
+      0, 0,0,1,0,2,4,6,0,
+   };
+   STBI_ASSERT(bits >= 0 && bits <= 8); // check before bits is used as a shift count and table index
+   if (shift < 0)
+      v <<= -shift;
+   else
+      v >>= shift;
+   STBI_ASSERT(v < 256); // v is unsigned, so only the upper bound can be violated
+   v >>= (8-bits); // keep only the top `bits` bits of the aligned byte
+   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
+}
 
-static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+typedef struct
 {
-   stbi_uc *out;
-   unsigned int mr=0,mg=0,mb=0,ma=0, fake_a=0;
-   stbi_uc pal[256][4];
-   int psize=0,i,j,compress=0,width;
-   int bpp, flip_vertically, pad, target, offset, hsz;
+   int bpp, offset, hsz;
+   unsigned int mr,mg,mb,ma, all_a;
+} stbi__bmp_data;
+
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
+{
+   int hsz;
    if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
    stbi__get32le(s); // discard filesize
    stbi__get16le(s); // discard reserved
    stbi__get16le(s); // discard reserved
-   offset = stbi__get32le(s);
-   hsz = stbi__get32le(s);
+   info->offset = stbi__get32le(s);
+   info->hsz = hsz = stbi__get32le(s);
+   info->mr = info->mg = info->mb = info->ma = 0;
+
    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
    if (hsz == 12) {
       s->img_x = stbi__get16le(s);
@@ -4598,15 +5139,9 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
       s->img_y = stbi__get32le(s);
    }
    if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
-   bpp = stbi__get16le(s);
-   if (bpp == 1) return stbi__errpuc("monochrome", "BMP type not supported: 1-bit");
-   flip_vertically = ((int) s->img_y) > 0;
-   s->img_y = abs((int) s->img_y);
-   if (hsz == 12) {
-      if (bpp < 24)
-         psize = (offset - 14 - 24) / 3;
-   } else {
-      compress = stbi__get32le(s);
+   info->bpp = stbi__get16le(s);
+   if (hsz != 12) {
+      int compress = stbi__get32le(s);
       if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
       stbi__get32le(s); // discard sizeof
       stbi__get32le(s); // discard hres
@@ -4620,27 +5155,25 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
             stbi__get32le(s);
             stbi__get32le(s);
          }
-         if (bpp == 16 || bpp == 32) {
-            mr = mg = mb = 0;
+         if (info->bpp == 16 || info->bpp == 32) {
             if (compress == 0) {
-               if (bpp == 32) {
-                  mr = 0xffu << 16;
-                  mg = 0xffu <<  8;
-                  mb = 0xffu <<  0;
-                  ma = 0xffu << 24;
-                  fake_a = 1; // @TODO: check for cases like alpha value is all 0 and switch it to 255
-                  STBI_NOTUSED(fake_a);
+               if (info->bpp == 32) {
+                  info->mr = 0xffu << 16;
+                  info->mg = 0xffu <<  8;
+                  info->mb = 0xffu <<  0;
+                  info->ma = 0xffu << 24;
+                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
                } else {
-                  mr = 31u << 10;
-                  mg = 31u <<  5;
-                  mb = 31u <<  0;
+                  info->mr = 31u << 10;
+                  info->mg = 31u <<  5;
+                  info->mb = 31u <<  0;
                }
             } else if (compress == 3) {
-               mr = stbi__get32le(s);
-               mg = stbi__get32le(s);
-               mb = stbi__get32le(s);
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
                // not documented, but generated by photoshop and handled by mspaint
-               if (mr == mg && mg == mb) {
+               if (info->mr == info->mg && info->mg == info->mb) {
                   // ?!?!?
                   return stbi__errpuc("bad BMP", "bad BMP");
                }
@@ -4648,11 +5181,13 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
                return stbi__errpuc("bad BMP", "bad BMP");
          }
       } else {
-         STBI_ASSERT(hsz == 108 || hsz == 124);
-         mr = stbi__get32le(s);
-         mg = stbi__get32le(s);
-         mb = stbi__get32le(s);
-         ma = stbi__get32le(s);
+         int i;
+         if (hsz != 108 && hsz != 124)
+            return stbi__errpuc("bad BMP", "bad BMP");
+         info->mr = stbi__get32le(s);
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
          stbi__get32le(s); // discard color space
          for (i=0; i < 12; ++i)
             stbi__get32le(s); // discard color space parameters
@@ -4663,63 +5198,121 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
             stbi__get32le(s); // discard reserved
          }
       }
-      if (bpp < 16)
-         psize = (offset - 14 - hsz) >> 2;
    }
+   return (void *) 1;
+}
+
+
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4];
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+   STBI_NOTUSED(ri);
+
+   info.all_a = 255;
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
+
+   flip_vertically = ((int) s->img_y) > 0;
+   s->img_y = abs((int) s->img_y);
+
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
+
+   if (info.hsz == 12) {
+      if (info.bpp < 24)
+         psize = (info.offset - 14 - 24) / 3;
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - 14 - info.hsz) >> 2;
+   }
+
    s->img_n = ma ? 4 : 3;
    if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
       target = req_comp;
    else
       target = s->img_n; // if they want monochrome, we'll post-convert
-   out = (stbi_uc *) stbi__malloc(target * s->img_x * s->img_y);
+
+   // sanity-check size
+   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   if (bpp < 16) {
+   if (info.bpp < 16) {
       int z=0;
       if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
       for (i=0; i < psize; ++i) {
          pal[i][2] = stbi__get8(s);
          pal[i][1] = stbi__get8(s);
          pal[i][0] = stbi__get8(s);
-         if (hsz != 12) stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);
          pal[i][3] = 255;
       }
-      stbi__skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4));
-      if (bpp == 4) width = (s->img_x + 1) >> 1;
-      else if (bpp == 8) width = s->img_x;
+      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
+      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      else if (info.bpp == 8) width = s->img_x;
       else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
       pad = (-width)&3;
-      for (j=0; j < (int) s->img_y; ++j) {
-         for (i=0; i < (int) s->img_x; i += 2) {
-            int v=stbi__get8(s),v2=0;
-            if (bpp == 4) {
-               v2 = v & 15;
-               v >>= 4;
+      if (info.bpp == 1) {
+         for (j=0; j < (int) s->img_y; ++j) {
+            int bit_offset = 7, v = stbi__get8(s);
+            for (i=0; i < (int) s->img_x; ++i) {
+               int color = (v>>bit_offset)&0x1;
+               out[z++] = pal[color][0];
+               out[z++] = pal[color][1];
+               out[z++] = pal[color][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               if((--bit_offset) < 0) {
+                  bit_offset = 7;
+                  v = stbi__get8(s);
+               }
             }
-            out[z++] = pal[v][0];
-            out[z++] = pal[v][1];
-            out[z++] = pal[v][2];
-            if (target == 4) out[z++] = 255;
-            if (i+1 == (int) s->img_x) break;
-            v = (bpp == 8) ? stbi__get8(s) : v2;
-            out[z++] = pal[v][0];
-            out[z++] = pal[v][1];
-            out[z++] = pal[v][2];
-            if (target == 4) out[z++] = 255;
+            stbi__skip(s, pad);
+         }
+      } else {
+         for (j=0; j < (int) s->img_y; ++j) {
+            for (i=0; i < (int) s->img_x; i += 2) {
+               int v=stbi__get8(s),v2=0;
+               if (info.bpp == 4) {
+                  v2 = v & 15;
+                  v >>= 4;
+               }
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               v = (info.bpp == 8) ? stbi__get8(s) : v2;
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+            }
+            stbi__skip(s, pad);
          }
-         stbi__skip(s, pad);
       }
    } else {
       int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
       int z = 0;
       int easy=0;
-      stbi__skip(s, offset - 14 - hsz);
-      if (bpp == 24) width = 3 * s->img_x;
-      else if (bpp == 16) width = 2*s->img_x;
+      stbi__skip(s, info.offset - 14 - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
       else /* bpp = 32 and pad = 0 */ width=0;
       pad = (-width) & 3;
-      if (bpp == 24) {
+      if (info.bpp == 24) {
          easy = 1;
-      } else if (bpp == 32) {
+      } else if (info.bpp == 32) {
          if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
             easy = 2;
       }
@@ -4740,29 +5333,38 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
                out[z+0] = stbi__get8(s);
                z += 3;
                a = (easy == 2 ? stbi__get8(s) : 255);
+               all_a |= a;
                if (target == 4) out[z++] = a;
             }
          } else {
+            int bpp = info.bpp;
             for (i=0; i < (int) s->img_x; ++i) {
                stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
-               int a;
+               unsigned int a;
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
                a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
                if (target == 4) out[z++] = STBI__BYTECAST(a);
             }
          }
          stbi__skip(s, pad);
       }
    }
+
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
+         out[i] = 255;
+
    if (flip_vertically) {
       stbi_uc t;
       for (j=0; j < (int) s->img_y>>1; ++j) {
          stbi_uc *p1 = out +      j     *s->img_x*target;
          stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
          for (i=0; i < (int) s->img_x*target; ++i) {
-            t = p1[i], p1[i] = p2[i], p2[i] = t;
+            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
          }
       }
    }
@@ -4782,20 +5384,55 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
 // Targa Truevision - TGA
 // by Jonathan Dummer
 #ifndef STBI_NO_TGA
+// maps a TGA pixel size in bits to a component count (STBI_grey, STBI_grey_alpha, STBI_rgb, or bits/8 for 24/32); returns 0 for any other size
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   // only RGB or RGBA (incl. 16bit) or grey allowed; is_rgb16 is optional (may be NULL) and defaults to 0
+   if (is_rgb16) *is_rgb16 = 0;
+   switch(bits_per_pixel) {
+      case 8:  return STBI_grey;
+      case 16: if(is_grey) return STBI_grey_alpha; // 16-bit grey is reported as grey + alpha
+               // fallthrough: 16-bit non-grey is handled exactly like 15-bit
+      case 15: if(is_rgb16) *is_rgb16 = 1; // tell the caller the pixels are packed 15/16-bit RGB
+               return STBI_rgb;
+      case 24: // fallthrough
+      case 32: return bits_per_pixel/8; // 3 for 24-bit, 4 for 32-bit
+      default: return 0; // unsupported pixel size
+   }
+}
+
 static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
 {
-    int tga_w, tga_h, tga_comp;
-    int sz;
+    int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
+    int sz, tga_colormap_type;
     stbi__get8(s);                   // discard Offset
-    sz = stbi__get8(s);              // color type
-    if( sz > 1 ) {
+    tga_colormap_type = stbi__get8(s); // colormap type
+    if( tga_colormap_type > 1 ) {
         stbi__rewind(s);
         return 0;      // only RGB or indexed allowed
     }
-    sz = stbi__get8(s);              // image type
-    // only RGB or grey allowed, +/- RLE
-    if ((sz != 1) && (sz != 2) && (sz != 3) && (sz != 9) && (sz != 10) && (sz != 11)) return 0;
-    stbi__skip(s,9);
+    tga_image_type = stbi__get8(s); // image type
+    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
+        if (tga_image_type != 1 && tga_image_type != 9) {
+            stbi__rewind(s);
+            return 0;
+        }
+        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s);    //   check bits per palette color entry
+        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
+            stbi__rewind(s);
+            return 0;
+        }
+        stbi__skip(s,4);       // skip image x and y origin
+        tga_colormap_bpp = sz;
+    } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
+            stbi__rewind(s);
+            return 0; // only RGB or grey allowed, +/- RLE
+        }
+        stbi__skip(s,9); // skip colormap specification and image x/y origin
+        tga_colormap_bpp = 0;
+    }
     tga_w = stbi__get16le(s);
     if( tga_w < 1 ) {
         stbi__rewind(s);
@@ -4806,45 +5443,81 @@ static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
         stbi__rewind(s);
         return 0;   // test height
     }
-    sz = stbi__get8(s);               // bits per pixel
-    // only RGB or RGBA or grey allowed
-    if ((sz != 8) && (sz != 16) && (sz != 24) && (sz != 32)) {
-        stbi__rewind(s);
-        return 0;
+    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
+    stbi__get8(s); // ignore alpha bits
+    if (tga_colormap_bpp != 0) {
+        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
+            // when using a colormap, tga_bits_per_pixel is the size of the indexes
+            // I don't think anything but 8 or 16bit indexes makes sense
+            stbi__rewind(s);
+            return 0;
+        }
+        tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
+    } else {
+        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
+    }
+    if(!tga_comp) {
+      stbi__rewind(s);
+      return 0;
     }
-    tga_comp = sz;
     if (x) *x = tga_w;
     if (y) *y = tga_h;
-    if (comp) *comp = tga_comp / 8;
+    if (comp) *comp = tga_comp;
     return 1;                   // seems to have passed everything
 }
 
 static int stbi__tga_test(stbi__context *s)
 {
-   int res;
-   int sz;
+   int res = 0;
+   int sz, tga_color_type;
    stbi__get8(s);      //   discard Offset
-   sz = stbi__get8(s);   //   color type
-   if ( sz > 1 ) return 0;   //   only RGB or indexed allowed
+   tga_color_type = stbi__get8(s);   //   color type
+   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
    sz = stbi__get8(s);   //   image type
-   if ( (sz != 1) && (sz != 2) && (sz != 3) && (sz != 9) && (sz != 10) && (sz != 11) ) return 0;   //   only RGB or grey allowed, +/- RLE
-   stbi__get16be(s);      //   discard palette start
-   stbi__get16be(s);      //   discard palette length
-   stbi__get8(s);         //   discard bits per palette color entry
-   stbi__get16be(s);      //   discard x origin
-   stbi__get16be(s);      //   discard y origin
-   if ( stbi__get16be(s) < 1 ) return 0;      //   test width
-   if ( stbi__get16be(s) < 1 ) return 0;      //   test height
+   if ( tga_color_type == 1 ) { // colormapped (paletted) image
+      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
+      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+      sz = stbi__get8(s);    //   check bits per palette color entry
+      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+      stbi__skip(s,4);       // skip image x and y origin
+   } else { // "normal" image w/o colormap
+      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
+      stbi__skip(s,9); // skip colormap specification and image x/y origin
+   }
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
    sz = stbi__get8(s);   //   bits per pixel
-   if ( (sz != 8) && (sz != 16) && (sz != 24) && (sz != 32) )
-      res = 0;
-   else
-      res = 1;
+   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
+   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+
+   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
+
+errorEnd:
    stbi__rewind(s);
    return res;
 }
 
-static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+// read one 16-bit value from s (via stbi__get16le) and write it to out[0..2] as 8-bit R, G, B; exactly three bytes of out are written
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
+   stbi__uint16 fiveBitMask = 31;
+   // we have 3 channels with 5 bits each: r in bits 10-14, g in bits 5-9, b in bits 0-4
+   int r = (px >> 10) & fiveBitMask;
+   int g = (px >> 5) & fiveBitMask;
+   int b = px & fiveBitMask;
+   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
+   out[0] = (stbi_uc)((r * 255)/31); // rescale 0..31 to 0..255
+   out[1] = (stbi_uc)((g * 255)/31);
+   out[2] = (stbi_uc)((b * 255)/31);
+
+   // some people claim that the most significant bit might be used for alpha
+   // (possibly if an alpha-bit is set in the "image descriptor byte")
+   // but that only made 16bit test images completely translucent..
+   // so let's treat all 15 and 16bit TGAs as RGB with no alpha (bit 15 of px is ignored).
+}
+
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    //   read in the TGA header stuff
    int tga_offset = stbi__get8(s);
@@ -4859,16 +5532,18 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
    int tga_width = stbi__get16le(s);
    int tga_height = stbi__get16le(s);
    int tga_bits_per_pixel = stbi__get8(s);
-   int tga_comp = tga_bits_per_pixel / 8;
+   int tga_comp, tga_rgb16=0;
    int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
    //   image data
    unsigned char *tga_data;
    unsigned char *tga_palette = NULL;
    int i, j;
-   unsigned char raw_data[4];
+   unsigned char raw_data[4] = {0};
    int RLE_count = 0;
    int RLE_repeating = 0;
    int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
 
    //   do a tiny bit of precessing
    if ( tga_image_type >= 8 )
@@ -4876,41 +5551,33 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
       tga_image_type -= 8;
       tga_is_RLE = 1;
    }
-   /* int tga_alpha_bits = tga_inverted & 15; */
    tga_inverted = 1 - ((tga_inverted >> 5) & 1);
 
-   //   error check
-   if ( //(tga_indexed) ||
-      (tga_width < 1) || (tga_height < 1) ||
-      (tga_image_type < 1) || (tga_image_type > 3) ||
-      ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16) &&
-      (tga_bits_per_pixel != 24) && (tga_bits_per_pixel != 32))
-      )
-   {
-      return NULL; // we don't report this as a bad TGA because we don't even know if it's TGA
-   }
-
    //   If I'm paletted, then I'll use the number of bits from the palette
-   if ( tga_indexed )
-   {
-      tga_comp = tga_palette_bits / 8;
-   }
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
 
    //   tga info
    *x = tga_width;
    *y = tga_height;
    if (comp) *comp = tga_comp;
 
-   tga_data = (unsigned char*)stbi__malloc( (size_t)tga_width * tga_height * tga_comp );
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
    if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
 
    // skip to the data's starting position (offset usually = 0)
    stbi__skip(s, tga_offset );
 
-   if ( !tga_indexed && !tga_is_RLE) {
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
       for (i=0; i < tga_height; ++i) {
-         int y = tga_inverted ? tga_height -i - 1 : i;
-         stbi_uc *tga_row = tga_data + y*tga_width*tga_comp;
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
          stbi__getn(s, tga_row, tga_width * tga_comp);
       }
    } else  {
@@ -4920,15 +5587,22 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
          //   any data to skip? (offset usually = 0)
          stbi__skip(s, tga_palette_start );
          //   load the palette
-         tga_palette = (unsigned char*)stbi__malloc( tga_palette_len * tga_palette_bits / 8 );
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
          if (!tga_palette) {
             STBI_FREE(tga_data);
             return stbi__errpuc("outofmem", "Out of memory");
          }
-         if (!stbi__getn(s, tga_palette, tga_palette_len * tga_palette_bits / 8 )) {
-            STBI_FREE(tga_data);
-            STBI_FREE(tga_palette);
-            return stbi__errpuc("bad palette", "Corrupt TGA");
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
          }
       }
       //   load the data
@@ -4958,23 +5632,22 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
             //   load however much data we did have
             if ( tga_indexed )
             {
-               //   read in 1 byte, then perform the lookup
-               int pal_idx = stbi__get8(s);
-               if ( pal_idx >= tga_palette_len )
-               {
-                  //   invalid index
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
                   pal_idx = 0;
                }
-               pal_idx *= tga_bits_per_pixel / 8;
-               for (j = 0; j*8 < tga_bits_per_pixel; ++j)
-               {
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
                   raw_data[j] = tga_palette[pal_idx+j];
                }
-            } else
-            {
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
+            } else {
                //   read in the data raw
-               for (j = 0; j*8 < tga_bits_per_pixel; ++j)
-               {
+               for (j = 0; j < tga_comp; ++j) {
                   raw_data[j] = stbi__get8(s);
                }
             }
@@ -5013,8 +5686,8 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
       }
    }
 
-   // swap RGB
-   if (tga_comp >= 3)
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16)
    {
       unsigned char* tga_pixel = tga_data;
       for (i=0; i < tga_width * tga_height; ++i)
@@ -5050,13 +5723,53 @@ static int stbi__psd_test(stbi__context *s)
    return r;
 }
 
-static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
+{
+   int count, nleft, len;
+   // Expands one RLE-packed channel: writes pixelCount bytes through p (advancing 4 bytes per write); returns 1 on success, 0 if a run overshoots.
+   count = 0;
+   while ((nleft = pixelCount - count) > 0) { // nleft = output bytes still to be produced
+      len = stbi__get8(s); // control byte: selects no-op, literal copy or replicate
+      if (len == 128) {
+         // No-op.
+      } else if (len < 128) {
+         // Copy next len+1 bytes literally.
+         len++;
+         if (len > nleft) return 0; // corrupt data
+         count += len;
+         while (len) {
+            *p = stbi__get8(s);
+            p += 4; // output stride is 4 bytes
+            len--;
+         }
+      } else if (len > 128) {
+         stbi_uc   val;
+         // Next -len+1 bytes in the dest are replicated from next source byte.
+         // (Interpret len as a negative 8-bit int.)
+         len = 257 - len; // control values 129..255 give run lengths 128..2
+         if (len > nleft) return 0; // corrupt data
+         val = stbi__get8(s);
+         count += len;
+         while (len) {
+            *p = val;
+            p += 4; // output stride is 4 bytes
+            len--;
+         }
+      }
+   }
+
+   return 1;
+}
+
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
 {
-   int   pixelCount;
+   int pixelCount;
    int channelCount, compression;
-   int channel, i, count, len;
+   int channel, i;
+   int bitdepth;
    int w,h;
    stbi_uc *out;
+   STBI_NOTUSED(ri);
 
    // Check identifier
    if (stbi__get32be(s) != 0x38425053)   // "8BPS"
@@ -5079,8 +5792,9 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    w = stbi__get32be(s);
 
    // Make sure the depth is 8 bits.
-   if (stbi__get16be(s) != 8)
-      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 bit");
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
 
    // Make sure the color mode is RGB.
    // Valid options are:
@@ -5112,8 +5826,18 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    if (compression > 1)
       return stbi__errpuc("bad compression", "PSD has an unknown compression format");
 
+   // Check size
+   if (!stbi__mad3sizes_valid(4, w, h, 0))
+      return stbi__errpuc("too large", "Corrupt PSD");
+
    // Create the destination image.
-   out = (stbi_uc *) stbi__malloc(4 * w*h);
+
+   if (!compression && bitdepth == 16 && bpc == 16) {
+      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
+      ri->bits_per_channel = 16;
+   } else
+      out = (stbi_uc *) stbi__malloc(4 * w*h);
+
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    pixelCount = w*h;
 
@@ -5130,7 +5854,7 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
       //     Else if n is 128, noop.
       // Endloop
 
-      // The RLE-compressed data is preceeded by a 2-byte data count for each row in the data,
+      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
       // which we're going to just skip.
       stbi__skip(s, h * channelCount * 2 );
 
@@ -5145,61 +5869,86 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
                *p = (channel == 3 ? 255 : 0);
          } else {
             // Read the RLE data.
-            count = 0;
-            while (count < pixelCount) {
-               len = stbi__get8(s);
-               if (len == 128) {
-                  // No-op.
-               } else if (len < 128) {
-                  // Copy next len+1 bytes literally.
-                  len++;
-                  count += len;
-                  while (len) {
-                     *p = stbi__get8(s);
-                     p += 4;
-                     len--;
-                  }
-               } else if (len > 128) {
-                  stbi_uc   val;
-                  // Next -len+1 bytes in the dest are replicated from next source byte.
-                  // (Interpret len as a negative 8-bit int.)
-                  len ^= 0x0FF;
-                  len += 2;
-                  val = stbi__get8(s);
-                  count += len;
-                  while (len) {
-                     *p = val;
-                     p += 4;
-                     len--;
-                  }
-               }
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
             }
          }
       }
 
    } else {
       // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
-      // where each channel consists of an 8-bit value for each pixel in the image.
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
 
       // Read the data by channel.
       for (channel = 0; channel < 4; channel++) {
-         stbi_uc *p;
-
-         p = out + channel;
-         if (channel > channelCount) {
+         if (channel >= channelCount) {
             // Fill this channel with default data.
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = channel == 3 ? 255 : 0;
+            if (bitdepth == 16 && bpc == 16) {
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               stbi__uint16 val = channel == 3 ? 65535 : 0;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = val;
+            } else {
+               stbi_uc *p = out+channel;
+               stbi_uc val = channel == 3 ? 255 : 0;
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = val;
+            }
          } else {
-            // Read the data.
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = stbi__get8(s);
+            if (ri->bits_per_channel == 16) {    // output bpc
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = (stbi__uint16) stbi__get16be(s);
+            } else {
+               stbi_uc *p = out+channel;
+               if (bitdepth == 16) {  // input bpc
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
+               } else {
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = stbi__get8(s);
+               }
+            }
+         }
+      }
+   }
+
+   // remove weird white matte from PSD
+   if (channelCount >= 4) {
+      if (ri->bits_per_channel == 16) {
+         for (i=0; i < w*h; ++i) {
+            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 65535) {
+               float a = pixel[3] / 65535.0f;
+               float ra = 1.0f / a;
+               float inv_a = 65535.0f * (1 - ra);
+               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
+               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
+               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
+            }
+         }
+      } else {
+         for (i=0; i < w*h; ++i) {
+            unsigned char *pixel = out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 255) {
+               float a = pixel[3] / 255.0f;
+               float ra = 1.0f / a;
+               float inv_a = 255.0f * (1 - ra);
+               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+            }
          }
       }
    }
 
+   // convert to desired output format
    if (req_comp && req_comp != 4) {
-      out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (ri->bits_per_channel == 16)
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
+      else
+         out = stbi__convert_format(out, 4, req_comp, w, h);
       if (out == NULL) return out; // stbi__convert_format frees input on failure
    }
 
@@ -5351,7 +6100,6 @@ static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *c
 
                   if (count >= 128) { // Repeated
                      stbi_uc value[4];
-                     int i;
 
                      if (count==128)
                         count = stbi__get16be(s);
@@ -5384,10 +6132,13 @@ static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *c
    return result;
 }
 
-static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp)
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
 {
    stbi_uc *result;
-   int i, x,y;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
 
    for (i=0; i<92; ++i)
       stbi__get8(s);
@@ -5395,14 +6146,14 @@ static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int re
    x = stbi__get16be(s);
    y = stbi__get16be(s);
    if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
-   if ((1 << 28) / x < y) return stbi__errpuc("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
 
    stbi__get32be(s); //skip `ratio'
    stbi__get16be(s); //skip `fields'
    stbi__get16be(s); //skip `pad'
 
    // intermediate buffer is RGBA
-   result = (stbi_uc *) stbi__malloc(x*y*4);
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
    memset(result, 0xff, x*y*4);
 
    if (!stbi__pic_load_core(s,x,y,comp, result)) {
@@ -5440,10 +6191,12 @@ typedef struct
 {
    int w,h;
    stbi_uc *out;                 // output buffer (always 4 components)
+   stbi_uc *background;          // The current "background" as far as a gif is concerned
+   stbi_uc *history; 
    int flags, bgindex, ratio, transparent, eflags;
    stbi_uc  pal[256][4];
    stbi_uc lpal[256][4];
-   stbi__gif_lzw codes[4096];
+   stbi__gif_lzw codes[8192];
    stbi_uc *color_table;
    int parse, step;
    int lflags;
@@ -5451,6 +6204,7 @@ typedef struct
    int max_x, max_y;
    int cur_x, cur_y;
    int line_size;
+   int delay;
 } stbi__gif;
 
 static int stbi__gif_test_raw(stbi__context *s)
@@ -5511,19 +6265,22 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 {
-   stbi__gif g;
-   if (!stbi__gif_header(s, &g, comp, 1)) {
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); // presumably on the heap to keep the large stbi__gif off the stack
+   if (!g || !stbi__gif_header(s, g, comp, 1)) { // a failed allocation is reported like an unreadable header instead of dereferencing NULL
+      if (g) STBI_FREE(g);
       stbi__rewind( s );
       return 0;
    }
-   if (x) *x = g.w;
-   if (y) *y = g.h;
+   if (x) *x = g->w; // x and y are optional outputs
+   if (y) *y = g->h;
+   STBI_FREE(g);
    return 1;
 }
 
 static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 {
    stbi_uc *p, *c;
+   int idx; 
 
    // recurse to decode the prefixes, since the linked-list is backwards,
    // and working backwards through an interleaved image would be nasty
@@ -5532,10 +6289,12 @@ static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 
    if (g->cur_y >= g->max_y) return;
 
-   p = &g->out[g->cur_x + g->cur_y];
-   c = &g->color_table[g->codes[code].suffix * 4];
+   idx = g->cur_x + g->cur_y; 
+   p = &g->out[idx];
+   g->history[idx / 4] = 1;  
 
-   if (c[3] >= 128) {
+   c = &g->color_table[g->codes[code].suffix * 4];
+   if (c[3] > 128) { // don't render transparent pixels; 
       p[0] = c[2];
       p[1] = c[1];
       p[2] = c[0];
@@ -5558,7 +6317,7 @@ static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
 {
    stbi_uc lzw_cs;
-   stbi__int32 len, code;
+   stbi__int32 len, init_code;
    stbi__uint32 first;
    stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
    stbi__gif_lzw *p;
@@ -5571,10 +6330,10 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
    codemask = (1 << codesize) - 1;
    bits = 0;
    valid_bits = 0;
-   for (code = 0; code < clear; code++) {
-      g->codes[code].prefix = -1;
-      g->codes[code].first = (stbi_uc) code;
-      g->codes[code].suffix = (stbi_uc) code;
+   for (init_code = 0; init_code < clear; init_code++) {
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
    }
 
    // support no starting clear code
@@ -5609,11 +6368,16 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
                stbi__skip(s,len);
             return g->out;
          } else if (code <= avail) {
-            if (first) return stbi__errpuc("no clear code", "Corrupt GIF");
+            if (first) {
+               return stbi__errpuc("no clear code", "Corrupt GIF");
+            }
 
             if (oldcode >= 0) {
                p = &g->codes[avail++];
-               if (avail > 4096)        return stbi__errpuc("too many codes", "Corrupt GIF");
+               if (avail > 8192) {
+                  return stbi__errpuc("too many codes", "Corrupt GIF");
+               }
+
                p->prefix = (stbi__int16) oldcode;
                p->first = g->codes[oldcode].first;
                p->suffix = (code == avail) ? p->first : g->codes[code].first;
@@ -5635,43 +6399,71 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
    }
 }
 
-static void stbi__fill_gif_background(stbi__gif *g)
-{
-   int i;
-   stbi_uc *c = g->pal[g->bgindex];
-   // @OPTIMIZE: write a dword at a time
-   for (i = 0; i < g->w * g->h * 4; i += 4) {
-      stbi_uc *p  = &g->out[i];
-      p[0] = c[2];
-      p[1] = c[1];
-      p[2] = c[0];
-      p[3] = c[3];
-   }
-}
-
 // this function is designed to support animated gifs, although stb_image doesn't support it
-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp)
+// two back is the image from two frames ago, used for a very specific disposal format
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
 {
-   int i;
-   stbi_uc *old_out = 0;
+   int dispose; 
+   int first_frame; 
+   int pi; 
+   int pcount; 
+   STBI_NOTUSED(req_comp);
 
+   // on first frame, any non-written pixels get the background colour (non-transparent)
+   first_frame = 0; 
    if (g->out == 0) {
       if (!stbi__gif_header(s, g, comp,0))     return 0; // stbi__g_failure_reason set by stbi__gif_header
       g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
+      g->background = (stbi_uc *) stbi__malloc(4 * g->w * g->h); 
+      g->history = (stbi_uc *) stbi__malloc(g->w * g->h); 
       if (g->out == 0)                      return stbi__errpuc("outofmem", "Out of memory");
-      stbi__fill_gif_background(g);
+
+      // image is treated as "transparent" at the start - ie, nothing overwrites the current background; 
+      // background colour is only used for pixels that are not rendered first frame, after that "background"
+      // color refers to the color that was there the previous frame. 
+      memset( g->out, 0x00, 4 * g->w * g->h ); 
+      memset( g->background, 0x00, 4 * g->w * g->h ); // state of the background (starts transparent)
+      memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+      first_frame = 1; 
    } else {
-      // animated-gif-only path
-      if (((g->eflags & 0x1C) >> 2) == 3) {
-         old_out = g->out;
-         g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
-         if (g->out == 0)                   return stbi__errpuc("outofmem", "Out of memory");
-         memcpy(g->out, old_out, g->w*g->h*4);
+      // second frame - how do we dispose of the previous one?
+      dispose = (g->eflags & 0x1C) >> 2; 
+      pcount = g->w * g->h; 
+
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
+      }
+
+      if (dispose == 3) { // use previous graphic
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); 
+            }
+         }
+      } else if (dispose == 2) { 
+         // restore what was changed last frame to background before that frame; 
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); 
+            }
+         }
+      } else {
+         // This is a non-disposal case either way, so just
+         // leave the pixels as is, and they will become the new background
+         // 1: do not dispose
+         // 0:  not specified.
       }
+
+      // background is what out is after the undoing of the previous frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h ); 
    }
 
+   // clear my history; 
+   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+
    for (;;) {
-      switch (stbi__get8(s)) {
+      int tag = stbi__get8(s); 
+      switch (tag) {
          case 0x2C: /* Image Descriptor */
          {
             stbi__int32 x, y, w, h;
@@ -5706,38 +6498,60 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
                stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
                g->color_table = (stbi_uc *) g->lpal;
             } else if (g->flags & 0x80) {
-               for (i=0; i < 256; ++i)  // @OPTIMIZE: stbi__jpeg_reset only the previous transparent
-                  g->pal[i][3] = 255;
-               if (g->transparent >= 0 && (g->eflags & 0x01))
-                  g->pal[g->transparent][3] = 0;
                g->color_table = (stbi_uc *) g->pal;
             } else
-               return stbi__errpuc("missing color table", "Corrupt GIF");
-
+               return stbi__errpuc("missing color table", "Corrupt GIF");            
+            
             o = stbi__process_gif_raster(s, g);
             if (o == NULL) return NULL;
 
-            if (req_comp && req_comp != 4)
-               o = stbi__convert_format(o, 4, req_comp, g->w, g->h);
+            // if this was the first frame, 
+            pcount = g->w * g->h; 
+            if (first_frame && (g->bgindex > 0)) {
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; 
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); 
+                  }
+               }
+            }
+
             return o;
          }
 
          case 0x21: // Comment Extension.
          {
             int len;
-            if (stbi__get8(s) == 0xF9) { // Graphic Control Extension.
+            int ext = stbi__get8(s); 
+            if (ext == 0xF9) { // Graphic Control Extension.
                len = stbi__get8(s);
                if (len == 4) {
                   g->eflags = stbi__get8(s);
-                  stbi__get16le(s); // delay
-                  g->transparent = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+                  // unset old transparent
+                  if (g->transparent >= 0) {
+                     g->pal[g->transparent][3] = 255; 
+                  } 
+                  if (g->eflags & 0x01) {
+                     g->transparent = stbi__get8(s);
+                     if (g->transparent >= 0) {
+                        g->pal[g->transparent][3] = 0; 
+                     }
+                  } else {
+                     // don't need transparent
+                     stbi__skip(s, 1); 
+                     g->transparent = -1; 
+                  }
                } else {
                   stbi__skip(s, len);
                   break;
                }
-            }
-            while ((len = stbi__get8(s)) != 0)
+            } 
+            while ((len = stbi__get8(s)) != 0) {
                stbi__skip(s, len);
+            }
             break;
          }
 
@@ -5750,19 +6564,91 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
    }
 }
 
-static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{ // Decodes every frame of a GIF into one buffer of back-to-back frames; *z receives the frame count, *delays (optional) one delay per frame.
+   if (stbi__gif_test(s)) {
+      int layers = 0;          // number of frames decoded so far
+      stbi_uc *u = 0;          // frame returned by the most recent stbi__gif_load_next call
+      stbi_uc *out = 0;        // growing buffer holding all decoded frames, 4 bytes per pixel
+      stbi_uc *two_back = 0;   // frame preceding the most recently decoded one (points into out)
+      stbi__gif g;
+      int stride;              // bytes occupied by one frame in out
+      memset(&g, 0, sizeof(g));
+      if (delays) {
+         *delays = 0;
+      }
+
+      do {
+         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+
+         if (u) {
+            *x = g.w;
+            *y = g.h;
+            ++layers;
+            stride = g.w * g.h * 4;
+            // grow the output (and the delay table, if requested) by one frame
+            if (out) {
+               out = (stbi_uc*) STBI_REALLOC( out, layers * stride );
+               if (delays) {
+                  *delays = (int*) STBI_REALLOC( *delays, sizeof(int) * layers );
+               }
+            } else {
+               out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (delays) {
+                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
+               }
+            }
+            memcpy( out + ((layers - 1) * stride), u, stride );
+            if (layers >= 2) {
+               two_back = out + (layers - 2) * stride; // recomputed each frame since realloc may move out; must stay inside the buffer
+            }
+
+            if (delays) {
+               (*delays)[layers - 1U] = g.delay;
+            }
+         }
+      } while (u != 0);
+
+      // free the temporary buffers held in the decoder state
+      STBI_FREE(g.out);
+      STBI_FREE(g.history);
+      STBI_FREE(g.background);
+
+      // do the final conversion after loading everything; the frames are passed as one image of width layers * g.w
+      if (req_comp && req_comp != 4)
+         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+
+      *z = layers;
+      return out;
+   } else {
+      return stbi__errpuc("not GIF", "Image was not as a gif type.");
+   }
+}
+
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi_uc *u = 0;
    stbi__gif g;
    memset(&g, 0, sizeof(g));
+   STBI_NOTUSED(ri);
 
-   u = stbi__gif_load_next(s, &g, comp, req_comp);
+   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
    if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
    if (u) {
       *x = g.w;
       *y = g.h;
+      g.out = 0; // the frame buffer now belongs to u (and then the caller); keep the cleanup below away from it
+      // moved conversion to after successful load so that the same
+      // can be done for multiple frames.
+      if (req_comp && req_comp != 4)
+         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
    }
 
+   // free the buffers needed for multiple frame loading, and the frame buffer too if no frame was returned
+   STBI_FREE(g.history);
+   STBI_FREE(g.background);
+   STBI_FREE(g.out); // still non-NULL only when decoding stopped after the buffer was allocated
    return u;
 }
 
@@ -5776,20 +6662,24 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
 // Radiance RGBE HDR loader
 // originally by Nicolas Schulz
 #ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context *s)
+static int stbi__hdr_test_core(stbi__context *s, const char *signature) // returns 1 if the next bytes read from s equal signature, else 0
 {
-   const char *signature = "#?RADIANCE\n";
    int i;
    for (i=0; signature[i]; ++i)
       if (stbi__get8(s) != signature[i])
-         return 0;
+          return 0; // mismatch: this path does not rewind
+   stbi__rewind(s); // full match: rewind before reporting success
    return 1;
 }
 
 static int stbi__hdr_test(stbi__context* s)
 {
-   int r = stbi__hdr_test_core(s);
+   int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); // first accepted signature
    stbi__rewind(s);
+   if(!r) { // not "#?RADIANCE": retry with the alternative "#?RGBE" signature
+       r = stbi__hdr_test_core(s, "#?RGBE\n");
+       stbi__rewind(s); // rewind again so the stream is rewound whichever way the second test went
+   }
    return r;
 }
 
@@ -5843,7 +6733,7 @@ static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
    }
 }
 
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    char buffer[STBI__HDR_BUFLEN];
    char *token;
@@ -5854,10 +6744,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    int len;
    unsigned char count, value;
    int i, j, k, c1,c2, z;
-
+   const char *headerToken;
+   STBI_NOTUSED(ri);
 
    // Check identifier
-   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0)
+   headerToken = stbi__hdr_gettoken(s,buffer);
+   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
       return stbi__errpf("not HDR", "Corrupt HDR image");
 
    // Parse header
@@ -5886,8 +6778,13 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    if (comp) *comp = 3;
    if (req_comp == 0) req_comp = 3;
 
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
    // Read data
-   hdr_data = (float *) stbi__malloc(height * width * req_comp * sizeof(float));
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
 
    // Load image data
    // image data is stored as some number of sca
@@ -5926,20 +6823,29 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          len <<= 8;
          len |= stbi__get8(s);
          if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
-         if (scanline == NULL) scanline = (stbi_uc *) stbi__malloc(width * 4);
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
 
          for (k = 0; k < 4; ++k) {
+            int nleft;
             i = 0;
-            while (i < width) {
+            while ((nleft = width - i) > 0) {
                count = stbi__get8(s);
                if (count > 128) {
                   // Run
                   value = stbi__get8(s);
                   count -= 128;
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = value;
                } else {
                   // Dump
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = stbi__get8(s);
                }
@@ -5948,7 +6854,8 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          for (i=0; i < width; ++i)
             stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
       }
-      STBI_FREE(scanline);
+      if (scanline)
+         STBI_FREE(scanline);
    }
 
    return hdr_data;
@@ -5959,8 +6866,13 @@ static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
    char buffer[STBI__HDR_BUFLEN];
    char *token;
    int valid = 0;
+   int dummy;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
 
-   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0) {
+   if (stbi__hdr_test(s) == 0) {
        stbi__rewind( s );
        return 0;
    }
@@ -5997,29 +6909,17 @@ static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
 #ifndef STBI_NO_BMP
 static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int hsz;
-   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s,12);
-   hsz = stbi__get32le(s);
-   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (hsz == 12) {
-      *x = stbi__get16le(s);
-      *y = stbi__get16le(s);
-   } else {
-      *x = stbi__get32le(s);
-      *y = stbi__get32le(s);
-   }
-   if (stbi__get16le(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *comp = stbi__get16le(s) / 8;
+   void *p;
+   stbi__bmp_data info;
+   // Header validation is delegated to stbi__bmp_parse_header; returns 1 and fills the optional outputs on success, 0 otherwise.
+   info.all_a = 255;
+   p = stbi__bmp_parse_header(s, &info);
+   stbi__rewind( s ); // rewound on both the success and the failure path
+   if (p == NULL)
+      return 0;
+   if (x) *x = s->img_x; // x, y and comp may each be NULL
+   if (y) *y = s->img_y;
+   if (comp) *comp = info.ma ? 4 : 3; // 4 components when info.ma is non-zero, otherwise 3
    return 1;
 }
 #endif
@@ -6027,7 +6927,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 #ifndef STBI_NO_PSD
 static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int channelCount;
+   int channelCount, dummy, depth;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
    if (stbi__get32be(s) != 0x38425053) {
        stbi__rewind( s );
        return 0;
@@ -6044,7 +6947,8 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    }
    *y = stbi__get32be(s);
    *x = stbi__get32be(s);
-   if (stbi__get16be(s) != 8) {
+   depth = stbi__get16be(s);
+   if (depth != 8 && depth != 16) {
        stbi__rewind( s );
        return 0;
    }
@@ -6055,22 +6959,61 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    *comp = 4;
    return 1;
 }
+
+static int stbi__psd_is16(stbi__context *s) // 1 if the header reads as a PSD with 16-bit depth; otherwise rewinds s and returns 0
+{
+   int channelCount, depth;
+   if (stbi__get32be(s) != 0x38425053) { // bytes 0x38 0x42 0x50 0x53 are ASCII "8BPS"
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 1) {
+       stbi__rewind( s );
+       return 0;
+   }
+   stbi__skip(s, 6);
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   (void) stbi__get32be(s); // two 32-bit fields read and discarded; presumably height and width, as stbi__psd_info reads *y then *x before depth
+   (void) stbi__get32be(s);
+   depth = stbi__get16be(s);
+   if (depth != 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   return 1; // note: no rewind on the success path
+}
 #endif
 
 #ifndef STBI_NO_PIC
 static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int act_comp=0,num_packets=0,chained;
+   int act_comp=0,num_packets=0,chained,dummy;
    stbi__pic_packet packets[10];
 
-   stbi__skip(s, 92);
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
+
+   stbi__skip(s, 88);
 
    *x = stbi__get16be(s);
    *y = stbi__get16be(s);
-   if (stbi__at_eof(s))  return 0;
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
    if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
-       stbi__rewind( s );
-       return 0;
+      stbi__rewind( s );
+      return 0;
    }
 
    stbi__skip(s, 8);
@@ -6130,16 +7073,22 @@ static int      stbi__pnm_test(stbi__context *s)
    return 1;
 }
 
-static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi_uc *out;
+   STBI_NOTUSED(ri);
+
    if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
       return 0;
+
    *x = s->img_x;
    *y = s->img_y;
-   *comp = s->img_n;
+   if (comp) *comp = s->img_n;
 
-   out = (stbi_uc *) stbi__malloc(s->img_n * s->img_x * s->img_y);
+   if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "PNM too large");
+
+   out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
 
@@ -6157,8 +7106,16 @@ static int      stbi__pnm_isspace(char c)
 
 static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
 {
-   while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-      *c = (char) stbi__get8(s);
+   for (;;) {
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
+
+      if (stbi__at_eof(s) || *c != '#')
+         break;
+
+      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
+         *c = (char) stbi__get8(s);
+   }
 }
 
 static int      stbi__pnm_isdigit(char c)
@@ -6180,16 +7137,20 @@ static int      stbi__pnm_getinteger(stbi__context *s, char *c)
 
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int maxv;
+   int maxv, dummy;
    char c, p, t;
 
-   stbi__rewind( s );
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
 
    // Get identifier
    p = (char) stbi__get8(s);
    t = (char) stbi__get8(s);
    if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind( s );
+       stbi__rewind(s);
        return 0;
    }
 
@@ -6255,6 +7216,19 @@ static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
    return stbi__err("unknown image type", "Image not of any known type, or corrupt");
 }
 
+static int stbi__is_16_main(stbi__context *s) // 1 if an enabled PNG or PSD probe reports 16-bit data, else 0
+{
+   #ifndef STBI_NO_PNG
+   if (stbi__png_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_is16(s))  return 1;
+   #endif
+
+   return 0; // no other format is probed here, so everything else reports "not 16-bit"
+}
+
 #ifndef STBI_NO_STDIO
 STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
 {
@@ -6276,6 +7250,27 @@ STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
    fseek(f,pos,SEEK_SET);
    return r;
 }
+
+STBIDEF int stbi_is_16_bit(char const *filename) // opens filename, probes it with stbi_is_16_bit_from_file, closes it, returns the probe result
+{
+    FILE *f = stbi__fopen(filename, "rb");
+    int result;
+    if (!f) return stbi__err("can't fopen", "Unable to open file"); // open failure: the return value is whatever stbi__err yields
+    result = stbi_is_16_bit_from_file(f);
+    fclose(f);
+    return result;
+}
+
+STBIDEF int stbi_is_16_bit_from_file(FILE *f) // probes f via stbi__is_16_main, then seeks f back to where it was on entry
+{
+   int r;
+   stbi__context s;
+   long pos = ftell(f); // remember the caller's position (ftell's result is not checked for failure)
+   stbi__start_file(&s, f);
+   r = stbi__is_16_main(&s);
+   fseek(f,pos,SEEK_SET); // undo whatever the probe consumed
+   return r;
+}
 #endif // !STBI_NO_STDIO
 
 STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
@@ -6292,10 +7287,64 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
    return stbi__info_main(&s,x,y,comp);
 }
 
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len) // runs the 16-bit probe over an in-memory buffer of len bytes
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__is_16_main(&s);
+}
+
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user) // runs the 16-bit probe over a callback-driven stream
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); // the cast drops const to match stbi__start_callbacks' parameter
+   return stbi__is_16_main(&s);
+}
+
 #endif // STB_IMAGE_IMPLEMENTATION
 
 /*
    revision history:
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs 
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change stbi__shiftsigned to avoid clang -O2 bug
+                         1-bit BMP
+                         *_is_16_bit api
+                         avoid warnings
+      2.16  (2017-07-23) all functions have 16-bit variants;
+                         STBI_NO_STDIO works again;
+                         compilation fixes;
+                         fix rounding in unpremultiply;
+                         optimize vertical flip;
+                         disable raw_len validation;
+                         documentation fixes
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP now shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
       2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
       2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
       2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
@@ -6436,3 +7485,46 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
       0.50  (2006-11-19)
               first released version
 */
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/external/include/stb_image_write.h b/external/include/stb_image_write.h
index 59052f9c..ebaa0c55 100644
--- a/external/include/stb_image_write.h
+++ b/external/include/stb_image_write.h
@@ -1,5 +1,5 @@
-/* stb_image_write - v0.98 - public domain - http://nothings.org/stb/stb_image_write.h
-   writes out PNG/BMP/TGA images to C stdio - Sean Barrett 2010
+/* stb_image_write - v1.11 - public domain - http://nothings.org/stb/stb_image_write.h
+   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
                                      no warranty implied; use at your own risk
 
    Before #including,
@@ -10,31 +10,74 @@
 
    Will probably not work correctly with strict-aliasing optimizations.
 
+   If using a modern Microsoft Compiler, non-safe versions of CRT calls may cause 
+   compilation warnings or even errors. To avoid this, also before #including,
+
+       #define STBI_MSC_SECURE_CRT
+
 ABOUT:
 
-   This header file is a library for writing images to C stdio. It could be
-   adapted to write to memory or a general streaming interface; let me know.
+   This header file is a library for writing images to C stdio or a callback.
 
    The PNG output is not optimal; it is 20-50% larger than the file
-   written by a decent optimizing implementation. This library is designed
-   for source code compactness and simplicitly, not optimal image file size
-   or run-time performance.
+   written by a decent optimizing implementation; though providing a custom
+   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
+   This library is designed for source code compactness and simplicity,
+   not optimal image file size or run-time performance.
 
 BUILDING:
 
    You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
    You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
    malloc,realloc,free.
-   You can define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
+   for PNG compression (instead of the builtin one), it must have the following signature:
+   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
+   The returned data will be freed with STBIW_FREE() (free() by default),
+   so it must be heap allocated with STBIW_MALLOC() (malloc() by default).
+
+UNICODE:
+
+   If compiling for Windows and you wish to use Unicode filenames, compile
+   with
+       #define STBI_WINDOWS_UTF8   (the name this version's #if checks test; STBIW_WINDOWS_UTF8 is not checked)
+   and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
+   Windows wchar_t filenames to utf8.
 
 USAGE:
 
-   There are four functions, one for each image file format:
+   There are five functions, one for each image file format:
 
      int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
      int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
      int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_hdr(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
+     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+
+     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
+
+   There are also five equivalent functions that use an arbitrary write function. You are
+   expected to open/close your file-equivalent before and after calling these:
+
+     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+
+   where the callback is:
+      void stbi_write_func(void *context, void *data, int size);
+
+   You can configure it with these global variables:
+      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
+      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
+      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
+
+
+   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
+   functions, so the library will not use stdio.h at all. However, this will
+   also disable HDR writing, because it requires stdio for formatted output.
 
    Each function returns 0 on failure and non-0 on success.
 
@@ -58,69 +101,144 @@
    writer, both because it is in BGR order and because it may have padding
    at the end of the line.)
 
+   PNG allows you to set the deflate compression level by setting the global
+   variable 'stbi_write_png_compression_level' (it defaults to 8).
+
    HDR expects linear float data. Since the format is always 32-bit rgb(e)
    data, alpha (if provided) is discarded, and for monochrome data it is
    replicated across all three channels.
 
+   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
+   data, set the global variable 'stbi_write_tga_with_rle' to 0.
+   
+   JPEG ignores alpha channels in input data; quality is between 1 and 100.
+   Higher quality looks better but results in a bigger image.
+   JPEG baseline (no JPEG progressive).
+
 CREDITS:
 
-   PNG/BMP/TGA
-      Sean Barrett
-   HDR
-      Baldur Karlsson
-   TGA monochrome:
-      Jean-Sebastien Guay
-   misc enhancements:
-      Tim Kelsey
+
+   Sean Barrett           -    PNG/BMP/TGA 
+   Baldur Karlsson        -    HDR
+   Jean-Sebastien Guay    -    TGA monochrome
+   Tim Kelsey             -    misc enhancements
+   Alan Hickman           -    TGA RLE
+   Emmanuel Julien        -    initial file IO callback implementation
+   Jon Olick              -    original jo_jpeg.cpp code
+   Daniel Gibson          -    integrate JPEG, allow external zlib
+   Aarni Koskela          -    allow choosing PNG filter
+
    bugfixes:
       github:Chribba
-      
+      Guillaume Chereau
+      github:jry2
+      github:romigrou
+      Sergio Gonzalez
+      Jonas Karlsson
+      Filip Wasil
+      Thatcher Ulrich
+      github:poppolopoppo
+      Patrick Boettcher
+      github:xeekworx
+      Cap Petschulat
+      Simon Rodriguez
+      Ivan Tikhonov
+      github:ignotion
+      Adam Schackart
+
 LICENSE
 
-This software is in the public domain. Where that dedication is not
-recognized, you are granted a perpetual, irrevocable license to copy,
-distribute, and modify this file as you see fit.      
+  See end of file for license information.
+
 */
 
 #ifndef INCLUDE_STB_IMAGE_WRITE_H
 #define INCLUDE_STB_IMAGE_WRITE_H
 
+#include <stdlib.h>
+
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF  static
+#else
 #ifdef __cplusplus
-extern "C" {
+#define STBIWDEF  extern "C"
+#else
+#define STBIWDEF  extern
+#endif
+#endif
 #endif
 
-extern int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
-extern int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
-extern int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
-extern int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
+extern int stbi_write_tga_with_rle;
+extern int stbi_write_png_compression_level;
+extern int stbi_write_force_png_filter;
+#endif
 
-#ifdef __cplusplus
-}
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
+
+#ifdef STBI_WINDOWS_UTF8
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
 #endif
+#endif
+
+typedef void stbi_write_func(void *context, void *data, int size);
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
+
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 
 #endif//INCLUDE_STB_IMAGE_WRITE_H
 
 #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
 
+#ifdef _WIN32
+   #ifndef _CRT_SECURE_NO_WARNINGS
+   #define _CRT_SECURE_NO_WARNINGS
+   #endif
+   #ifndef _CRT_NONSTDC_NO_DEPRECATE
+   #define _CRT_NONSTDC_NO_DEPRECATE
+   #endif
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
+
 #include <stdarg.h>
 #include <stdlib.h>
-#include <stdio.h>
 #include <string.h>
 #include <math.h>
 
-#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && defined(STBIW_REALLOC)
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
 // ok
-#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC)
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
 // ok
 #else
-#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC."
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
 #endif
 
 #ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz)    malloc(sz)
-#define STBIW_REALLOC(p,sz) realloc(p,sz)
-#define STBIW_FREE(p)       free(p)
+#define STBIW_MALLOC(sz)        malloc(sz)
+#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
+#define STBIW_FREE(p)           free(p)
+#endif
+
+#ifndef STBIW_REALLOC_SIZED
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
 #endif
+
+
 #ifndef STBIW_MEMMOVE
 #define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
 #endif
@@ -131,22 +249,127 @@ extern int stbi_write_hdr(char const *filename, int w, int h, int comp, const fl
 #define STBIW_ASSERT(x) assert(x)
 #endif
 
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
+
+#ifdef STB_IMAGE_WRITE_STATIC
+static int stbi__flip_vertically_on_write=0;
+static int stbi_write_png_compression_level = 8;
+static int stbi_write_tga_with_rle = 1;
+static int stbi_write_force_png_filter = -1;
+#else
+int stbi_write_png_compression_level = 8;
+int stbi__flip_vertically_on_write=0;
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+
+STBIWDEF void stbi_flip_vertically_on_write(int flag) // stores flag; stbiw__write_pixels negates its vdir while the stored value is non-zero
+{
+   stbi__flip_vertically_on_write = flag;
+}
+
+typedef struct // output sink used by the writers: a callback plus the pointer handed back to it on each call
+{
+   stbi_write_func *func; // invoked as func(context, data, size) for every piece of output
+   void *context;         // opaque to the writers; the stdio path stores a FILE* here
+} stbi__write_context;
+
+// initialize a callback-based context
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context) // fills in both fields of *s; nothing is written yet
+{
+   s->func    = c;       // callback that will receive the output bytes
+   s->context = context; // passed back unchanged as the callback's first argument
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbi__stdio_write(void *context, void *data, int size) // stbi_write_func adapter for stdio: context is the FILE* to write to
+{
+   fwrite(data,1,size,(FILE*) context); // return value is not checked, so short writes go unreported
+}
+
+#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#ifdef __cplusplus
+#define STBIW_EXTERN extern "C"
+#else
+#define STBIW_EXTERN extern
+#endif
+STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) // converts NUL-terminated input to UTF-8 in buffer; returns WideCharToMultiByte's result
+{
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); // explicit cast: the API takes the byte capacity as int
+}
+#endif
+
+static FILE *stbiw__fopen(char const *filename, char const *mode) // fopen wrapper; returns 0 (null) on any failure
+{
+   FILE *f;
+#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+      return 0;
+   // the last argument is a capacity in wchar_t elements, not bytes; a bare sizeof() would overstate the buffer
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
+
+#if _MSC_VER >= 1400
+   if (0 != _wfopen_s(&f, wFilename, wMode))
+      f = 0;
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode)) // a non-zero return code means failure, so f is forced to null
+      f=0;
+#else
+   f = fopen(filename, mode);
+#endif
+   return f;
+}
+
+static int stbi__start_write_file(stbi__write_context *s, const char *filename) // opens filename with mode "wb" and points s at it; returns 0 if the open failed
+{
+   FILE *f = stbiw__fopen(filename, "wb");
+   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f); // s is filled in even when f is null
+   return f != NULL;
+}
+
+static void stbi__end_write_file(stbi__write_context *s) // closes the FILE* held in s->context (the value stbi__start_write_file stores there)
+{
+   fclose((FILE *)s->context);
+}
+
+#endif // !STBI_WRITE_NO_STDIO
+
 typedef unsigned int stbiw_uint32;
 typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
 
-static void writefv(FILE *f, const char *fmt, va_list v)
+static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
 {
    while (*fmt) {
       switch (*fmt++) {
          case ' ': break;
-         case '1': { unsigned char x = (unsigned char) va_arg(v, int); fputc(x,f); break; }
-         case '2': { int x = va_arg(v,int); unsigned char b[2];
-                     b[0] = (unsigned char) x; b[1] = (unsigned char) (x>>8);
-                     fwrite(b,2,1,f); break; }
-         case '4': { stbiw_uint32 x = va_arg(v,int); unsigned char b[4];
-                     b[0]=(unsigned char)x; b[1]=(unsigned char)(x>>8);
-                     b[2]=(unsigned char)(x>>16); b[3]=(unsigned char)(x>>24);
-                     fwrite(b,4,1,f); break; }
+         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
+                     s->func(s->context,&x,1);
+                     break; }
+         case '2': { int x = va_arg(v,int);
+                     unsigned char b[2];
+                     b[0] = STBIW_UCHAR(x);
+                     b[1] = STBIW_UCHAR(x>>8);
+                     s->func(s->context,b,2);
+                     break; }
+         case '4': { stbiw_uint32 x = va_arg(v,int);
+                     unsigned char b[4];
+                     b[0]=STBIW_UCHAR(x);
+                     b[1]=STBIW_UCHAR(x>>8);
+                     b[2]=STBIW_UCHAR(x>>16);
+                     b[3]=STBIW_UCHAR(x>>24);
+                     s->func(s->context,b,4);
+                     break; }
          default:
             STBIW_ASSERT(0);
             return;
@@ -154,22 +377,70 @@ static void writefv(FILE *f, const char *fmt, va_list v)
    }
 }
 
-static void write3(FILE *f, unsigned char a, unsigned char b, unsigned char c)
+static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) // variadic front end for stbiw__writefv; each '1', '2' or '4' in fmt consumes one int argument
+{
+   va_list v;
+   va_start(v, fmt);
+   stbiw__writefv(s, fmt, v);
+   va_end(v);
+}
+
+static void stbiw__putc(stbi__write_context *s, unsigned char c) // sends the single byte c to the context's write callback
+{
+   s->func(s->context, &c, 1);
+}
+
+static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
 {
    unsigned char arr[3];
    arr[0] = a, arr[1] = b, arr[2] = c;
-   fwrite(arr, 3, 1, f);
+   s->func(s->context, arr, 3);
 }
 
-static void write_pixels(FILE *f, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) // emits the one comp-channel pixel at d through s->func
 {
    unsigned char bg[3] = { 255, 0, 255}, px[3];
+   int k;
+
+   if (write_alpha < 0) // negative write_alpha: the last channel is written before the color bytes
+      s->func(s->context, &d[comp - 1], 1);
+
+   switch (comp) {
+      case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
+      case 1:
+         if (expand_mono)
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+         else
+            s->func(s->context, d, 1);  // monochrome TGA
+         break;
+      case 4:
+         if (!write_alpha) {
+            // composite against pink background
+            for (k = 0; k < 3; ++k)
+               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; // blend weight is the pixel's 4th channel, d[3]
+            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
+            break;
+         }
+         /* FALLTHROUGH */
+      case 3:
+         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); // rgb_dir 1 keeps channel order 0,1,2; rgb_dir -1 emits 2,1,0
+         break;
+   }
+   if (write_alpha > 0) // positive write_alpha: the last channel is written after the color bytes
+      s->func(s->context, &d[comp - 1], 1);
+}
+
+static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+{
    stbiw_uint32 zero = 0;
-   int i,j,k, j_end;
+   int i,j, j_end;
 
    if (y <= 0)
       return;
 
+   if (stbi__flip_vertically_on_write)
+      vdir *= -1;
+
    if (vdir < 0)
       j_end = -1, j = y-1;
    else
@@ -178,81 +449,165 @@ static void write_pixels(FILE *f, int rgb_dir, int vdir, int x, int y, int comp,
    for (; j != j_end; j += vdir) {
       for (i=0; i < x; ++i) {
          unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
-         if (write_alpha < 0)
-            fwrite(&d[comp-1], 1, 1, f);
-         switch (comp) {
-            case 1: fwrite(d, 1, 1, f);
-                    break;
-            case 2: if (expand_mono)
-                       write3(f, d[0],d[0],d[0]); // monochrome bmp
-                    else
-                       fwrite(d, 1, 1, f);  // monochrome TGA
-                    break;
-            case 4:
-               if (!write_alpha) {
-                  // composite against pink background
-                  for (k=0; k < 3; ++k)
-                     px[k] = bg[k] + ((d[k] - bg[k]) * d[3])/255;
-                  write3(f, px[1-rgb_dir],px[1],px[1+rgb_dir]);
-                  break;
-               }
-               /* FALLTHROUGH */
-            case 3:
-               write3(f, d[1-rgb_dir],d[1],d[1+rgb_dir]);
-               break;
-         }
-         if (write_alpha > 0)
-            fwrite(&d[comp-1], 1, 1, f);
+         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
       }
-      fwrite(&zero,scanline_pad,1,f);
+      s->func(s->context, &zero, scanline_pad);
    }
 }
 
-static int outfile(char const *filename, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
+static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
 {
-   FILE *f;
-   if (y < 0 || x < 0) return 0;
-   f = fopen(filename, "wb");
-   if (f) {
+   if (y < 0 || x < 0) {
+      return 0;
+   } else {
       va_list v;
       va_start(v, fmt);
-      writefv(f, fmt, v);
+      stbiw__writefv(s, fmt, v);
       va_end(v);
-      write_pixels(f,rgb_dir,vdir,x,y,comp,data,alpha,pad,expand_mono);
-      fclose(f);
+      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
+      return 1;
    }
-   return f != NULL;
 }
 
-int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
+static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
 {
    int pad = (-x*3) & 3;
-   return outfile(filename,-1,-1,x,y,comp,1,(void *) data,0,pad,
+   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
            "11 4 22 4" "4 44 22 444444",
            'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
             40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
 }
 
-int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_bmp_core(&s, x, y, comp, data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_bmp_core(&s, x, y, comp, data);
+      stbi__end_write_file(&s);
+      return r;
+   } else
+      return 0;
+}
+#endif //!STBI_WRITE_NO_STDIO
+
+static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
 {
    int has_alpha = (comp == 2 || comp == 4);
    int colorbytes = has_alpha ? comp-1 : comp;
    int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
-   return outfile(filename, -1,-1, x, y, comp, 0, (void *) data, has_alpha, 0,
-                  "111 221 2222 11", 0,0,format, 0,0,0, 0,0,x,y, (colorbytes+has_alpha)*8, has_alpha*8);
+
+   if (y < 0 || x < 0)
+      return 0;
+
+   if (!stbi_write_tga_with_rle) {
+      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
+         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+   } else {
+      int i,j,k;
+      int jend, jdir;
+
+      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+
+      if (stbi__flip_vertically_on_write) {
+         j = 0;
+         jend = y;
+         jdir = 1;
+      } else {
+         j = y-1;
+         jend = -1;
+         jdir = -1;
+      }
+      for (; j != jend; j += jdir) {
+         unsigned char *row = (unsigned char *) data + j * x * comp;
+         int len;
+
+         for (i = 0; i < x; i += len) {
+            unsigned char *begin = row + i * comp;
+            int diff = 1;
+            len = 1;
+
+            if (i < x - 1) {
+               ++len;
+               diff = memcmp(begin, row + (i + 1) * comp, comp);
+               if (diff) {
+                  const unsigned char *prev = begin;
+                  for (k = i + 2; k < x && len < 128; ++k) {
+                     if (memcmp(prev, row + k * comp, comp)) {
+                        prev += comp;
+                        ++len;
+                     } else {
+                        --len;
+                        break;
+                     }
+                  }
+               } else {
+                  for (k = i + 2; k < x && len < 128; ++k) {
+                     if (!memcmp(begin, row + k * comp, comp)) {
+                        ++len;
+                     } else {
+                        break;
+                     }
+                  }
+               }
+            }
+
+            if (diff) {
+               unsigned char header = STBIW_UCHAR(len - 1);
+               s->func(s->context, &header, 1);
+               for (k = 0; k < len; ++k) {
+                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
+               }
+            } else {
+               unsigned char header = STBIW_UCHAR(len - 129);
+               s->func(s->context, &header, 1);
+               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
+            }
+         }
+      }
+   }
+   return 1;
 }
 
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_tga_core(&s, x, y, comp, (void *) data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
+      stbi__end_write_file(&s);
+      return r;
+   } else
+      return 0;
+}
+#endif
+
 // *************************************************************************************************
 // Radiance RGBE HDR writer
 // by Baldur Karlsson
+
 #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
 
-void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
+static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
 {
    int exponent;
    float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 
-   if (maxcomp < 1e-32) {
+   if (maxcomp < 1e-32f) {
       rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
    } else {
       float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
@@ -264,23 +619,23 @@ void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
    }
 }
 
-void stbiw__write_run_data(FILE *f, int length, unsigned char databyte)
+static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
 {
-   unsigned char lengthbyte = (unsigned char) (length+128);
+   unsigned char lengthbyte = STBIW_UCHAR(length+128);
    STBIW_ASSERT(length+128 <= 255);
-   fwrite(&lengthbyte, 1, 1, f);
-   fwrite(&databyte, 1, 1, f);
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, &databyte, 1);
 }
 
-void stbiw__write_dump_data(FILE *f, int length, unsigned char *data)
+static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
 {
-   unsigned char lengthbyte = (unsigned char )(length & 0xff);
+   unsigned char lengthbyte = STBIW_UCHAR(length);
    STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
-   fwrite(&lengthbyte, 1, 1, f);
-   fwrite(data, length, 1, f);
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, data, length);
 }
 
-void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scratch, const float *scanline)
+static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
 {
    unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
    unsigned char rgbe[4];
@@ -293,31 +648,31 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
    /* skip RLE for images too small or large */
    if (width < 8 || width >= 32768) {
       for (x=0; x < width; x++) {
-         switch (comp) {
+         switch (ncomp) {
             case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*comp + 2];
-                    linear[1] = scanline[x*comp + 1];
-                    linear[0] = scanline[x*comp + 0];
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
                     break;
-            case 2: /* fallthrough */
-            case 1: linear[0] = linear[1] = linear[2] = scanline[x*comp + 0];
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
                     break;
          }
          stbiw__linear_to_rgbe(rgbe, linear);
-         fwrite(rgbe, 4, 1, f);
+         s->func(s->context, rgbe, 4);
       }
    } else {
       int c,r;
       /* encode into scratch buffer */
       for (x=0; x < width; x++) {
-         switch(comp) {
+         switch(ncomp) {
             case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*comp + 2];
-                    linear[1] = scanline[x*comp + 1];
-                    linear[0] = scanline[x*comp + 0];
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
                     break;
-            case 2: /* fallthrough */
-            case 1: linear[0] = linear[1] = linear[2] = scanline[x*comp + 0];
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
                     break;
          }
          stbiw__linear_to_rgbe(rgbe, linear);
@@ -327,7 +682,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
          scratch[x + width*3] = rgbe[3];
       }
 
-      fwrite(scanlineheader, 4, 1, f);
+      s->func(s->context, scanlineheader, 4);
 
       /* RLE each component separately */
       for (c=0; c < 4; c++) {
@@ -348,7 +703,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
             while (x < r) {
                int len = r-x;
                if (len > 128) len = 128;
-               stbiw__write_dump_data(f, len, &comp[x]);
+               stbiw__write_dump_data(s, len, &comp[x]);
                x += len;
             }
             // if there's a run, output it
@@ -360,7 +715,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
                while (x < r) {
                   int len = r-x;
                   if (len > 127) len = 127;
-                  stbiw__write_run_data(f, len, comp[x]);
+                  stbiw__write_run_data(s, len, comp[x]);
                   x += len;
                }
             }
@@ -369,28 +724,59 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
    }
 }
 
-int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
 {
-   int i;
-   FILE *f;
-   if (y <= 0 || x <= 0 || data == NULL) return 0;
-   f = fopen(filename, "wb");
-   if (f) {
-      /* Each component is stored separately. Allocate scratch space for full output scanline. */
+   if (y <= 0 || x <= 0 || data == NULL)
+      return 0;
+   else {
+      // Each component is stored separately. Allocate scratch space for full output scanline.
       unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
-      fprintf(f, "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"      );
-      fprintf(f, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n"                 , y, x);
+      int i, len;
+      char buffer[128];
+      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
+      s->func(s->context, header, sizeof(header)-1);
+
+#ifdef STBI_MSC_SECURE_CRT
+      len = sprintf_s(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#else
+      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#endif
+      s->func(s->context, buffer, len);
+
       for(i=0; i < y; i++)
-         stbiw__write_hdr_scanline(f, x, comp, scratch, data + comp*i*x);
+         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
       STBIW_FREE(scratch);
-      fclose(f);
+      return 1;
    }
-   return f != NULL;
 }
 
-/////////////////////////////////////////////////////////
-// PNG
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
+}
 
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
+      stbi__end_write_file(&s);
+      return r;
+   } else
+      return 0;
+}
+#endif // STBI_WRITE_NO_STDIO
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PNG writer
+//
+
+#ifndef STBIW_ZLIB_COMPRESS
 // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
 #define stbiw__sbraw(a) ((int *) (a) - 2)
 #define stbiw__sbm(a)   stbiw__sbraw(a)[0]
@@ -407,7 +793,7 @@ int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *da
 static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
 {
    int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
-   void *p = STBIW_REALLOC(*arr ? stbiw__sbraw(*arr) : 0, itemsize * m + sizeof(int)*2);
+   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
    STBIW_ASSERT(p);
    if (p) {
       if (!*arr) ((int *) p)[1] = 0;
@@ -420,7 +806,7 @@ static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
 static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
 {
    while (*bitcount >= 8) {
-      stbiw__sbpush(data, (unsigned char) *bitbuffer);
+      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
       *bitbuffer >>= 8;
       *bitcount -= 8;
    }
@@ -471,8 +857,14 @@ static unsigned int stbiw__zhash(unsigned char *data)
 
 #define stbiw__ZHASH   16384
 
-unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
+#endif // STBIW_ZLIB_COMPRESS
+
+STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
 {
+#ifdef STBIW_ZLIB_COMPRESS
+   // user provided a zlib compress implementation, use that
+   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
+#else // use builtin
    static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
    static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
    static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
@@ -480,7 +872,9 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
    unsigned int bitbuf=0;
    int i,j, bitcount=0;
    unsigned char *out = NULL;
-   unsigned char **hash_table[stbiw__ZHASH]; // 64KB on the stack!
+   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
+   if (hash_table == NULL)
+      return NULL;
    if (quality < 5) quality = 5;
 
    stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
@@ -552,43 +946,81 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
 
    for (i=0; i < stbiw__ZHASH; ++i)
       (void) stbiw__sbfree(hash_table[i]);
+   STBIW_FREE(hash_table);
 
    {
       // compute adler32 on input
-      unsigned int i=0, s1=1, s2=0, blocklen = data_len % 5552;
-      int j=0;
+      unsigned int s1=1, s2=0;
+      int blocklen = (int) (data_len % 5552);
+      j=0;
       while (j < data_len) {
          for (i=0; i < blocklen; ++i) s1 += data[j+i], s2 += s1;
          s1 %= 65521, s2 %= 65521;
          j += blocklen;
          blocklen = 5552;
       }
-      stbiw__sbpush(out, (unsigned char) (s2 >> 8));
-      stbiw__sbpush(out, (unsigned char) s2);
-      stbiw__sbpush(out, (unsigned char) (s1 >> 8));
-      stbiw__sbpush(out, (unsigned char) s1);
+      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s2));
+      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s1));
    }
    *out_len = stbiw__sbn(out);
    // make returned pointer freeable
    STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
    return (unsigned char *) stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
 }
 
-unsigned int stbiw__crc32(unsigned char *buffer, int len)
+static unsigned int stbiw__crc32(unsigned char *buffer, int len)
 {
-   static unsigned int crc_table[256];
+#ifdef STBIW_CRC32
+    return STBIW_CRC32(buffer, len);
+#else
+   static unsigned int crc_table[256] =
+   {
+      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+   };
+
    unsigned int crc = ~0u;
-   int i,j;
-   if (crc_table[1] == 0)
-      for(i=0; i < 256; i++)
-         for (crc_table[i]=i, j=0; j < 8; ++j)
-            crc_table[i] = (crc_table[i] >> 1) ^ (crc_table[i] & 1 ? 0xedb88320 : 0);
+   int i;
    for (i=0; i < len; ++i)
       crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
    return ~crc;
+#endif
 }
 
-#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=(unsigned char)(a),(o)[1]=(unsigned char)(b),(o)[2]=(unsigned char)(c),(o)[3]=(unsigned char)(d),(o)+=4)
+#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
 #define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
 #define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
 
@@ -601,66 +1033,97 @@ static void stbiw__wpcrc(unsigned char **data, int len)
 static unsigned char stbiw__paeth(int a, int b, int c)
 {
    int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return (unsigned char) a;
-   if (pb <= pc) return (unsigned char) b;
-   return (unsigned char) c;
+   if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
+   if (pb <= pc) return STBIW_UCHAR(b);
+   return STBIW_UCHAR(c);
 }
 
-unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
+// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
 {
+   static int mapping[] = { 0,1,2,3,4 };
+   static int firstmap[] = { 0,1,0,5,6 };
+   int *mymap = (y != 0) ? mapping : firstmap;
+   int i;
+   int type = mymap[filter_type];
+   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
+   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
+    
+   if (type==0) {
+      memcpy(line_buffer, z, width*n);
+      return;
+   }
+
+   // first loop isn't optimized since it's just one pixel    
+   for (i = 0; i < n; ++i) {
+      switch (type) {
+         case 1: line_buffer[i] = z[i]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
+         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
+         case 5: line_buffer[i] = z[i]; break;
+         case 6: line_buffer[i] = z[i]; break;
+      }
+   }
+   switch (type) {
+      case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
+      case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
+      case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
+      case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
+      case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
+      case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
+   }
+}
+
+STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
+{
+   int force_filter = stbi_write_force_png_filter;
    int ctype[5] = { -1, 0, 4, 2, 6 };
    unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
    unsigned char *out,*o, *filt, *zlib;
    signed char *line_buffer;
-   int i,j,k,p,zlen;
+   int j,zlen;
 
    if (stride_bytes == 0)
       stride_bytes = x * n;
 
+   if (force_filter >= 5) {
+      force_filter = -1;
+   }
+
    filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
    line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
    for (j=0; j < y; ++j) {
-      static int mapping[] = { 0,1,2,3,4 };
-      static int firstmap[] = { 0,1,0,5,6 };
-      int *mymap = j ? mapping : firstmap;
-      int best = 0, bestval = 0x7fffffff;
-      for (p=0; p < 2; ++p) {
-         for (k= p?best:0; k < 5; ++k) {
-            int type = mymap[k],est=0;
-            unsigned char *z = pixels + stride_bytes*j;
-            for (i=0; i < n; ++i)
-               switch (type) {
-                  case 0: line_buffer[i] = z[i]; break;
-                  case 1: line_buffer[i] = z[i]; break;
-                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
-                  case 3: line_buffer[i] = z[i] - (z[i-stride_bytes]>>1); break;
-                  case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-stride_bytes],0)); break;
-                  case 5: line_buffer[i] = z[i]; break;
-                  case 6: line_buffer[i] = z[i]; break;
-               }
-            for (i=n; i < x*n; ++i) {
-               switch (type) {
-                  case 0: line_buffer[i] = z[i]; break;
-                  case 1: line_buffer[i] = z[i] - z[i-n]; break;
-                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
-                  case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-stride_bytes])>>1); break;
-                  case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-stride_bytes], z[i-stride_bytes-n]); break;
-                  case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break;
-                  case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
-               }
-            }
-            if (p) break;
-            for (i=0; i < x*n; ++i)
+      int filter_type;
+      if (force_filter > -1) {
+         filter_type = force_filter;
+         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
+      } else { // Estimate the best filter by running through all of them:
+         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+         for (filter_type = 0; filter_type < 5; filter_type++) {
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
+
+            // Estimate the entropy of the line using this filter; the less, the better.
+            est = 0;
+            for (i = 0; i < x*n; ++i) {
                est += abs((signed char) line_buffer[i]);
-            if (est < bestval) { bestval = est; best = k; }
+            }
+            if (est < best_filter_val) {
+               best_filter_val = est;
+               best_filter = filter_type;
+            }
+         }
+         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
+            filter_type = best_filter;
          }
       }
-      // when we get here, best contains the filter type, and line_buffer contains the data
-      filt[j*(x*n+1)] = (unsigned char) best;
+      // when we get here, filter_type contains the filter type, and line_buffer contains the data
+      filt[j*(x*n+1)] = (unsigned char) filter_type;
       STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
    }
    STBIW_FREE(line_buffer);
-   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, 8); // increase 8 to get smaller but use more memory
+   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
    STBIW_FREE(filt);
    if (!zlib) return 0;
 
@@ -676,7 +1139,7 @@ unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, in
    stbiw__wp32(o, x);
    stbiw__wp32(o, y);
    *o++ = 8;
-   *o++ = (unsigned char) ctype[n];
+   *o++ = STBIW_UCHAR(ctype[n]);
    *o++ = 0;
    *o++ = 0;
    *o++ = 0;
@@ -698,22 +1161,403 @@ unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, in
    return out;
 }
 
-int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
 {
    FILE *f;
    int len;
-   unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (!png) return 0;
-   f = fopen(filename, "wb");
+   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
+   if (png == NULL) return 0;  // stbi_write_png_to_mem produced no buffer
+   // Returns 1 after the buffer is written and the file closed, 0 if the file cannot be opened; fwrite's result is not checked.
+   f = stbiw__fopen(filename, "wb");
    if (!f) { STBIW_FREE(png); return 0; }
    fwrite(png, 1, len, f);
    fclose(f);
    STBIW_FREE(png);
    return 1;
 }
+#endif
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
+{  // Encodes the image into a temporary PNG buffer, hands it to func, frees it; returns 0 only when no buffer was produced.
+   int png_size;
+   unsigned char *encoded = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &png_size);
+   if (!encoded) return 0;
+   func(context, encoded, png_size);
+   STBIW_FREE(encoded);
+   return 1;
+}
+
+
+/* ***************************************************************************
+ *
+ * JPEG writer
+ *
+ * This is based on Jon Olick's jo_jpeg.cpp:
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ */
+
+static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
+      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };  // entry i is the output slot for row-major coefficient i (see DU[stbiw__jpg_ZigZag[i]] in stbiw__jpg_processDU)
+
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
+   // bs[0] carries the code bits and bs[1] their count; unsent bits stay left-aligned below bit 24 of the accumulator.
+   int pending = *bitCntP + bs[1];
+   int accum = *bitBufP | (bs[0] << (24 - pending));
+   // Drain whole bytes from the top of the accumulator.
+   for (; pending >= 8; pending -= 8, accum <<= 8) {
+      unsigned char top = (accum >> 16) & 255;
+      stbiw__putc(s, top);
+      if (top == 255) {
+         stbiw__putc(s, 0);   // every 0xFF output byte is followed by a 0x00 byte
+      }
+   }
+   // Hand the leftover (fewer than 8) bits back to the caller.
+   *bitBufP = accum;
+   *bitCntP = pending;
+}
+
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
+   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
+   float z1, z2, z3, z4, z5, z11, z13;
+   // One in-place 8-point pass: all eight inputs are read above and all eight results are stored back through the same pointers.
+   float tmp0 = d0 + d7;   // sums and differences of the mirrored input pairs (0/7, 1/6, 2/5, 3/4)
+   float tmp7 = d0 - d7;
+   float tmp1 = d1 + d6;
+   float tmp6 = d1 - d6;
+   float tmp2 = d2 + d5;
+   float tmp5 = d2 - d5;
+   float tmp3 = d3 + d4;
+   float tmp4 = d3 - d4;
+
+   // Even part
+   float tmp10 = tmp0 + tmp3;   // phase 2
+   float tmp13 = tmp0 - tmp3;
+   float tmp11 = tmp1 + tmp2;
+   float tmp12 = tmp1 - tmp2;
+
+   d0 = tmp10 + tmp11;       // phase 3
+   d4 = tmp10 - tmp11;
+
+   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+   d2 = tmp13 + z1;       // phase 5
+   d6 = tmp13 - z1;
+
+   // Odd part
+   tmp10 = tmp4 + tmp5;       // phase 2 (tmp10..tmp12 are reused here; their even-part values are no longer needed)
+   tmp11 = tmp5 + tmp6;
+   tmp12 = tmp6 + tmp7;
+
+   // The rotator is modified from fig 4-8 to avoid extra negations.
+   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+   z2 = tmp10 * 0.541196100f + z5; // c2-c6
+   z4 = tmp12 * 1.306562965f + z5; // c2+c6
+   z3 = tmp11 * 0.707106781f; // c4
+
+   z11 = tmp7 + z3;      // phase 5
+   z13 = tmp7 - z3;
+
+   *d5p = z13 + z2;         // phase 6
+   *d3p = z13 - z2;
+   *d1p = z11 + z4;
+   *d7p = z11 - z4;
+
+   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6;   // even-indexed results were computed earlier and are stored last
+}
+
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
+   // bits[1] = bit length of |val| (never less than 1); bits[0] = low bits[1] bits of val, or of val-1 when val is negative.
+   int magnitude = (val < 0) ? -val : val;
+   int coded = (val < 0) ? val - 1 : val;
+   unsigned short width = 1;
+   for (magnitude >>= 1; magnitude != 0; magnitude >>= 1) ++width;
+   bits[1] = width;
+   bits[0] = coded & ((1 << width) - 1);
+}
+
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
+   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };   // AC table entry 0x00, written when only zeros remain
+   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };   // AC table entry 0xF0, written once per 16 zeros in a run
+   int dataOff, i, diff, end0pos;
+   int DU[64];
+   // Transforms CDU in place, quantizes it into DU, emits the block through stbiw__jpg_writeBits, and returns DU[0] (callers pass it back as DC for the next block).
+   // DCT rows
+   for(dataOff=0; dataOff<64; dataOff+=8) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+   }
+   // DCT columns
+   for(dataOff=0; dataOff<8; ++dataOff) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+8], &CDU[dataOff+16], &CDU[dataOff+24], &CDU[dataOff+32], &CDU[dataOff+40], &CDU[dataOff+48], &CDU[dataOff+56]);
+   }
+   // Quantize/descale/zigzag the coefficients
+   for(i=0; i<64; ++i) {
+      float v = CDU[i]*fdtbl[i];
+      // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+      // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+      DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);   // the cast truncates toward zero, so adding +/-0.5 first rounds to nearest
+   }
+
+   // Encode DC
+   diff = DU[0] - DC;   // only the change from the DC value passed in is coded
+   if (diff == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+   } else {
+      unsigned short bits[2];
+      stbiw__jpg_calcBits(diff, bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);   // code for the bit length, followed by the value bits
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   // Encode ACs
+   end0pos = 63;
+   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
+   }
+   // end0pos = index of the last nonzero coefficient, or 0 when every AC coefficient is zero
+   if(end0pos == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+      return DU[0];
+   }
+   for(i = 1; i <= end0pos; ++i) {
+      int startpos = i;
+      int nrzeroes;
+      unsigned short bits[2];
+      for (; DU[i]==0 && i<=end0pos; ++i) {   // skip the zero run; DU[end0pos] is nonzero, so i never passes end0pos
+      }
+      nrzeroes = i-startpos;
+      if ( nrzeroes >= 16 ) {
+         int lng = nrzeroes>>4;
+         int nrmarker;
+         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+         nrzeroes &= 15;   // remainder of the run after the 16-zero codes
+      }
+      stbiw__jpg_calcBits(DU[i], bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);   // table index = (zero run << 4) + bit length
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   if(end0pos != 63) {   // EOB is skipped when coefficient 63 is nonzero
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+   }
+   return DU[0];
+}
+
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+   // Emits a JFIF/JPEG stream (headers, tables, entropy-coded 8x8 blocks, EOI) through s; returns 1, or 0 when the arguments are rejected. Tables are function-local to keep them out of the global namespace.
+   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+   static const unsigned char std_ac_luminance_values[] = {
+      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
+      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
+      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
+   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
+   static const unsigned char std_ac_chrominance_values[] = {
+      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
+      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
+      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
+      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
+      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
+      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
+      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   // Huffman tables
+   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
+   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
+   static const unsigned short YAC_HT[256][2] = {
+      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const unsigned short UVAC_HT[256][2] = {
+      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
+                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
+   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
+                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
+   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 
+                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+
+   int row, col, i, k;
+   float fdtbl_Y[64], fdtbl_UV[64];
+   unsigned char YTable[64], UVTable[64];
+   // Dimensions must be positive and fit the two-byte height/width fields written in head1 below; anything else would yield a corrupt stream.
+   if(!data || width < 1 || height < 1 || width > 65535 || height > 65535 || comp > 4 || comp < 1) {
+      return 0;
+   }
+   // quality 0 selects 90; the value is clamped to 1..100 and then converted to a percentage scale applied to YQT/UVQT.
+   quality = quality ? quality : 90;
+   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
+
+   for(i = 0; i < 64; ++i) {
+      int uvti, yti = (YQT[i]*quality+50)/100;
+      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);   // scaled entries are clamped to 1..255
+      uvti = (UVQT[i]*quality+50)/100;
+      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+   }
+   // fdtbl_* = 1 / (quantizer * aasf[row] * aasf[col]); stbiw__jpg_processDU multiplies every transformed coefficient by it.
+   for(row = 0, k = 0; row < 8; ++row) {
+      for(col = 0; col < 8; ++col, ++k) {
+         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+      }
+   }
+
+   // Write Headers
+   {
+      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
+      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
+      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
+                                      3,1,0x11,0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
+      s->func(s->context, (void*)head0, sizeof(head0));
+      s->func(s->context, (void*)YTable, sizeof(YTable));
+      stbiw__putc(s, 1);
+      s->func(s->context, UVTable, sizeof(UVTable));
+      s->func(s->context, (void*)head1, sizeof(head1));
+      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+      stbiw__putc(s, 0x10); // HTYACinfo
+      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+      stbiw__putc(s, 1); // HTUDCinfo
+      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+      stbiw__putc(s, 0x11); // HTUACinfo
+      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+      s->func(s->context, (void*)head2, sizeof(head2));
+   }
+
+   // Encode 8x8 macroblocks
+   {
+      static const unsigned short fillBits[] = {0x7F, 7};
+      const unsigned char *imageData = (const unsigned char *)data;
+      int DCY=0, DCU=0, DCV=0;
+      int bitBuf=0, bitCnt=0;
+      // comp == 2 is grey+alpha (alpha is ignored)
+      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
+      int x, y, pos;
+      for(y = 0; y < height; y += 8) {
+         for(x = 0; x < width; x += 8) {
+            float YDU[64], UDU[64], VDU[64];
+            for(row = y, pos = 0; row < y+8; ++row) {
+               // row >= height => use last input row
+               int clamped_row = (row < height) ? row : height - 1;
+               int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+               for(col = x; col < x+8; ++col, ++pos) {
+                  float r, g, b;
+                  // if col >= width => use pixel from last input column
+                  int p = base_p + ((col < width) ? col : (width-1))*comp;
+
+                  r = imageData[p+0];
+                  g = imageData[p+ofsG];
+                  b = imageData[p+ofsB];
+                  YDU[pos]=+0.29900f*r+0.58700f*g+0.11400f*b-128;
+                  UDU[pos]=-0.16874f*r-0.33126f*g+0.50000f*b;
+                  VDU[pos]=+0.50000f*r-0.41869f*g-0.08131f*b;
+               }
+            }
+            // Each call returns the block's quantized DC value, which is fed back in for the next block of the same channel.
+            DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+            DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+            DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+         }
+      }
+
+      // Do the bit alignment of the EOI marker
+      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+   }
+
+   // EOI
+   stbiw__putc(s, 0xFF);
+   stbiw__putc(s, 0xD9);
+
+   return 1;
+}
+
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
+{  // Routes the encoder's output through the caller's callback and returns the encoder's status unchanged.
+   stbi__write_context writer;
+   stbi__start_write_callbacks(&writer, func, context);
+   return stbi_write_jpg_core(&writer, x, y, comp, data, quality);
+}
+
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+{  // Encodes to the named file; yields 0 when stbi__start_write_file reports failure, otherwise the encoder's status.
+   stbi__write_context writer;
+   int status = 0;
+   if (stbi__start_write_file(&writer, filename)) {
+      status = stbi_write_jpg_core(&writer, x, y, comp, data, quality);
+      stbi__end_write_file(&writer);
+   }
+   return status;
+}
+#endif
+
 #endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
+      1.10  (2019-02-07)
+             support utf8 filenames in Windows; fix warnings and platform ifdefs 
+      1.09  (2018-02-11)
+             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
+      1.08  (2018-01-29)
+             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
+      1.07  (2017-07-24)
+             doc fix
+      1.06 (2017-07-23)
+             writing JPEG (using Jon Olick's code)
+      1.05   ???
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
+      1.02 (2016-04-02)
+             avoid allocating large structures on the stack
+      1.01 (2016-01-16)
+             STBIW_REALLOC_SIZED: support allocators with no realloc support
+             avoid race-condition in crc initialization
+             minor compile issues
+      1.00 (2015-09-14)
+             installable file IO function
+      0.99 (2015-09-13)
+             warning fixes; TGA rle support
       0.98 (2015-04-08)
              added STBIW_MALLOC, STBIW_ASSERT etc
       0.97 (2015-01-18)
@@ -733,3 +1577,45 @@ int stbi_write_png(char const *filename, int x, int y, int comp, const void *dat
              first public release
       0.90   first internal release
 */
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of 
+this software and associated documentation files (the "Software"), to deal in 
+the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
+of the Software, and to permit persons to whom the Software is furnished to do 
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
+software, either in source code form or as a compiled binary, for any purpose, 
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this 
+software dedicate any and all copyright interest in the software to the public 
+domain. We make this dedication for the benefit of the public at large and to 
+the detriment of our heirs and successors. We intend this dedication to be an 
+overt act of relinquishment in perpetuity of all present and future rights to 
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/external/include/tiny_obj_loader.h b/external/include/tiny_obj_loader.h
new file mode 100644
index 00000000..7d0c3844
--- /dev/null
+++ b/external/include/tiny_obj_loader.h
@@ -0,0 +1,3333 @@
+/*
+The MIT License (MIT)
+
+Copyright (c) 2012-Present, Syoyo Fujita and many contributors.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//
+// version 2.0.0 : Add new object oriented API. 1.x API is still provided.
+//                 * Support line primitive.
+//                 * Support points primitive.
+//                 * Support multiple search paths for .mtl(v1 API).
+//                 * Support vertex weight `vw`(as a tinyobj extension)
+//                 * Support escaped whitespace in mtllib
+//                 * Add robust triangulation using Mapbox earcut(TINYOBJLOADER_USE_MAPBOX_EARCUT).
+// version 1.4.0 : Modified ParseTextureNameAndOption API
+// version 1.3.1 : Make ParseTextureNameAndOption API public
+// version 1.3.0 : Separate warning and error message(breaking API of LoadObj)
+// version 1.2.3 : Added color space extension('-colorspace') to tex opts.
+// version 1.2.2 : Parse multiple group names.
+// version 1.2.1 : Added initial support for line('l') primitive(PR #178)
+// version 1.2.0 : Hardened implementation(#175)
+// version 1.1.1 : Support smoothing groups(#162)
+// version 1.1.0 : Support parsing vertex color(#144)
+// version 1.0.8 : Fix parsing `g` tag just after `usemtl`(#138)
+// version 1.0.7 : Support multiple tex options(#126)
+// version 1.0.6 : Add TINYOBJLOADER_USE_DOUBLE option(#124)
+// version 1.0.5 : Ignore `Tr` when `d` exists in MTL(#43)
+// version 1.0.4 : Support multiple filenames for 'mtllib'(#112)
+// version 1.0.3 : Support parsing texture options(#85)
+// version 1.0.2 : Improve parsing speed by about a factor of 2 for large
+// files(#105)
+// version 1.0.1 : Fixes a shape is lost if obj ends with a 'usemtl'(#104)
+// version 1.0.0 : Change data structure. Change license from BSD to MIT.
+//
+
+//
+// Use this in *one* .cc
+//   #define TINYOBJLOADER_IMPLEMENTATION
+//   #include "tiny_obj_loader.h"
+//
+
+#ifndef TINY_OBJ_LOADER_H_
+#define TINY_OBJ_LOADER_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace tinyobj {
+
+// Use `override` when the compiler reports C++11 or later. MSVC keeps __cplusplus at 199711L unless /Zc:__cplusplus is set, so _MSVC_LANG is checked as well.
+#if (__cplusplus > 199711L) || (defined(_MSVC_LANG) && _MSVC_LANG > 199711L)
+#define TINYOBJ_OVERRIDE override
+#else
+#define TINYOBJ_OVERRIDE
+#endif
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#if __has_warning("-Wzero-as-null-pointer-constant")
+#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
+#endif  // __has_warning
+// -Wpadded is silenced unconditionally under clang. NOTE(review): the matching `diagnostic pop` is not in this part of the header — presumably later; confirm.
+#pragma clang diagnostic ignored "-Wpadded"
+
+#endif  // __clang__
+
+// https://en.wikipedia.org/wiki/Wavefront_.obj_file says ...
+//
+//  -blendu on | off                       # set horizontal texture blending
+//  (default on)
+//  -blendv on | off                       # set vertical texture blending
+//  (default on)
+//  -boost real_value                      # boost mip-map sharpness
+//  -mm base_value gain_value              # modify texture map values (default
+//  0 1)
+//                                         #     base_value = brightness,
+//                                         gain_value = contrast
+//  -o u [v [w]]                           # Origin offset             (default
+//  0 0 0)
+//  -s u [v [w]]                           # Scale                     (default
+//  1 1 1)
+//  -t u [v [w]]                           # Turbulence                (default
+//  0 0 0)
+//  -texres resolution                     # texture resolution to create
+//  -clamp on | off                        # only render texels in the clamped
+//  0-1 range (default off)
+//                                         #   When unclamped, textures are
+//                                         repeated across a surface,
+//                                         #   when clamped, only texels which
+//                                         fall within the 0-1
+//                                         #   range are rendered.
+//  -bm mult_value                         # bump multiplier (for bump maps
+//  only)
+//
+//  -imfchan r | g | b | m | l | z         # specifies which channel of the file
+//  is used to
+//                                         # create a scalar or bump texture.
+//                                         r:red, g:green,
+//                                         # b:blue, m:matte, l:luminance,
+//                                         z:z-depth..
+//                                         # (the default for bump is 'l' and
+//                                         for decal is 'm')
+//  bump -imfchan r bumpmap.tga            # says to use the red channel of
+//  bumpmap.tga as the bumpmap
+//
+// For reflection maps...
+//
+//   -type sphere                           # specifies a sphere for a "refl"
+//   reflection map
+//   -type cube_top    | cube_bottom |      # when using a cube map, the texture
+//   file for each
+//         cube_front  | cube_back   |      # side of the cube is specified
+//         separately
+//         cube_left   | cube_right
+//
+// TinyObjLoader extension.
+//
+//   -colorspace SPACE                      # Color space of the texture. e.g.
+//   `sRGB` or `linear`
+//
+
+#ifdef TINYOBJLOADER_USE_DOUBLE
+//#pragma message "using double"
+typedef double real_t;  // scalar type of the real_t members below when TINYOBJLOADER_USE_DOUBLE is defined
+#else
+//#pragma message "using float"
+typedef float real_t;   // otherwise real_t is single precision
+#endif
+
+typedef enum {  // stored in texture_option_t::type
+  TEXTURE_TYPE_NONE = 0,  // default
+  TEXTURE_TYPE_SPHERE = 1,
+  TEXTURE_TYPE_CUBE_TOP = 2,
+  TEXTURE_TYPE_CUBE_BOTTOM = 3,
+  TEXTURE_TYPE_CUBE_FRONT = 4,
+  TEXTURE_TYPE_CUBE_BACK = 5,
+  TEXTURE_TYPE_CUBE_LEFT = 6,
+  TEXTURE_TYPE_CUBE_RIGHT = 7
+} texture_type_t;
+
+struct texture_option_t {  // one member per texture-map flag listed in the comment block above
+  texture_type_t type;      // -type (default TEXTURE_TYPE_NONE)
+  real_t sharpness;         // -boost (default 1.0?)
+  real_t brightness;        // base_value in -mm option (default 0)
+  real_t contrast;          // gain_value in -mm option (default 1)
+  real_t origin_offset[3];  // -o u [v [w]] (default 0 0 0)
+  real_t scale[3];          // -s u [v [w]] (default 1 1 1)
+  real_t turbulence[3];     // -t u [v [w]] (default 0 0 0)
+  int texture_resolution;   // -texres resolution (No default value in the spec.
+                            // We'll use -1)
+  bool clamp;               // -clamp (default false)
+  char imfchan;  // -imfchan (the default for bump is 'l' and for decal is 'm')
+  bool blendu;   // -blendu (default on)
+  bool blendv;   // -blendv (default on)
+  real_t bump_multiplier;  // -bm (for bump maps only, default 1.0)
+  // NOTE(review): this struct declares no initializers, so the defaults quoted above must be assigned elsewhere — confirm.
+  // TinyObjLoader extension (-colorspace)
+  std::string colorspace;  // Explicitly specify color space of stored texel
+                           // value. Usually `sRGB` or `linear` (default empty).
+};
+
+struct material_t {  // one material as parsed from .mtl data
+  std::string name;
+
+  real_t ambient[3];
+  real_t diffuse[3];
+  real_t specular[3];
+  real_t transmittance[3];
+  real_t emission[3];
+  real_t shininess;
+  real_t ior;       // index of refraction
+  real_t dissolve;  // 1 == opaque; 0 == fully transparent
+  // illumination model (see http://www.fileformat.info/format/material/)
+  int illum;
+
+  int dummy;  // Suppress padding warning.
+
+  std::string ambient_texname;             // map_Ka
+  std::string diffuse_texname;             // map_Kd
+  std::string specular_texname;            // map_Ks
+  std::string specular_highlight_texname;  // map_Ns
+  std::string bump_texname;                // map_bump, map_Bump, bump
+  std::string displacement_texname;        // disp
+  std::string alpha_texname;               // map_d
+  std::string reflection_texname;          // refl
+
+  texture_option_t ambient_texopt;
+  texture_option_t diffuse_texopt;
+  texture_option_t specular_texopt;
+  texture_option_t specular_highlight_texopt;
+  texture_option_t bump_texopt;
+  texture_option_t displacement_texopt;
+  texture_option_t alpha_texopt;
+  texture_option_t reflection_texopt;
+
+  // PBR extension
+  // http://exocortex.com/blog/extending_wavefront_mtl_to_support_pbr
+  real_t roughness;            // [0, 1] default 0
+  real_t metallic;             // [0, 1] default 0
+  real_t sheen;                // [0, 1] default 0
+  real_t clearcoat_thickness;  // [0, 1] default 0
+  real_t clearcoat_roughness;  // [0, 1] default 0
+  real_t anisotropy;           // aniso. [0, 1] default 0
+  real_t anisotropy_rotation;  // anisor. [0, 1] default 0
+  real_t pad0;  // NOTE(review): presumably padding, like `dummy` -- confirm
+  std::string roughness_texname;  // map_Pr
+  std::string metallic_texname;   // map_Pm
+  std::string sheen_texname;      // map_Ps
+  std::string emissive_texname;   // map_Ke
+  std::string normal_texname;     // norm. For normal mapping.
+
+  texture_option_t roughness_texopt;
+  texture_option_t metallic_texopt;
+  texture_option_t sheen_texopt;
+  texture_option_t emissive_texopt;
+  texture_option_t normal_texopt;
+
+  int pad2;  // NOTE(review): presumably padding, like `dummy` -- confirm
+
+  std::map<std::string, std::string> unknown_parameter;
+
+#ifdef TINY_OBJ_LOADER_PYTHON_BINDING
+  // Accessors for pybind11: color arrays are exchanged as std::array<double, 3>.
+  std::array<double, 3> GetDiffuse() {  // copy of diffuse[] as doubles
+    std::array<double, 3> values;
+    values[0] = double(diffuse[0]);
+    values[1] = double(diffuse[1]);
+    values[2] = double(diffuse[2]);
+
+    return values;
+  }
+
+  std::array<double, 3> GetSpecular() {  // copy of specular[] as doubles
+    std::array<double, 3> values;
+    values[0] = double(specular[0]);
+    values[1] = double(specular[1]);
+    values[2] = double(specular[2]);
+
+    return values;
+  }
+
+  std::array<double, 3> GetTransmittance() {  // copy of transmittance[]
+    std::array<double, 3> values;
+    values[0] = double(transmittance[0]);
+    values[1] = double(transmittance[1]);
+    values[2] = double(transmittance[2]);
+
+    return values;
+  }
+
+  std::array<double, 3> GetEmission() {  // copy of emission[] as doubles
+    std::array<double, 3> values;
+    values[0] = double(emission[0]);
+    values[1] = double(emission[1]);
+    values[2] = double(emission[2]);
+
+    return values;
+  }
+
+  std::array<double, 3> GetAmbient() {  // copy of ambient[] as doubles
+    std::array<double, 3> values;
+    values[0] = double(ambient[0]);
+    values[1] = double(ambient[1]);
+    values[2] = double(ambient[2]);
+
+    return values;
+  }
+
+  void SetDiffuse(std::array<double, 3> &a) {  // a is converted to real_t
+    diffuse[0] = real_t(a[0]);
+    diffuse[1] = real_t(a[1]);
+    diffuse[2] = real_t(a[2]);
+  }
+
+  void SetAmbient(std::array<double, 3> &a) {  // a is converted to real_t
+    ambient[0] = real_t(a[0]);
+    ambient[1] = real_t(a[1]);
+    ambient[2] = real_t(a[2]);
+  }
+
+  void SetSpecular(std::array<double, 3> &a) {  // a is converted to real_t
+    specular[0] = real_t(a[0]);
+    specular[1] = real_t(a[1]);
+    specular[2] = real_t(a[2]);
+  }
+
+  void SetTransmittance(std::array<double, 3> &a) {  // converted to real_t
+    transmittance[0] = real_t(a[0]);
+    transmittance[1] = real_t(a[1]);
+    transmittance[2] = real_t(a[2]);
+  }
+
+  // Looks `key` up in unknown_parameter; an absent key yields an empty string.
+  std::string GetCustomParameter(const std::string &key) {
+    std::map<std::string, std::string>::const_iterator it =
+        unknown_parameter.find(key);
+
+    if (it != unknown_parameter.end()) {
+      return it->second;
+    }
+    return std::string();
+  }
+
+#endif
+};
+
+struct tag_t {  // element type of mesh_t::tags (SubD tag)
+  std::string name;
+
+  std::vector<int> intValues;
+  std::vector<real_t> floatValues;
+  std::vector<std::string> stringValues;
+};
+
+struct joint_and_weight_t {  // element type of skin_weight_t::weightValues
+  int joint_id;
+  real_t weight;
+};
+
+struct skin_weight_t {
+  int vertex_id;  // Corresponding vertex index in `attrib_t::vertices`.
+                  // Unlike `index_t`, this index must be non-negative and
+                  // starts at 0 (relative indexing is not allowed).
+  std::vector<joint_and_weight_t> weightValues;
+};
+
+// Index struct to support different indices for vtx/normal/texcoord.
+// A value of -1 means the component is not used.
+struct index_t {
+  int vertex_index;
+  int normal_index;
+  int texcoord_index;
+};
+
+struct mesh_t {
+  std::vector<index_t> indices;
+  std::vector<unsigned char>
+      num_face_vertices;          // The number of vertices per
+                                  // face. 3 = triangle, 4 = quad,
+                                  // ... Up to 255 vertices per face.
+  std::vector<int> material_ids;  // per-face material ID
+  std::vector<unsigned int> smoothing_group_ids;  // per-face smoothing group
+                                                  // ID (0 = off, positive
+                                                  // value = group id)
+  std::vector<tag_t> tags;                        // SubD tag
+};
+
+// struct path_t {
+//  std::vector<int> indices;  // pairs of indices for lines
+//};
+
+struct lines_t {
+  // Linear flattened indices.
+  std::vector<index_t> indices;        // indices for vertices (polylines)
+  std::vector<int> num_line_vertices;  // The number of vertices per line.
+};
+
+struct points_t {
+  std::vector<index_t> indices;  // indices for points (flat list)
+};
+
+struct shape_t {  // a named bundle of mesh, line and point primitives
+  std::string name;
+  mesh_t mesh;
+  lines_t lines;
+  points_t points;
+};
+
+// Vertex attributes, stored as flat arrays of real_t.
+struct attrib_t {
+  std::vector<real_t> vertices;  // 'v'(xyz)
+
+  // For backward compatibility, we store vertex weight in separate array.
+  std::vector<real_t> vertex_weights;  // 'v'(w)
+  std::vector<real_t> normals;         // 'vn'
+  std::vector<real_t> texcoords;       // 'vt'(uv)
+
+  // For backward compatibility, we store texture coordinate 'w' in separate
+  // array.
+  std::vector<real_t> texcoord_ws;  // 'vt'(w)
+  std::vector<real_t> colors;       // extension: vertex colors
+
+  //
+  // TinyObj extension.
+  //
+
+  // NOTE(syoyo): array index is based on the appearance order.
+  // To get the skin weight for a specific vertex id `vid`, build a lookup
+  // table keyed on `skin_weight_t::vertex_id` (== `vid`),
+  // e.g. using std::map or std::unordered_map.
+  std::vector<skin_weight_t> skin_weights;
+
+  attrib_t() {}
+
+  //
+  // For pybind11
+  //
+  const std::vector<real_t> &GetVertices() const { return vertices; }
+
+  const std::vector<real_t> &GetVertexWeights() const { return vertex_weights; }
+};
+
+struct callback_t {
+  // W is optional and set to 1 if there is no `w` item in `v` line
+  void (*vertex_cb)(void *user_data, real_t x, real_t y, real_t z, real_t w);
+  void (*normal_cb)(void *user_data, real_t x, real_t y, real_t z);
+
+  // y and z are optional and set to 0 if there is no `y` and/or `z` item(s) in
+  // `vt` line.
+  void (*texcoord_cb)(void *user_data, real_t x, real_t y, real_t z);
+
+  // called per 'f' line. num_indices is the number of face indices(e.g. 3 for
+  // triangle, 4 for quad)
+  // 0 will be passed for undefined index in index_t members.
+  void (*index_cb)(void *user_data, index_t *indices, int num_indices);
+  // `name` = material name, `material_id` = the array index into
+  // material_t[], or -1 if the material was not found in the .mtl
+  // (called per `usemtl` statement, going by the callback's name).
+  void (*usemtl_cb)(void *user_data, const char *name, int material_id);
+  // `materials` = parsed material data.
+  void (*mtllib_cb)(void *user_data, const material_t *materials,
+                    int num_materials);
+  // There may be multiple group names
+  void (*group_cb)(void *user_data, const char **names, int num_names);
+  void (*object_cb)(void *user_data, const char *name);
+
+  callback_t()  // every callback starts out as NULL
+      : vertex_cb(NULL),
+        normal_cb(NULL),
+        texcoord_cb(NULL),
+        index_cb(NULL),
+        usemtl_cb(NULL),
+        mtllib_cb(NULL),
+        group_cb(NULL),
+        object_cb(NULL) {}
+};
+
+class MaterialReader {  // abstract interface; see the concrete readers below
+ public:
+  MaterialReader() {}
+  virtual ~MaterialReader();
+
+  virtual bool operator()(const std::string &matId,
+                          std::vector<material_t> *materials,
+                          std::map<std::string, int> *matMap, std::string *warn,
+                          std::string *err) = 0;
+};
+
+///
+/// Read .mtl from a file.
+///
+class MaterialFileReader : public MaterialReader {
+ public:
+  // Path may contain a separator (';' on Windows, ':' on POSIX)
+  explicit MaterialFileReader(const std::string &mtl_basedir)
+      : m_mtlBaseDir(mtl_basedir) {}
+  virtual ~MaterialFileReader() TINYOBJ_OVERRIDE {}
+  virtual bool operator()(const std::string &matId,
+                          std::vector<material_t> *materials,
+                          std::map<std::string, int> *matMap, std::string *warn,
+                          std::string *err) TINYOBJ_OVERRIDE;
+
+ private:
+  std::string m_mtlBaseDir;  // copy of the constructor's mtl_basedir
+};
+
+///
+/// Read .mtl from a stream.
+///
+class MaterialStreamReader : public MaterialReader {
+ public:
+  explicit MaterialStreamReader(std::istream &inStream)
+      : m_inStream(inStream) {}
+  virtual ~MaterialStreamReader() TINYOBJ_OVERRIDE {}
+  virtual bool operator()(const std::string &matId,
+                          std::vector<material_t> *materials,
+                          std::map<std::string, int> *matMap, std::string *warn,
+                          std::string *err) TINYOBJ_OVERRIDE;
+
+ private:
+  std::istream &m_inStream;  // held by reference: must outlive this reader
+};
+
+// v2 API
+struct ObjReaderConfig {
+  bool triangulate;  // triangulate polygon? (default true)
+
+  // Currently not used.
+  // "simple" or empty: Create triangle fan
+  // "earcut": Use the algorithm based on Ear clipping
+  std::string triangulation_method;
+
+  /// Parse vertex color.
+  /// If vertex color is not present, it is filled with a default value.
+  /// false = no vertex color (default true)
+  /// This will increase memory of parsed .obj
+  bool vertex_color;
+
+  ///
+  /// Search path to .mtl file.
+  /// Default = "" = search from the same directory of .obj file.
+  /// Valid only when loading .obj from a file.
+  ///
+  std::string mtl_search_path;
+
+  ObjReaderConfig()
+      : triangulate(true), triangulation_method("simple"), vertex_color(true) {}
+};
+
+///
+/// Wavefront .obj reader class(v2 API)
+///
+class ObjReader {
+ public:
+  ObjReader() : valid_(false) {}
+
+  ///
+  /// Load .obj and .mtl from a file.
+  ///
+  /// @param[in] filename wavefront .obj filename
+  /// @param[in] config Reader configuration
+  ///
+  bool ParseFromFile(const std::string &filename,
+                     const ObjReaderConfig &config = ObjReaderConfig());
+
+  ///
+  /// Parse .obj from a text string.
+  /// Need to supply .mtl text string by `mtl_text`.
+  /// This function ignores `mtllib` line in .obj text.
+  ///
+  /// @param[in] obj_text wavefront .obj text
+  /// @param[in] mtl_text wavefront .mtl text
+  /// @param[in] config Reader configuration
+  ///
+  bool ParseFromString(const std::string &obj_text, const std::string &mtl_text,
+                       const ObjReaderConfig &config = ObjReaderConfig());
+
+  ///
+  /// .obj was loaded or parsed correctly.
+  ///
+  bool Valid() const { return valid_; }
+
+  const attrib_t &GetAttrib() const { return attrib_; }
+
+  const std::vector<shape_t> &GetShapes() const { return shapes_; }
+
+  const std::vector<material_t> &GetMaterials() const { return materials_; }
+
+  ///
+  /// Warning message (may be filled after `ParseFromFile`/`ParseFromString`)
+  ///
+  const std::string &Warning() const { return warning_; }
+
+  ///
+  /// Error message (filled when `ParseFromFile`/`ParseFromString` failed)
+  ///
+  const std::string &Error() const { return error_; }
+
+ private:
+  bool valid_;  // false until set by a parse call; see Valid()
+
+  attrib_t attrib_;
+  std::vector<shape_t> shapes_;
+  std::vector<material_t> materials_;
+
+  std::string warning_;
+  std::string error_;
+};
+
+/// ==>>========= Legacy v1 API =============================================
+
+/// Loads .obj from a file.
+/// 'attrib', 'shapes' and 'materials' will be filled with the parsed vertex
+/// attributes, shapes and materials, respectively.
+/// Returns true when the .obj was loaded successfully.
+/// Returns the warning message in `warn` and the error message in `err`.
+/// 'mtl_basedir' is optional, and is used as the base directory for .mtl files.
+/// By default (`NULL`), the .mtl file is searched for in the application's
+/// working directory.
+/// 'triangulate' is optional, and specifies whether polygon faces in the .obj
+/// are triangulated.
+/// Option 'default_vcols_fallback' specifies whether vertex colors should
+/// always be defined, even if no colors are given (fallback to white).
+bool LoadObj(attrib_t *attrib, std::vector<shape_t> *shapes,
+             std::vector<material_t> *materials, std::string *warn,
+             std::string *err, const char *filename,
+             const char *mtl_basedir = NULL, bool triangulate = true,
+             bool default_vcols_fallback = true);
+
+/// Loads .obj from a stream with custom user callbacks.
+/// .mtl is loaded as usual and parsed material_t data will be passed to
+/// `callback.mtllib_cb`.
+/// Returns true when the .obj/.mtl was loaded successfully.
+/// Returns the warning message in `warn` and the error message in `err`.
+/// See `examples/callback_api/` for how to use this function.
+bool LoadObjWithCallback(std::istream &inStream, const callback_t &callback,
+                         void *user_data = NULL,
+                         MaterialReader *readMatFn = NULL,
+                         std::string *warn = NULL, std::string *err = NULL);
+
+/// Loads object from a std::istream, uses `readMatFn` to retrieve
+/// std::istream for materials.
+/// Returns true when the .obj was loaded successfully.
+/// Returns the warning message in `warn` and the error message in `err`.
+bool LoadObj(attrib_t *attrib, std::vector<shape_t> *shapes,
+             std::vector<material_t> *materials, std::string *warn,
+             std::string *err, std::istream *inStream,
+             MaterialReader *readMatFn = NULL, bool triangulate = true,
+             bool default_vcols_fallback = true);
+
+/// Loads materials into std::map
+void LoadMtl(std::map<std::string, int> *material_map,
+             std::vector<material_t> *materials, std::istream *inStream,
+             std::string *warning, std::string *err);
+
+///
+/// Parse texture name and texture option for custom texture parameter through
+/// material::unknown_parameter
+///
+/// @param[out] texname Parsed texture name
+/// @param[out] texopt Parsed texopt
+/// @param[in] linebuf Input string
+///
+bool ParseTextureNameAndOption(std::string *texname, texture_option_t *texopt,
+                               const char *linebuf);
+
+/// =<<========== Legacy v1 API =============================================
+
+}  // namespace tinyobj
+
+#endif  // TINY_OBJ_LOADER_H_
+
+#ifdef TINYOBJLOADER_IMPLEMENTATION
+#include <cassert>
+#include <cctype>
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <limits>
+#include <set>
+#include <sstream>
+#include <utility>
+
+#ifdef TINYOBJLOADER_USE_MAPBOX_EARCUT
+
+#ifdef TINYOBJLOADER_DONOT_INCLUDE_MAPBOX_EARCUT
+// Assume earcut.hpp is included outside of tiny_obj_loader.h
+#else
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Weverything"
+#endif
+
+#include <array>
+#include "mapbox/earcut.hpp"
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#endif
+
+#endif  // TINYOBJLOADER_USE_MAPBOX_EARCUT
+
+namespace tinyobj {
+
+MaterialReader::~MaterialReader() {}  // out-of-line virtual destructor body
+
+struct vertex_index_t {  // v/vt/vn index triple; default-constructed as -1
+  int v_idx, vt_idx, vn_idx;
+  vertex_index_t() : v_idx(-1), vt_idx(-1), vn_idx(-1) {}
+  explicit vertex_index_t(int idx) : v_idx(idx), vt_idx(idx), vn_idx(idx) {}
+  vertex_index_t(int vidx, int vtidx, int vnidx)
+      : v_idx(vidx), vt_idx(vtidx), vn_idx(vnidx) {}
+};
+
+// Internal data structure for face representation
+// index + smoothing group.
+struct face_t {
+  unsigned int
+      smoothing_group_id;  // smoothing group id. 0 = smoothing group is off.
+  int pad_;
+  std::vector<vertex_index_t> vertex_indices;  // face vertex indices.
+
+  face_t() : smoothing_group_id(0), pad_(0) {}
+};
+
+// Internal data structure for line representation
+struct __line_t {
+  // l v1/vt1 v2/vt2 ...
+  // In the specification, a line primitive does not have a normal index, but
+  // TinyObjLoader allows it
+  std::vector<vertex_index_t> vertex_indices;
+};
+
+// Internal data structure for points representation
+struct __points_t {
+  // p v1 v2 ...
+  // In the specification, a point primitive has neither a normal index nor a
+  // texture coord index, but TinyObjLoader allows them.
+  std::vector<vertex_index_t> vertex_indices;
+};
+
+struct tag_sizes {  // the three counts returned by parseTagTriple()
+  tag_sizes() : num_ints(0), num_reals(0), num_strings(0) {}
+  int num_ints;
+  int num_reals;
+  int num_strings;
+};
+
+struct obj_shape {  // NOTE(review): no use visible in this part of the file
+  std::vector<real_t> v;   // presumably 'v' data -- confirm
+  std::vector<real_t> vn;  // presumably 'vn' data -- confirm
+  std::vector<real_t> vt;  // presumably 'vt' data -- confirm
+};
+
+//
+// Collects the primitives (faces, lines, points) gathered for one group.
+struct PrimGroup {
+  std::vector<face_t> faceGroup;
+  std::vector<__line_t> lineGroup;
+  std::vector<__points_t> pointsGroup;
+
+  void clear() {  // drops every collected primitive
+    pointsGroup.clear();
+    lineGroup.clear();
+    faceGroup.clear();
+  }
+
+  bool IsEmpty() const {  // true when no face, line or point is held
+    return !(!faceGroup.empty() || !lineGroup.empty() || !pointsGroup.empty());
+  }
+
+  // TODO(syoyo): bspline, surface, ...
+};
+
+// See
+// http://stackoverflow.com/questions/6089231/getting-std-ifstream-to-handle-lf-cr-and-crlf
+static std::istream &safeGetline(std::istream &is, std::string &t) {
+  t.clear();
+
+  // Reads one line into `t`, accepting LF, CR and CRLF line endings (the
+  // terminator itself is not stored). Characters are pulled directly from
+  // the std::streambuf, which is faster than going through std::istream one
+  // character at a time; such direct access has to be guarded by a sentry
+  // object, which takes care of updating the stream state.
+
+  std::istream::sentry guard(is, true);
+  std::streambuf *buf = is.rdbuf();
+  if (!guard) {
+    return is;
+  }
+
+  for (;;) {
+    const int ch = buf->sbumpc();
+    if (ch == '\n') {
+      return is;
+    }
+    if (ch == '\r') {
+      // Treat CRLF as a single terminator by also consuming the LF.
+      if (buf->sgetc() == '\n') buf->sbumpc();
+      return is;
+    }
+    if (ch == EOF) {
+      // Also handle the case when the last line has no line ending
+      if (t.empty()) is.setstate(std::ios::eofbit);
+      return is;
+    }
+    t += static_cast<char>(ch);
+  }
+}
+
+#define IS_SPACE(x) (((x) == ' ') || ((x) == '\t'))  // blank or tab
+#define IS_DIGIT(x) \
+  (static_cast<unsigned int>((x) - '0') < static_cast<unsigned int>(10))  // decimal digit
+#define IS_NEW_LINE(x) (((x) == '\r') || ((x) == '\n') || ((x) == '\0'))  // CR, LF or NUL
+
+// Make index zero-base, and also support relative index.
+//
+//   idx  index as written in the file: a positive value is a 1-based
+//        absolute index, a negative value is relative to `n`.
+//   n    element count used as the base for relative indices.
+//   ret  receives the zero-based index on success.
+//
+// Returns false when `ret` is NULL or when `idx` is 0, which is not allowed
+// according to the spec. The result is not range checked, so a relative
+// index larger in magnitude than `n` produces a negative value.
+static inline bool fixIndex(int idx, int n, int *ret) {
+  if (!ret || idx == 0) {
+    return false;
+  }
+
+  if (idx > 0) {
+    (*ret) = idx - 1;  // absolute: 1-based -> 0-based
+  } else {
+    (*ret) = n + idx;  // negative value = relative
+  }
+
+  return true;
+}
+
+static inline std::string parseString(const char **token) {
+  // Returns the next word (text up to a blank, tab or CR) after skipping
+  // leading blanks and tabs, and leaves *token just past that word.
+  const char *first = (*token) + strspn((*token), " \t");
+  const char *last = first + strcspn(first, " \t\r");
+  (*token) = last;
+  return std::string(first, last);
+}
+
+static inline int parseInt(const char **token) {
+  const char *word = (*token) + strspn((*token), " \t");  // skip blanks
+  const int value = atoi(word);  // 0 when the word is not a number
+  (*token) = word + strcspn(word, " \t\r");  // step past the whole word
+  return value;
+}
+
+// Tries to parse a floating point number located at s.
+//
+// s_end should be a location in the string where reading should absolutely
+// stop. For example at the end of the string, to prevent buffer overflows.
+//
+// Parses the following EBNF grammar:
+//   sign    = "+" | "-" ;
+//   END     = ? anything not in digit ?
+//   digit   = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
+//   integer = [sign] , digit , {digit} ;
+//   decimal = integer , ["." , integer] ;
+//   float   = ( decimal , END ) | ( decimal , ("E" | "e") , integer , END ) ;
+// In addition, the integer part may be omitted, as in `.7e+2` or `-.5234`.
+//  Valid strings are for example:
+//   -0  +3.1417e+2  -0.0E-3  1.0324  -1.41   11e2
+//
+// If the parsing is a success, result is set to the parsed value and true
+// is returned. On failure false is returned and result is left untouched.
+//
+// The function is greedy and will parse until any of the following happens:
+//  - a non-conforming character is encountered.
+//  - s_end is reached.
+//
+// The following situations trigger a failure:
+//  - s >= s_end.
+//  - parse failure, including an exponent that does not fit in an int.
+//
+static bool tryParseDouble(const char *s, const char *s_end, double *result) {
+  if (s >= s_end) {
+    return false;
+  }
+
+  double mantissa = 0.0;
+  // `exponent` holds the decimal (base 10) exponent read from the input.
+  // The final value is assembled with ldexp(), which scales by a power of
+  // two, so 10^E is split into 5^E * 2^E: the mantissa is multiplied by
+  // 5^E and ldexp() supplies the remaining 2^E factor (see `assemble`
+  // below).
+  // An exponent of 0 (also the case when no exponent is given) skips that
+  // step and uses the mantissa as is.
+  int exponent = 0;
+
+  // NOTE: THESE MUST BE DECLARED HERE SINCE WE ARE NOT ALLOWED
+  // TO JUMP OVER DEFINITIONS.
+  char sign = '+';
+  char exp_sign = '+';
+  char const *curr = s;
+
+  // How many characters were read in a loop.
+  int read = 0;
+  // Tells whether a loop terminated due to reaching s_end.
+  bool end_not_reached = false;
+  bool leading_decimal_dots = false;
+
+  /*
+          BEGIN PARSING.
+  */
+
+  // Find out what sign we've got.
+  if (*curr == '+' || *curr == '-') {
+    sign = *curr;
+    curr++;
+    if ((curr != s_end) && (*curr == '.')) {
+      // accept. Something like `.7e+2`, `-.5234`
+      leading_decimal_dots = true;
+    }
+  } else if (IS_DIGIT(*curr)) { /* Pass through. */
+  } else if (*curr == '.') {
+    // accept. Something like `.7e+2`, `-.5234`
+    leading_decimal_dots = true;
+  } else {
+    goto fail;
+  }
+
+  // Read the integer part.
+  end_not_reached = (curr != s_end);
+  if (!leading_decimal_dots) {
+    while (end_not_reached && IS_DIGIT(*curr)) {
+      mantissa *= 10;
+      mantissa += static_cast<int>(*curr - 0x30);
+      curr++;
+      read++;
+      end_not_reached = (curr != s_end);
+    }
+
+    // We must make sure we actually got something.
+    if (read == 0) goto fail;
+  }
+
+  // We allow numbers of form "#", "###" etc.
+  if (!end_not_reached) goto assemble;
+
+  // Read the decimal part.
+  if (*curr == '.') {
+    curr++;
+    read = 1;
+    end_not_reached = (curr != s_end);
+    while (end_not_reached && IS_DIGIT(*curr)) {
+      static const double pow_lut[] = {
+          1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001,
+      };
+      const int lut_entries = sizeof pow_lut / sizeof pow_lut[0];
+
+      // NOTE: Don't use powf here, it will absolutely murder precision.
+      mantissa += static_cast<int>(*curr - 0x30) *
+                  (read < lut_entries ? pow_lut[read] : std::pow(10.0, -read));
+      read++;
+      curr++;
+      end_not_reached = (curr != s_end);
+    }
+  } else if (*curr == 'e' || *curr == 'E') {
+  } else {
+    goto assemble;
+  }
+
+  if (!end_not_reached) goto assemble;
+
+  // Read the exponent part.
+  if (*curr == 'e' || *curr == 'E') {
+    curr++;
+    // Figure out if a sign is present and if it is.
+    end_not_reached = (curr != s_end);
+    if (end_not_reached && (*curr == '+' || *curr == '-')) {
+      exp_sign = *curr;
+      curr++;
+    } else if (end_not_reached && IS_DIGIT(*curr)) { /* Pass through. */
+    } else {
+      // Empty E is not allowed.
+      goto fail;
+    }
+
+    read = 0;
+    end_not_reached = (curr != s_end);
+    while (end_not_reached && IS_DIGIT(*curr)) {
+      // Hardcode INT_MAX (2147483647) to sidestep MSVC's min/max macros.
+      const int digit = static_cast<int>(*curr - 0x30);
+      if (exponent > (2147483647 - digit) / 10) {
+        // exponent * 10 + digit would overflow int (undefined behavior).
+        goto fail;
+      }
+      exponent *= 10;
+      exponent += digit;
+      curr++;
+      read++;
+      end_not_reached = (curr != s_end);
+    }
+    exponent *= (exp_sign == '+' ? 1 : -1);
+    if (read == 0) goto fail;
+  }
+
+assemble:
+  *result = (sign == '+' ? 1 : -1) *
+            (exponent ? std::ldexp(mantissa * std::pow(5.0, exponent), exponent)
+                      : mantissa);
+  return true;
+fail:
+  return false;
+}
+
+static inline real_t parseReal(const char **token, double default_value = 0.0) {
+  const char *begin = (*token) + strspn((*token), " \t");
+  const char *end = begin + strcspn(begin, " \t\r");
+  // `parsed` keeps default_value when the word is not a valid number.
+  double parsed = default_value;
+  tryParseDouble(begin, end, &parsed);
+  (*token) = end;
+  return static_cast<real_t>(parsed);
+}
+
+static inline bool parseReal(const char **token, real_t *out) {
+  const char *begin = (*token) + strspn((*token), " \t");
+  const char *end = begin + strcspn(begin, " \t\r");
+  (*token) = end;  // the word is consumed whether or not it parses
+
+  double parsed;
+  if (!tryParseDouble(begin, end, &parsed)) {
+    return false;  // *out is left untouched on failure
+  }
+  (*out) = static_cast<real_t>(parsed);
+  return true;
+}
+
+static inline void parseReal2(real_t *x, real_t *y, const char **token,
+                              const double default_x = 0.0,
+                              const double default_y = 0.0) {
+  *x = parseReal(token, default_x);  // 1st word, or default_x if not a number
+  *y = parseReal(token, default_y);  // 2nd word, or default_y if not a number
+}
+
+static inline void parseReal3(real_t *x, real_t *y, real_t *z,
+                              const char **token, const double default_x = 0.0,
+                              const double default_y = 0.0,
+                              const double default_z = 0.0) {
+  real_t *const dst[3] = {x, y, z};
+  const double fallback[3] = {default_x, default_y, default_z};
+  for (int i = 0; i < 3; i++) *dst[i] = parseReal(token, fallback[i]);
+}
+
+static inline void parseV(real_t *x, real_t *y, real_t *z, real_t *w,
+                          const char **token, const double default_x = 0.0,
+                          const double default_y = 0.0,
+                          const double default_z = 0.0,
+                          const double default_w = 1.0) {
+  // Words are consumed in x, y, z, w order; each falls back to its default.
+  real_t *const dst[4] = {x, y, z, w};
+  const double fallback[4] = {default_x, default_y, default_z, default_w};
+  for (int i = 0; i < 4; i++) *dst[i] = parseReal(token, fallback[i]);
+}
+
+// Extension: parse vertex with colors(6 items). Returns true when all three
+// color components were parsed; otherwise the color is set to white (1, 1, 1).
+static inline bool parseVertexWithColor(real_t *x, real_t *y, real_t *z,
+                                        real_t *r, real_t *g, real_t *b,
+                                        const char **token,
+                                        const double default_x = 0.0,
+                                        const double default_y = 0.0,
+                                        const double default_z = 0.0) {
+  *x = parseReal(token, default_x);
+  *y = parseReal(token, default_y);
+  *z = parseReal(token, default_z);
+
+  // && short-circuits, so parsing stops at the first component that fails.
+  if (parseReal(token, r) && parseReal(token, g) && parseReal(token, b)) {
+    return true;
+  }
+  // Incomplete color: discard whatever was read.
+  (*r) = (*g) = (*b) = 1.0;
+  return false;
+}
+
+static inline bool parseOnOff(const char **token, bool default_value = true) {
+  // Reads the next word and maps a leading "on"/"off" to true/false; any
+  // other word yields default_value. The whole word is consumed either way.
+  const char *word = (*token) + strspn((*token), " \t");
+  (*token) = word + strcspn(word, " \t\r");
+  // Only the prefix is compared, so e.g. "only" also counts as "on".
+  if (0 == strncmp(word, "on", 2)) {
+    return true;
+  }
+  if (0 == strncmp(word, "off", 3)) {
+    return false;
+  }
+  return default_value;
+}
+
+static inline texture_type_t parseTextureType(
+    const char **token, texture_type_t default_value = TEXTURE_TYPE_NONE) {
+  // Keyword table, tested in order; a keyword matches when the next word
+  // starts with it. Unknown words yield default_value.
+  static const struct {
+    const char *keyword;
+    texture_type_t type;
+  } kKeywords[] = {
+      {"cube_top", TEXTURE_TYPE_CUBE_TOP},
+      {"cube_bottom", TEXTURE_TYPE_CUBE_BOTTOM},
+      {"cube_left", TEXTURE_TYPE_CUBE_LEFT},
+      {"cube_right", TEXTURE_TYPE_CUBE_RIGHT},
+      {"cube_front", TEXTURE_TYPE_CUBE_FRONT},
+      {"cube_back", TEXTURE_TYPE_CUBE_BACK},
+      {"sphere", TEXTURE_TYPE_SPHERE},
+  };
+  const char *word = (*token) + strspn((*token), " \t");
+  (*token) = word + strcspn(word, " \t\r");  // always consume the word
+
+  for (size_t i = 0; i < sizeof(kKeywords) / sizeof(kKeywords[0]); i++) {
+    const size_t len = strlen(kKeywords[i].keyword);
+    if (0 == strncmp(word, kKeywords[i].keyword, len)) return kKeywords[i].type;
+  }
+  return default_value;
+}
+
+static tag_sizes parseTagTriple(const char **token) {
+  // Parses `ints[/reals[/strings]]`; counts that are absent stay 0.
+  tag_sizes sizes;
+  const char *cur = (*token);
+
+  cur += strspn(cur, " \t");
+  sizes.num_ints = atoi(cur);
+  cur += strcspn(cur, "/ \t\r");
+
+  if (cur[0] == '/') {
+    cur++;  // Skip '/'
+    cur += strspn(cur, " \t");
+    sizes.num_reals = atoi(cur);
+    cur += strcspn(cur, "/ \t\r");
+
+    if (cur[0] == '/') {
+      cur++;  // Skip '/'
+      sizes.num_strings = parseInt(&cur);
+    }
+  }
+
+  (*token) = cur;
+  return sizes;
+}
+
+// Parse triples with index offsets: i, i/j/k, i//k, i/j
+// (i = v_idx, j = vt_idx, k = vn_idx). Each index present is converted
+// with fixIndex() against vsize / vtsize / vnsize respectively; components
+// that are absent stay -1. On success *ret receives the result and *token is
+// left just past the triple. Returns false if `ret` is NULL or an index is
+// rejected by fixIndex().
+static bool parseTriple(const char **token, int vsize, int vnsize, int vtsize,
+                        vertex_index_t *ret) {
+  if (!ret) {
+    return false;
+  }
+
+  const char *&cur = (*token);  // alias: advancing cur advances the caller's
+                                // cursor
+  vertex_index_t parsed(-1);
+
+  // Index `i` (always present).
+  if (!fixIndex(atoi(cur), vsize, &(parsed.v_idx))) {
+    return false;
+  }
+  cur += strcspn(cur, "/ \t\r");
+
+  if (cur[0] == '/') {
+    cur++;
+    if (cur[0] == '/') {
+      // i//k
+      cur++;
+      if (!fixIndex(atoi(cur), vnsize, &(parsed.vn_idx))) {
+        return false;
+      }
+      cur += strcspn(cur, "/ \t\r");
+    } else {
+      // i/j/k or i/j
+      if (!fixIndex(atoi(cur), vtsize, &(parsed.vt_idx))) {
+        return false;
+      }
+      cur += strcspn(cur, "/ \t\r");
+
+      if (cur[0] == '/') {
+        // i/j/k
+        cur++;  // skip '/'
+        if (!fixIndex(atoi(cur), vnsize, &(parsed.vn_idx))) {
+          return false;
+        }
+        cur += strcspn(cur, "/ \t\r");
+      }
+    }
+  }
+
+  // Everything parsed: publish the result.
+  (*ret) = parsed;
+  return true;
+}
+
+// Parse raw triples: i, i/j/k, i//k, i/j
+// Indices are returned exactly as written (no conversion to zero-based);
+// components that are absent stay 0, which is an invalid index in OBJ.
+static vertex_index_t parseRawTriple(const char **token) {
+  const char *&cur = (*token);  // alias of the caller's cursor
+  vertex_index_t raw(static_cast<int>(0));
+
+  raw.v_idx = atoi(cur);
+  cur += strcspn(cur, "/ \t\r");
+  if (cur[0] != '/') {
+    return raw;  // i
+  }
+  cur++;
+
+  if (cur[0] == '/') {
+    // i//k
+    cur++;
+    raw.vn_idx = atoi(cur);
+  } else {
+    // i/j/k or i/j
+    raw.vt_idx = atoi(cur);
+    cur += strcspn(cur, "/ \t\r");
+    if (cur[0] != '/') {
+      return raw;  // i/j
+    }
+    cur++;  // skip '/'
+    raw.vn_idx = atoi(cur);  // i/j/k
+  }
+
+  cur += strcspn(cur, "/ \t\r");
+  return raw;
+}
+
+// Parse the argument part of a texture statement (the text that follows a
+// keyword such as `map_Kd`) into texture options and a texture file name.
+//
+// Recognized options: -blendu, -blendv, -clamp, -boost, -bm, -o, -s, -t,
+// -type, -texres, -imfchan, -mm and -colorspace. An option is only
+// recognized when it is followed by a space or tab. The first text that is
+// not a recognized option is taken as the file name, and the file name runs
+// to the end of the line so that names containing whitespace work.
+//
+// texname : receives the file name; written only when one was found.
+// texopt  : option values are written here as they are parsed.
+// linebuf : NUL-terminated text to parse.
+// Returns true when a file name was found, false otherwise.
+bool ParseTextureNameAndOption(std::string *texname, texture_option_t *texopt,
+                               const char *linebuf) {
+  // @todo { write more robust lexer and parser. }
+  bool found_texname = false;
+  std::string texture_name;
+
+  const char *token = linebuf;  // Assume line ends with NULL
+
+  while (!IS_NEW_LINE((*token))) {
+    token += strspn(token, " \t");  // skip space
+    if (IS_NEW_LINE((*token))) {
+      // Only blanks were left (e.g. trailing whitespace after the last
+      // option). Stop here so that an empty string is not reported as a
+      // texture file name.
+      break;
+    }
+
+    if ((0 == strncmp(token, "-blendu", 7)) && IS_SPACE((token[7]))) {
+      token += 8;
+      texopt->blendu = parseOnOff(&token, /* default */ true);
+    } else if ((0 == strncmp(token, "-blendv", 7)) && IS_SPACE((token[7]))) {
+      token += 8;
+      texopt->blendv = parseOnOff(&token, /* default */ true);
+    } else if ((0 == strncmp(token, "-clamp", 6)) && IS_SPACE((token[6]))) {
+      token += 7;
+      texopt->clamp = parseOnOff(&token, /* default */ true);
+    } else if ((0 == strncmp(token, "-boost", 6)) && IS_SPACE((token[6]))) {
+      token += 7;
+      texopt->sharpness = parseReal(&token, 1.0);
+    } else if ((0 == strncmp(token, "-bm", 3)) && IS_SPACE((token[3]))) {
+      token += 4;
+      texopt->bump_multiplier = parseReal(&token, 1.0);
+    } else if ((0 == strncmp(token, "-o", 2)) && IS_SPACE((token[2]))) {
+      token += 3;
+      parseReal3(&(texopt->origin_offset[0]), &(texopt->origin_offset[1]),
+                 &(texopt->origin_offset[2]), &token);
+    } else if ((0 == strncmp(token, "-s", 2)) && IS_SPACE((token[2]))) {
+      token += 3;
+      parseReal3(&(texopt->scale[0]), &(texopt->scale[1]), &(texopt->scale[2]),
+                 &token, 1.0, 1.0, 1.0);
+    } else if ((0 == strncmp(token, "-t", 2)) && IS_SPACE((token[2]))) {
+      token += 3;
+      parseReal3(&(texopt->turbulence[0]), &(texopt->turbulence[1]),
+                 &(texopt->turbulence[2]), &token);
+    } else if ((0 == strncmp(token, "-type", 5)) && IS_SPACE((token[5]))) {
+      // Only the keyword is skipped here; the separator is left for
+      // parseTextureType.
+      token += 5;
+      texopt->type = parseTextureType((&token), TEXTURE_TYPE_NONE);
+    } else if ((0 == strncmp(token, "-texres", 7)) && IS_SPACE((token[7]))) {
+      // Only the keyword is skipped here; the separator is left for parseInt.
+      token += 7;
+      // TODO(syoyo): Check if arg is int type.
+      texopt->texture_resolution = parseInt(&token);
+    } else if ((0 == strncmp(token, "-imfchan", 8)) && IS_SPACE((token[8]))) {
+      token += 9;
+      token += strspn(token, " \t");
+      const char *end = token + strcspn(token, " \t\r");
+      if ((end - token) == 1) {  // Assume one char for -imfchan
+        texopt->imfchan = (*token);
+      }
+      token = end;
+    } else if ((0 == strncmp(token, "-mm", 3)) && IS_SPACE((token[3]))) {
+      token += 4;
+      parseReal2(&(texopt->brightness), &(texopt->contrast), &token, 0.0, 1.0);
+    } else if ((0 == strncmp(token, "-colorspace", 11)) &&
+               IS_SPACE((token[11]))) {
+      token += 12;
+      texopt->colorspace = parseString(&token);
+    } else {
+      // Assume texture filename.
+      // Read filename until line end to parse filename containing whitespace
+      // TODO(syoyo): Support parsing texture option flag after the filename.
+      texture_name = std::string(token);
+      token += texture_name.length();
+
+      found_texname = true;
+    }
+  }
+
+  if (!found_texname) {
+    return false;
+  }
+
+  (*texname) = texture_name;
+  return true;
+}
+
+// Reset every field of `texopt` to its default value.
+//
+// texopt  : texture option record to reset.
+// is_bump : selects the default -imfchan channel, 'l' for a bump map and
+//           'm' for every other texture.
+//
+// The record may be one that was already filled in by a previous material
+// (LoadMtl re-initializes a single material_t for each `newmtl`), so fields
+// of class type such as `colorspace` are cleared explicitly as well.
+static void InitTexOpt(texture_option_t *texopt, const bool is_bump) {
+  texopt->imfchan = is_bump ? 'l' : 'm';
+  texopt->bump_multiplier = static_cast<real_t>(1.0);
+  texopt->clamp = false;
+  texopt->blendu = true;
+  texopt->blendv = true;
+  texopt->sharpness = static_cast<real_t>(1.0);
+  texopt->brightness = static_cast<real_t>(0.0);
+  texopt->contrast = static_cast<real_t>(1.0);
+  for (int i = 0; i < 3; i++) {
+    texopt->origin_offset[i] = static_cast<real_t>(0.0);
+    texopt->scale[i] = static_cast<real_t>(1.0);
+    texopt->turbulence[i] = static_cast<real_t>(0.0);
+  }
+  texopt->texture_resolution = -1;
+  texopt->type = TEXTURE_TYPE_NONE;
+  // Drop any -colorspace value left over from a previously parsed material.
+  texopt->colorspace = std::string();
+}
+
+// Reset `material` to its defaults: texture options via InitTexOpt, empty
+// name and texture names, black colors, illum 0, dissolve / shininess / ior
+// of 1, PBR scalars of 0 and an empty unknown_parameter container.
+static void InitMaterial(material_t *material) {
+  const real_t kZero = static_cast<real_t>(0.0);
+  const real_t kOne = static_cast<real_t>(1.0);
+
+  // Texture options. Only the bump map is initialized with is_bump == true.
+  texture_option_t *const non_bump_texopts[] = {
+      &material->ambient_texopt,
+      &material->diffuse_texopt,
+      &material->specular_texopt,
+      &material->specular_highlight_texopt,
+      &material->displacement_texopt,
+      &material->alpha_texopt,
+      &material->reflection_texopt,
+      &material->roughness_texopt,
+      &material->metallic_texopt,
+      &material->sheen_texopt,
+      &material->emissive_texopt,
+      &material->normal_texopt,  // @fixme { is_bump will be true? }
+  };
+  const size_t num_non_bump =
+      sizeof(non_bump_texopts) / sizeof(non_bump_texopts[0]);
+  for (size_t t = 0; t < num_non_bump; t++) {
+    InitTexOpt(non_bump_texopts[t], /* is_bump */ false);
+  }
+  InitTexOpt(&material->bump_texopt, /* is_bump */ true);
+
+  // Names.
+  material->name = "";
+  material->ambient_texname = "";
+  material->diffuse_texname = "";
+  material->specular_texname = "";
+  material->specular_highlight_texname = "";
+  material->bump_texname = "";
+  material->displacement_texname = "";
+  material->reflection_texname = "";
+  material->alpha_texname = "";
+  material->roughness_texname = "";
+  material->metallic_texname = "";
+  material->sheen_texname = "";
+  material->emissive_texname = "";
+  material->normal_texname = "";
+
+  // Colors (three components each).
+  for (int c = 0; c < 3; c++) {
+    material->ambient[c] = kZero;
+    material->diffuse[c] = kZero;
+    material->specular[c] = kZero;
+    material->transmittance[c] = kZero;
+    material->emission[c] = kZero;
+  }
+
+  // Scalar parameters.
+  material->illum = 0;
+  material->dissolve = kOne;
+  material->shininess = kOne;
+  material->ior = kOne;
+
+  // PBR scalar parameters.
+  material->roughness = kZero;
+  material->metallic = kZero;
+  material->sheen = kZero;
+  material->clearcoat_thickness = kZero;
+  material->clearcoat_roughness = kZero;
+  material->anisotropy_rotation = kZero;
+  material->anisotropy = kZero;
+
+  material->unknown_parameter.clear();
+}
+
+// Point-in-polygon test.
+// code from https://wrf.ecse.rpi.edu//Research/Short_Notes/pnpoly.html
+//
+// nvert        : number of polygon vertices.
+// vertx, verty : x and y coordinates of the vertices (nvert entries each).
+// testx, testy : the point to test.
+// Returns 1 when the number of polygon edges crossed to the right of the
+// test point is odd, and 0 otherwise.
+template <typename T>
+static int pnpoly(int nvert, T *vertx, T *verty, T testx, T testy) {
+  bool odd_crossings = false;
+  int prev = nvert - 1;  // each edge runs from vertex `prev` to vertex `cur`
+  for (int cur = 0; cur < nvert; prev = cur, ++cur) {
+    // Only edges whose end points lie on different sides of the horizontal
+    // line y == testy can be crossed. This also guarantees that the divisor
+    // below is non-zero.
+    const bool straddles = (verty[cur] > testy) != (verty[prev] > testy);
+    if (!straddles) {
+      continue;
+    }
+    if (testx < (vertx[prev] - vertx[cur]) * (testy - verty[cur]) /
+                        (verty[prev] - verty[cur]) +
+                    vertx[cur]) {
+      odd_crossings = !odd_crossings;
+    }
+  }
+  return odd_crossings ? 1 : 0;
+}
+
+// Convert a parsed (v, vt, vn) reference into the index_t stored in a shape.
+static index_t vertexIndexToIndexT(const vertex_index_t &vi) {
+  index_t idx;
+  idx.vertex_index = vi.v_idx;
+  idx.normal_index = vi.vn_idx;
+  idx.texcoord_index = vi.vt_idx;
+  return idx;
+}
+
+// Append the triangle (a, b, c) to shape->mesh together with its per-face
+// records: a vertex count of 3, `material_id` and the smoothing group id of
+// `face` (the polygon the triangle was cut from).
+static void appendTriangleToMesh(shape_t *shape, const face_t &face,
+                                 const vertex_index_t &a,
+                                 const vertex_index_t &b,
+                                 const vertex_index_t &c,
+                                 const int material_id) {
+  shape->mesh.indices.push_back(vertexIndexToIndexT(a));
+  shape->mesh.indices.push_back(vertexIndexToIndexT(b));
+  shape->mesh.indices.push_back(vertexIndexToIndexT(c));
+
+  shape->mesh.num_face_vertices.push_back(3);
+  shape->mesh.material_ids.push_back(material_id);
+  shape->mesh.smoothing_group_ids.push_back(face.smoothing_group_id);
+}
+
+// Flatten the faces, lines and points collected in `prim_group` into `shape`.
+//
+// shape       : output; its name is set and its mesh / lines / points
+//               containers are appended to.
+// prim_group  : parsed primitives to export.
+// tags        : copied to shape->mesh.tags when the group contains faces.
+// material_id : material id recorded for every exported face.
+// name        : name given to the shape.
+// triangulate : when true every face is emitted as triangles (a quad is
+//               split along its shorter diagonal, larger polygons are ear
+//               clipped); when false faces are emitted unchanged.
+// v           : vertex positions, three consecutive real_t per vertex.
+// warn        : optional; warning messages are appended when non-NULL.
+//
+// Returns false when `prim_group` is empty (nothing is written), true
+// otherwise.
+//
+// TODO(syoyo): refactor function.
+static bool exportGroupsToShape(shape_t *shape, const PrimGroup &prim_group,
+                                const std::vector<tag_t> &tags,
+                                const int material_id, const std::string &name,
+                                bool triangulate, const std::vector<real_t> &v,
+                                std::string *warn) {
+  if (prim_group.IsEmpty()) {
+    return false;
+  }
+
+  shape->name = name;
+
+  // polygon
+  if (!prim_group.faceGroup.empty()) {
+    // Flatten vertices and indices
+    for (size_t i = 0; i < prim_group.faceGroup.size(); i++) {
+      const face_t &face = prim_group.faceGroup[i];
+
+      size_t npolys = face.vertex_indices.size();
+
+      if (npolys < 3) {
+        // Face must have 3+ vertices.
+        if (warn) {
+          (*warn) += "Degenerated face found.\n";
+        }
+        continue;
+      }
+
+      if (!triangulate) {
+        // Emit the face unchanged.
+        for (size_t k = 0; k < npolys; k++) {
+          shape->mesh.indices.push_back(
+              vertexIndexToIndexT(face.vertex_indices[k]));
+        }
+
+        // The per-face vertex count is stored through an unsigned char cast;
+        // report the faces whose count does not survive it.
+        const unsigned char stored_count = static_cast<unsigned char>(npolys);
+        if (warn && (static_cast<size_t>(stored_count) != npolys)) {
+          (*warn) +=
+              "Face has too many vertices; its stored vertex count is "
+              "truncated.\n";
+        }
+
+        shape->mesh.num_face_vertices.push_back(stored_count);
+        shape->mesh.material_ids.push_back(material_id);  // per face
+        shape->mesh.smoothing_group_ids.push_back(
+            face.smoothing_group_id);  // per face
+        continue;
+      }
+
+      if (npolys == 4) {
+        const vertex_index_t &i0 = face.vertex_indices[0];
+        const vertex_index_t &i1 = face.vertex_indices[1];
+        const vertex_index_t &i2 = face.vertex_indices[2];
+        const vertex_index_t &i3 = face.vertex_indices[3];
+
+        const size_t vi0 = size_t(i0.v_idx);
+        const size_t vi1 = size_t(i1.v_idx);
+        const size_t vi2 = size_t(i2.v_idx);
+        const size_t vi3 = size_t(i3.v_idx);
+
+        if (((3 * vi0 + 2) >= v.size()) || ((3 * vi1 + 2) >= v.size()) ||
+            ((3 * vi2 + 2) >= v.size()) || ((3 * vi3 + 2) >= v.size())) {
+          // Invalid triangle.
+          // FIXME(syoyo): Is it ok to simply skip this invalid triangle?
+          if (warn) {
+            (*warn) += "Face with invalid vertex index found.\n";
+          }
+          continue;
+        }
+
+        // There are two candidates to split the quad into two triangles.
+        //
+        // Choose the shortest edge.
+        // TODO: Is it better to determine the edge to split by calculating
+        // the area of each triangle?
+        //
+        // +---+
+        // |\  |
+        // | \ |
+        // |  \|
+        // +---+
+        //
+        // +---+
+        // |  /|
+        // | / |
+        // |/  |
+        // +---+
+
+        // Diagonal 0-2 and diagonal 1-3.
+        const real_t e02x = v[vi2 * 3 + 0] - v[vi0 * 3 + 0];
+        const real_t e02y = v[vi2 * 3 + 1] - v[vi0 * 3 + 1];
+        const real_t e02z = v[vi2 * 3 + 2] - v[vi0 * 3 + 2];
+        const real_t e13x = v[vi3 * 3 + 0] - v[vi1 * 3 + 0];
+        const real_t e13y = v[vi3 * 3 + 1] - v[vi1 * 3 + 1];
+        const real_t e13z = v[vi3 * 3 + 2] - v[vi1 * 3 + 2];
+
+        const real_t sqr02 = e02x * e02x + e02y * e02y + e02z * e02z;
+        const real_t sqr13 = e13x * e13x + e13y * e13y + e13z * e13z;
+
+        // Two triangle faces
+        if (sqr02 < sqr13) {
+          // [0, 1, 2], [0, 2, 3]
+          appendTriangleToMesh(shape, face, i0, i1, i2, material_id);
+          appendTriangleToMesh(shape, face, i0, i2, i3, material_id);
+        } else {
+          // [0, 1, 3], [1, 2, 3]
+          appendTriangleToMesh(shape, face, i0, i1, i3, material_id);
+          appendTriangleToMesh(shape, face, i1, i2, i3, material_id);
+        }
+        continue;
+      }
+
+      // General polygon (this path also handles a plain triangle).
+
+      // find the two axes to work in
+      // The polygon is triangulated in 2D using coordinates axes[0] and
+      // axes[1] of each vertex. The pair is picked from the first corner
+      // whose cross product is not (nearly) zero, by looking at which
+      // component of that cross product is the largest.
+      size_t axes[2] = {1, 2};
+      for (size_t k = 0; k < npolys; ++k) {
+        const size_t vi0 = size_t(face.vertex_indices[(k + 0) % npolys].v_idx);
+        const size_t vi1 = size_t(face.vertex_indices[(k + 1) % npolys].v_idx);
+        const size_t vi2 = size_t(face.vertex_indices[(k + 2) % npolys].v_idx);
+
+        if (((3 * vi0 + 2) >= v.size()) || ((3 * vi1 + 2) >= v.size()) ||
+            ((3 * vi2 + 2) >= v.size())) {
+          // Invalid triangle.
+          // FIXME(syoyo): Is it ok to simply skip this invalid triangle?
+          continue;
+        }
+
+        const real_t e0x = v[vi1 * 3 + 0] - v[vi0 * 3 + 0];
+        const real_t e0y = v[vi1 * 3 + 1] - v[vi0 * 3 + 1];
+        const real_t e0z = v[vi1 * 3 + 2] - v[vi0 * 3 + 2];
+        const real_t e1x = v[vi2 * 3 + 0] - v[vi1 * 3 + 0];
+        const real_t e1y = v[vi2 * 3 + 1] - v[vi1 * 3 + 1];
+        const real_t e1z = v[vi2 * 3 + 2] - v[vi1 * 3 + 2];
+        const real_t cx = std::fabs(e0y * e1z - e0z * e1y);
+        const real_t cy = std::fabs(e0z * e1x - e0x * e1z);
+        const real_t cz = std::fabs(e0x * e1y - e0y * e1x);
+        const real_t epsilon = std::numeric_limits<real_t>::epsilon();
+        if (cx > epsilon || cy > epsilon || cz > epsilon) {
+          // found a corner
+          if (!(cx > cy && cx > cz)) {
+            // x is not the dominant component: use x as the first axis.
+            axes[0] = 0;
+            if (cz > cx && cz > cy) {
+              // z is dominant: work in (x, y); otherwise stay in (x, z).
+              axes[1] = 1;
+            }
+          }
+          break;
+        }
+      }
+
+#ifdef TINYOBJLOADER_USE_MAPBOX_EARCUT
+      using Point = std::array<real_t, 2>;
+
+      // first polyline define the main polygon.
+      // following polylines define holes(not used in tinyobj).
+      std::vector<std::vector<Point> > polygon;
+
+      std::vector<Point> polyline;
+
+      // Fill polygon data(facevarying vertices).
+      for (size_t k = 0; k < npolys; k++) {
+        const size_t vi0 = size_t(face.vertex_indices[k].v_idx);
+
+        assert(((3 * vi0 + 2) < v.size()));
+
+        real_t v0x = v[vi0 * 3 + axes[0]];
+        real_t v0y = v[vi0 * 3 + axes[1]];
+
+        polyline.push_back({v0x, v0y});
+      }
+
+      polygon.push_back(polyline);
+      std::vector<uint32_t> indices = mapbox::earcut<uint32_t>(polygon);
+      // => result = 3 * faces, clockwise
+
+      assert(indices.size() % 3 == 0);
+
+      // Reconstruct vertex_index_t
+      for (size_t k = 0; k < indices.size() / 3; k++) {
+        appendTriangleToMesh(shape, face,
+                             face.vertex_indices[indices[3 * k + 0]],
+                             face.vertex_indices[indices[3 * k + 1]],
+                             face.vertex_indices[indices[3 * k + 2]],
+                             material_id);
+      }
+
+#else  // Built-in ear clipping triangulation
+
+      face_t remainingFace = face;  // copy
+      size_t guess_vert = 0;
+      vertex_index_t ind[3];
+      real_t vx[3];
+      real_t vy[3];
+
+      // How many iterations can we do without decreasing the remaining
+      // vertices.
+      size_t remainingIterations = face.vertex_indices.size();
+      size_t previousRemainingVertices = remainingFace.vertex_indices.size();
+
+      while (remainingFace.vertex_indices.size() > 3 &&
+             remainingIterations > 0) {
+        npolys = remainingFace.vertex_indices.size();
+        if (guess_vert >= npolys) {
+          guess_vert -= npolys;
+        }
+
+        if (previousRemainingVertices != npolys) {
+          // The number of remaining vertices decreased. Reset counters.
+          previousRemainingVertices = npolys;
+          remainingIterations = npolys;
+        } else {
+          // We didn't consume a vertex on previous iteration, reduce the
+          // available iterations.
+          remainingIterations--;
+        }
+
+        // Candidate ear: three consecutive vertices starting at guess_vert.
+        for (size_t k = 0; k < 3; k++) {
+          ind[k] = remainingFace.vertex_indices[(guess_vert + k) % npolys];
+          size_t vi = size_t(ind[k].v_idx);
+          if (((vi * 3 + axes[0]) >= v.size()) ||
+              ((vi * 3 + axes[1]) >= v.size())) {
+            // Vertex index is out of range: use (0, 0) as its position.
+            vx[k] = static_cast<real_t>(0.0);
+            vy[k] = static_cast<real_t>(0.0);
+          } else {
+            vx[k] = v[vi * 3 + axes[0]];
+            vy[k] = v[vi * 3 + axes[1]];
+          }
+        }
+
+        real_t e0x = vx[1] - vx[0];
+        real_t e0y = vy[1] - vy[0];
+        real_t e1x = vx[2] - vx[1];
+        real_t e1y = vy[2] - vy[1];
+        real_t cross = e0x * e1y - e0y * e1x;
+
+        // NOTE(review): `area` is computed from the first two vertices of
+        // the candidate only, not from the whole polygon; confirm that this
+        // is the intended orientation reference.
+        real_t area =
+            (vx[0] * vy[1] - vy[0] * vx[1]) * static_cast<real_t>(0.5);
+        // if an internal angle
+        if (cross * area < static_cast<real_t>(0.0)) {
+          guess_vert += 1;
+          continue;
+        }
+
+        // check all other verts in case they are inside this triangle
+        bool overlap = false;
+        for (size_t otherVert = 3; otherVert < npolys; ++otherVert) {
+          // idx < npolys == remainingFace.vertex_indices.size() here.
+          size_t idx = (guess_vert + otherVert) % npolys;
+
+          size_t ovi = size_t(remainingFace.vertex_indices[idx].v_idx);
+
+          if (((ovi * 3 + axes[0]) >= v.size()) ||
+              ((ovi * 3 + axes[1]) >= v.size())) {
+            // Vertex index is out of range: it cannot be tested, skip it.
+            continue;
+          }
+          real_t tx = v[ovi * 3 + axes[0]];
+          real_t ty = v[ovi * 3 + axes[1]];
+          if (pnpoly(3, vx, vy, tx, ty)) {
+            overlap = true;
+            break;
+          }
+        }
+
+        if (overlap) {
+          guess_vert += 1;
+          continue;
+        }
+
+        // this triangle is an ear
+        appendTriangleToMesh(shape, face, ind[0], ind[1], ind[2], material_id);
+
+        // remove v1 from the list
+        size_t removed_vert_index = (guess_vert + 1) % npolys;
+        while (removed_vert_index + 1 < npolys) {
+          remainingFace.vertex_indices[removed_vert_index] =
+              remainingFace.vertex_indices[removed_vert_index + 1];
+          removed_vert_index += 1;
+        }
+        remainingFace.vertex_indices.pop_back();
+      }
+
+      // Emit the last triangle. Nothing is emitted here when the loop above
+      // gave up with more than three vertices left.
+      if (remainingFace.vertex_indices.size() == 3) {
+        appendTriangleToMesh(shape, face, remainingFace.vertex_indices[0],
+                             remainingFace.vertex_indices[1],
+                             remainingFace.vertex_indices[2], material_id);
+      }
+#endif
+    }
+
+    shape->mesh.tags = tags;
+  }
+
+  // line
+  if (!prim_group.lineGroup.empty()) {
+    // Flatten indices
+    for (size_t i = 0; i < prim_group.lineGroup.size(); i++) {
+      for (size_t j = 0; j < prim_group.lineGroup[i].vertex_indices.size();
+           j++) {
+        shape->lines.indices.push_back(
+            vertexIndexToIndexT(prim_group.lineGroup[i].vertex_indices[j]));
+      }
+
+      shape->lines.num_line_vertices.push_back(
+          int(prim_group.lineGroup[i].vertex_indices.size()));
+    }
+  }
+
+  // points
+  if (!prim_group.pointsGroup.empty()) {
+    // Flatten & convert indices
+    for (size_t i = 0; i < prim_group.pointsGroup.size(); i++) {
+      for (size_t j = 0; j < prim_group.pointsGroup[i].vertex_indices.size();
+           j++) {
+        shape->points.indices.push_back(
+            vertexIndexToIndexT(prim_group.pointsGroup[i].vertex_indices[j]));
+      }
+    }
+  }
+
+  return true;
+}
+
+// Split `s` at every unescaped `delim` and append the pieces to `elems`.
+// https://rosettacode.org/wiki/Tokenize_a_string_with_escaping#C.2B.2B
+//
+// A character that follows `escape` is copied literally (the escape
+// character itself is dropped). Empty pieces between delimiters are skipped,
+// but the piece after the last delimiter is always appended, even when it is
+// empty. `elems` is not cleared first.
+static void SplitString(const std::string &s, char delim, char escape,
+                        std::vector<std::string> &elems) {
+  std::string piece;
+  bool take_literally = false;  // true right after an escape character
+
+  for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) {
+    const char c = *it;
+
+    if (!take_literally) {
+      if (c == escape) {
+        take_literally = true;
+        continue;
+      }
+      if (c == delim) {
+        if (!piece.empty()) {
+          elems.push_back(piece);
+        }
+        piece.clear();
+        continue;
+      }
+    }
+
+    take_literally = false;
+    piece.push_back(c);
+  }
+
+  elems.push_back(piece);
+}
+
+// Join a directory and a file name with a single '/'.
+// An empty `dir` yields `filename` unchanged; a '/' is inserted only when
+// `dir` does not already end with one.
+static std::string JoinPath(const std::string &dir,
+                            const std::string &filename) {
+  if (dir.empty()) {
+    return filename;
+  }
+
+  std::string joined(dir);
+  const bool has_trailing_slash = (joined[joined.size() - 1] == '/');
+  if (!has_trailing_slash) {
+    joined += std::string("/");
+  }
+  joined += filename;
+  return joined;
+}
+
+void LoadMtl(std::map<std::string, int> *material_map,
+             std::vector<material_t> *materials, std::istream *inStream,
+             std::string *warning, std::string *err) {
+  (void)err;
+
+  // Create a default material anyway.
+  material_t material;
+  InitMaterial(&material);
+
+  // Issue 43. `d` wins against `Tr` since `Tr` is not in the MTL specification.
+  bool has_d = false;
+  bool has_tr = false;
+
+  // has_kd is used to set a default diffuse value when map_Kd is present
+  // and Kd is not.
+  bool has_kd = false;
+
+  std::stringstream warn_ss;
+
+  size_t line_no = 0;
+  std::string linebuf;
+  while (inStream->peek() != -1) {
+    safeGetline(*inStream, linebuf);
+    line_no++;
+
+    // Trim trailing whitespace.
+    if (linebuf.size() > 0) {
+      linebuf = linebuf.substr(0, linebuf.find_last_not_of(" \t") + 1);
+    }
+
+    // Trim newline '\r\n' or '\n'
+    if (linebuf.size() > 0) {
+      if (linebuf[linebuf.size() - 1] == '\n')
+        linebuf.erase(linebuf.size() - 1);
+    }
+    if (linebuf.size() > 0) {
+      if (linebuf[linebuf.size() - 1] == '\r')
+        linebuf.erase(linebuf.size() - 1);
+    }
+
+    // Skip if empty line.
+    if (linebuf.empty()) {
+      continue;
+    }
+
+    // Skip leading space.
+    const char *token = linebuf.c_str();
+    token += strspn(token, " \t");
+
+    assert(token);
+    if (token[0] == '\0') continue;  // empty line
+
+    if (token[0] == '#') continue;  // comment line
+
+    // new mtl
+    if ((0 == strncmp(token, "newmtl", 6)) && IS_SPACE((token[6]))) {
+      // flush previous material.
+      if (!material.name.empty()) {
+        material_map->insert(std::pair<std::string, int>(
+            material.name, static_cast<int>(materials->size())));
+        materials->push_back(material);
+      }
+
+      // initial temporary material
+      InitMaterial(&material);
+
+      has_d = false;
+      has_tr = false;
+
+      // set new mtl name
+      token += 7;
+      {
+        std::stringstream sstr;
+        sstr << token;
+        material.name = sstr.str();
+      }
+      continue;
+    }
+
+    // ambient
+    if (token[0] == 'K' && token[1] == 'a' && IS_SPACE((token[2]))) {
+      token += 2;
+      real_t r, g, b;
+      parseReal3(&r, &g, &b, &token);
+      material.ambient[0] = r;
+      material.ambient[1] = g;
+      material.ambient[2] = b;
+      continue;
+    }
+
+    // diffuse
+    if (token[0] == 'K' && token[1] == 'd' && IS_SPACE((token[2]))) {
+      token += 2;
+      real_t r, g, b;
+      parseReal3(&r, &g, &b, &token);
+      material.diffuse[0] = r;
+      material.diffuse[1] = g;
+      material.diffuse[2] = b;
+      has_kd = true;
+      continue;
+    }
+
+    // specular
+    if (token[0] == 'K' && token[1] == 's' && IS_SPACE((token[2]))) {
+      token += 2;
+      real_t r, g, b;
+      parseReal3(&r, &g, &b, &token);
+      material.specular[0] = r;
+      material.specular[1] = g;
+      material.specular[2] = b;
+      continue;
+    }
+
+    // transmittance
+    if ((token[0] == 'K' && token[1] == 't' && IS_SPACE((token[2]))) ||
+        (token[0] == 'T' && token[1] == 'f' && IS_SPACE((token[2])))) {
+      token += 2;
+      real_t r, g, b;
+      parseReal3(&r, &g, &b, &token);
+      material.transmittance[0] = r;
+      material.transmittance[1] = g;
+      material.transmittance[2] = b;
+      continue;
+    }
+
+    // ior(index of refraction)
+    if (token[0] == 'N' && token[1] == 'i' && IS_SPACE((token[2]))) {
+      token += 2;
+      material.ior = parseReal(&token);
+      continue;
+    }
+
+    // emission
+    if (token[0] == 'K' && token[1] == 'e' && IS_SPACE(token[2])) {
+      token += 2;
+      real_t r, g, b;
+      parseReal3(&r, &g, &b, &token);
+      material.emission[0] = r;
+      material.emission[1] = g;
+      material.emission[2] = b;
+      continue;
+    }
+
+    // shininess
+    if (token[0] == 'N' && token[1] == 's' && IS_SPACE(token[2])) {
+      token += 2;
+      material.shininess = parseReal(&token);
+      continue;
+    }
+
+    // illum model
+    if (0 == strncmp(token, "illum", 5) && IS_SPACE(token[5])) {
+      token += 6;
+      material.illum = parseInt(&token);
+      continue;
+    }
+
+    // dissolve
+    if ((token[0] == 'd' && IS_SPACE(token[1]))) {
+      token += 1;
+      material.dissolve = parseReal(&token);
+
+      if (has_tr) {
+        warn_ss << "Both `d` and `Tr` parameters defined for \""
+                << material.name
+                << "\". Use the value of `d` for dissolve (line " << line_no
+                << " in .mtl.)\n";
+      }
+      has_d = true;
+      continue;
+    }
+    if (token[0] == 'T' && token[1] == 'r' && IS_SPACE(token[2])) {
+      token += 2;
+      if (has_d) {
+        // `d` wins. Ignore `Tr` value.
+        warn_ss << "Both `d` and `Tr` parameters defined for \""
+                << material.name
+                << "\". Use the value of `d` for dissolve (line " << line_no
+                << " in .mtl.)\n";
+      } else {
+        // We invert value of Tr(assume Tr is in range [0, 1])
+        // NOTE: Interpretation of Tr is application(exporter) dependent. For
+        // some application(e.g. 3ds max obj exporter), Tr = d(Issue 43)
+        material.dissolve = static_cast<real_t>(1.0) - parseReal(&token);
+      }
+      has_tr = true;
+      continue;
+    }
+
+    // PBR: roughness
+    if (token[0] == 'P' && token[1] == 'r' && IS_SPACE(token[2])) {
+      token += 2;
+      material.roughness = parseReal(&token);
+      continue;
+    }
+
+    // PBR: metallic
+    if (token[0] == 'P' && token[1] == 'm' && IS_SPACE(token[2])) {
+      token += 2;
+      material.metallic = parseReal(&token);
+      continue;
+    }
+
+    // PBR: sheen
+    if (token[0] == 'P' && token[1] == 's' && IS_SPACE(token[2])) {
+      token += 2;
+      material.sheen = parseReal(&token);
+      continue;
+    }
+
+    // PBR: clearcoat thickness
+    if (token[0] == 'P' && token[1] == 'c' && IS_SPACE(token[2])) {
+      token += 2;
+      material.clearcoat_thickness = parseReal(&token);
+      continue;
+    }
+
+    // PBR: clearcoat roughness
+    if ((0 == strncmp(token, "Pcr", 3)) && IS_SPACE(token[3])) {
+      token += 4;
+      material.clearcoat_roughness = parseReal(&token);
+      continue;
+    }
+
+    // PBR: anisotropy
+    if ((0 == strncmp(token, "aniso", 5)) && IS_SPACE(token[5])) {
+      token += 6;
+      material.anisotropy = parseReal(&token);
+      continue;
+    }
+
+    // PBR: anisotropy rotation
+    if ((0 == strncmp(token, "anisor", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      material.anisotropy_rotation = parseReal(&token);
+      continue;
+    }
+
+    // ambient texture
+    if ((0 == strncmp(token, "map_Ka", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      ParseTextureNameAndOption(&(material.ambient_texname),
+                                &(material.ambient_texopt), token);
+      continue;
+    }
+
+    // diffuse texture
+    if ((0 == strncmp(token, "map_Kd", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      ParseTextureNameAndOption(&(material.diffuse_texname),
+                                &(material.diffuse_texopt), token);
+
+      // Set a decent diffuse default value if a diffuse texture is specified
+      // without a matching Kd value.
+      if (!has_kd) {
+        material.diffuse[0] = static_cast<real_t>(0.6);
+        material.diffuse[1] = static_cast<real_t>(0.6);
+        material.diffuse[2] = static_cast<real_t>(0.6);
+      }
+
+      continue;
+    }
+
+    // specular texture
+    if ((0 == strncmp(token, "map_Ks", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      ParseTextureNameAndOption(&(material.specular_texname),
+                                &(material.specular_texopt), token);
+      continue;
+    }
+
+    // specular highlight texture
+    if ((0 == strncmp(token, "map_Ns", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      ParseTextureNameAndOption(&(material.specular_highlight_texname),
+                                &(material.specular_highlight_texopt), token);
+      continue;
+    }
+
+    // bump texture
+    if ((0 == strncmp(token, "map_bump", 8)) && IS_SPACE(token[8])) {
+      token += 9;
+      ParseTextureNameAndOption(&(material.bump_texname),
+                                &(material.bump_texopt), token);
+      continue;
+    }
+
+    // bump texture
+    if ((0 == strncmp(token, "map_Bump", 8)) && IS_SPACE(token[8])) {
+      token += 9;
+      ParseTextureNameAndOption(&(material.bump_texname),
+                                &(material.bump_texopt), token);
+      continue;
+    }
+
+    // bump texture
+    if ((0 == strncmp(token, "bump", 4)) && IS_SPACE(token[4])) {
+      token += 5;
+      ParseTextureNameAndOption(&(material.bump_texname),
+                                &(material.bump_texopt), token);
+      continue;
+    }
+
+    // alpha texture
+    if ((0 == strncmp(token, "map_d", 5)) && IS_SPACE(token[5])) {
+      token += 6;
+      material.alpha_texname = token;
+      ParseTextureNameAndOption(&(material.alpha_texname),
+                                &(material.alpha_texopt), token);
+      continue;
+    }
+
+    // displacement texture
+    if ((0 == strncmp(token, "disp", 4)) && IS_SPACE(token[4])) {
+      token += 5;
+      ParseTextureNameAndOption(&(material.displacement_texname),
+                                &(material.displacement_texopt), token);
+      continue;
+    }
+
+    // reflection map
+    if ((0 == strncmp(token, "refl", 4)) && IS_SPACE(token[4])) {
+      token += 5;
+      ParseTextureNameAndOption(&(material.reflection_texname),
+                                &(material.reflection_texopt), token);
+      continue;
+    }
+
+    // PBR: roughness texture
+    if ((0 == strncmp(token, "map_Pr", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      ParseTextureNameAndOption(&(material.roughness_texname),
+                                &(material.roughness_texopt), token);
+      continue;
+    }
+
+    // PBR: metallic texture
+    if ((0 == strncmp(token, "map_Pm", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      ParseTextureNameAndOption(&(material.metallic_texname),
+                                &(material.metallic_texopt), token);
+      continue;
+    }
+
+    // PBR: sheen texture
+    if ((0 == strncmp(token, "map_Ps", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      ParseTextureNameAndOption(&(material.sheen_texname),
+                                &(material.sheen_texopt), token);
+      continue;
+    }
+
+    // PBR: emissive texture
+    if ((0 == strncmp(token, "map_Ke", 6)) && IS_SPACE(token[6])) {
+      token += 7;
+      ParseTextureNameAndOption(&(material.emissive_texname),
+                                &(material.emissive_texopt), token);
+      continue;
+    }
+
+    // PBR: normal map texture
+    if ((0 == strncmp(token, "norm", 4)) && IS_SPACE(token[4])) {
+      token += 5;
+      ParseTextureNameAndOption(&(material.normal_texname),
+                                &(material.normal_texopt), token);
+      continue;
+    }
+
+    // unknown parameter
+    const char *_space = strchr(token, ' ');
+    if (!_space) {
+      _space = strchr(token, '\t');
+    }
+    if (_space) {
+      std::ptrdiff_t len = _space - token;
+      std::string key(token, static_cast<size_t>(len));
+      std::string value = _space + 1;
+      material.unknown_parameter.insert(
+          std::pair<std::string, std::string>(key, value));
+    }
+  }
+  // flush last material.
+  material_map->insert(std::pair<std::string, int>(
+      material.name, static_cast<int>(materials->size())));
+  materials->push_back(material);
+
+  if (warning) {
+    (*warning) = warn_ss.str();
+  }
+}
+
+/// Resolves `matId` to a readable file and feeds it to LoadMtl().
+///
+/// A non-empty `m_mtlBaseDir` is split on ';' (when _WIN32 is defined) or ':'
+/// (otherwise) and every piece is combined with `matId` through JoinPath();
+/// the first candidate that can be opened is loaded. With an empty
+/// `m_mtlBaseDir`, `matId` itself is the only candidate.
+///
+/// @return true once a candidate was opened and handed to LoadMtl(); false
+///         when none could be opened, in which case a message is appended to
+///         `*warn` (if `warn` is non-NULL).
+bool MaterialFileReader::operator()(const std::string &matId,
+                                    std::vector<material_t> *materials,
+                                    std::map<std::string, int> *matMap,
+                                    std::string *warn, std::string *err) {
+  const bool use_base_dir = !m_mtlBaseDir.empty();
+
+  // Directories to probe. A single placeholder entry stands in when no base
+  // directory is configured so that the loop below runs exactly once.
+  std::vector<std::string> search_dirs;
+  if (use_base_dir) {
+#ifdef _WIN32
+    const char list_sep = ';';
+#else
+    const char list_sep = ':';
+#endif
+    std::istringstream dir_list(m_mtlBaseDir);
+    std::string dir;
+    while (getline(dir_list, dir, list_sep)) {
+      search_dirs.push_back(dir);
+    }
+  } else {
+    search_dirs.push_back(std::string());
+  }
+
+  for (size_t k = 0; k < search_dirs.size(); ++k) {
+    const std::string candidate =
+        use_base_dir ? JoinPath(search_dirs[k], matId) : matId;
+
+    std::ifstream mtl_file(candidate.c_str());
+    if (!mtl_file) {
+      continue;  // not readable here, try the next directory
+    }
+
+    LoadMtl(matMap, materials, &mtl_file, warn, err);
+    return true;
+  }
+
+  // Nothing could be opened: report it as a warning, not as an error.
+  if (warn) {
+    std::stringstream msg;
+    msg << "Material file [ " << matId
+        << " ] not found in a path : " << m_mtlBaseDir << "\n";
+    (*warn) += msg.str();
+  }
+  return false;
+}
+
+/// Loads materials from the stream this reader was constructed with.
+///
+/// `matId` is ignored: the stream is the only source. Returns false and
+/// appends a message to `*warn` (if non-NULL) when the stream is in an error
+/// state; otherwise passes the stream to LoadMtl() and returns true.
+bool MaterialStreamReader::operator()(const std::string &matId,
+                                      std::vector<material_t> *materials,
+                                      std::map<std::string, int> *matMap,
+                                      std::string *warn, std::string *err) {
+  (void)matId;
+  (void)err;
+
+  if (m_inStream) {
+    LoadMtl(matMap, materials, &m_inStream, warn, err);
+    return true;
+  }
+
+  if (warn) {
+    std::stringstream msg;
+    msg << "Material stream in error state. \n";
+    (*warn) += msg.str();
+  }
+  return false;
+}
+
+/// Opens `filename` and parses it through the stream overload of LoadObj().
+///
+/// The vertex, normal, texcoord and color arrays of `attrib` and the `shapes`
+/// vector are emptied first. When the file cannot be opened, `*err` (if
+/// non-NULL) is set to a message and false is returned. `mtl_basedir` may be
+/// NULL; a non-empty value gets a trailing directory separator appended if it
+/// lacks one and is then used to construct the MaterialFileReader that
+/// resolves `mtllib` references.
+bool LoadObj(attrib_t *attrib, std::vector<shape_t> *shapes,
+             std::vector<material_t> *materials, std::string *warn,
+             std::string *err, const char *filename, const char *mtl_basedir,
+             bool triangulate, bool default_vcols_fallback) {
+  // Start from empty outputs.
+  shapes->clear();
+  attrib->colors.clear();
+  attrib->texcoords.clear();
+  attrib->normals.clear();
+  attrib->vertices.clear();
+
+  std::ifstream obj_file(filename);
+  if (!obj_file) {
+    std::stringstream msg;
+    msg << "Cannot open file [" << filename << "]\n";
+    if (err) {
+      (*err) = msg.str();
+    }
+    return false;
+  }
+
+#ifdef _WIN32
+  const char dirsep = '\\';
+#else
+  const char dirsep = '/';
+#endif
+
+  std::string baseDir;
+  if (mtl_basedir) {
+    baseDir = mtl_basedir;
+  }
+  // Make sure a non-empty base directory ends with the platform separator.
+  if (!baseDir.empty() && *baseDir.rbegin() != dirsep) {
+    baseDir += dirsep;
+  }
+
+  MaterialFileReader matFileReader(baseDir);
+  return LoadObj(attrib, shapes, materials, warn, err, &obj_file,
+                 &matFileReader, triangulate, default_vcols_fallback);
+}
+
+/// Parses Wavefront .obj text from `inStream`.
+///
+/// The stream is read line by line. Recognised statements are `v`, `vn`,
+/// `vt`, `vw` (skin weight extension), `l`, `p`, `f`, `usemtl`, `mtllib`,
+/// `g`, `o`, `t` and `s`; anything else is ignored.
+///
+/// @param attrib     Receives the parsed vertex/normal/texcoord/color/skin
+///                   weight arrays (swapped in at the end). `vertex_weights`
+///                   and `texcoord_ws` are emptied because this parser does
+///                   not collect them.
+/// @param shapes     Finished shapes are appended; the vector is not cleared
+///                   here.
+/// @param materials  Passed to `readMatFn` when a `mtllib` line is seen.
+/// @param warn       Optional; warnings are appended.
+/// @param err        Optional; error messages are appended.
+/// @param readMatFn  Optional material reader; `mtllib` lines are skipped when
+///                   it is NULL.
+/// @param triangulate            Forwarded to exportGroupsToShape().
+/// @param default_vcols_fallback Keep per-vertex colors even when some
+///                   vertices did not specify one.
+/// @return false when a `vw`, `l`, `p` or `f` line fails to parse, true
+///         otherwise.
+bool LoadObj(attrib_t *attrib, std::vector<shape_t> *shapes,
+             std::vector<material_t> *materials, std::string *warn,
+             std::string *err, std::istream *inStream,
+             MaterialReader *readMatFn /*= NULL*/, bool triangulate,
+             bool default_vcols_fallback) {
+  std::vector<real_t> v;
+  std::vector<real_t> vn;
+  std::vector<real_t> vt;
+  std::vector<real_t> vc;
+  std::vector<skin_weight_t> vw;
+  std::vector<tag_t> tags;
+  PrimGroup prim_group;
+  std::string name;
+
+  // material
+  std::set<std::string> material_filenames;
+  std::map<std::string, int> material_map;
+  int material = -1;
+
+  // smoothing group id
+  unsigned int current_smoothing_id =
+      0;  // Initial value. 0 means no smoothing.
+
+  // Largest indices referenced by any face; checked against the array sizes
+  // after parsing to warn about out-of-bounds references.
+  int greatest_v_idx = -1;
+  int greatest_vn_idx = -1;
+  int greatest_vt_idx = -1;
+
+  shape_t shape;
+
+  bool found_all_colors = true;
+
+  size_t line_num = 0;
+  std::string linebuf;
+  while (inStream->peek() != -1) {
+    safeGetline(*inStream, linebuf);
+
+    line_num++;
+
+    // Trim newline '\r\n' or '\n'
+    if (linebuf.size() > 0) {
+      if (linebuf[linebuf.size() - 1] == '\n')
+        linebuf.erase(linebuf.size() - 1);
+    }
+    if (linebuf.size() > 0) {
+      if (linebuf[linebuf.size() - 1] == '\r')
+        linebuf.erase(linebuf.size() - 1);
+    }
+
+    // Skip if empty line.
+    if (linebuf.empty()) {
+      continue;
+    }
+
+    // Skip leading space.
+    const char *token = linebuf.c_str();
+    token += strspn(token, " \t");
+
+    assert(token);
+    if (token[0] == '\0') continue;  // empty line
+
+    if (token[0] == '#') continue;  // comment line
+
+    // vertex
+    if (token[0] == 'v' && IS_SPACE((token[1]))) {
+      token += 2;
+      real_t x, y, z;
+      real_t r, g, b;
+
+      found_all_colors &= parseVertexWithColor(&x, &y, &z, &r, &g, &b, &token);
+
+      v.push_back(x);
+      v.push_back(y);
+      v.push_back(z);
+
+      if (found_all_colors || default_vcols_fallback) {
+        vc.push_back(r);
+        vc.push_back(g);
+        vc.push_back(b);
+      }
+
+      continue;
+    }
+
+    // normal
+    if (token[0] == 'v' && token[1] == 'n' && IS_SPACE((token[2]))) {
+      token += 3;
+      real_t x, y, z;
+      parseReal3(&x, &y, &z, &token);
+      vn.push_back(x);
+      vn.push_back(y);
+      vn.push_back(z);
+      continue;
+    }
+
+    // texcoord
+    if (token[0] == 'v' && token[1] == 't' && IS_SPACE((token[2]))) {
+      token += 3;
+      real_t x, y;
+      parseReal2(&x, &y, &token);
+      vt.push_back(x);
+      vt.push_back(y);
+      continue;
+    }
+
+    // skin weight. tinyobj extension
+    if (token[0] == 'v' && token[1] == 'w' && IS_SPACE((token[2]))) {
+      token += 3;
+
+      // vw <vid> <joint_0> <weight_0> <joint_1> <weight_1> ...
+      // example:
+      // vw 0 0 0.25 1 0.25 2 0.5
+
+      // TODO(syoyo): Add syntax check
+      int vid = 0;
+      vid = parseInt(&token);
+
+      skin_weight_t sw;
+
+      sw.vertex_id = vid;
+
+      while (!IS_NEW_LINE(token[0])) {
+        real_t j, w;
+        // joint_id should not be negative, weight may be negative
+        // TODO(syoyo): # of elements check
+        parseReal2(&j, &w, &token, -1.0);
+
+        if (j < static_cast<real_t>(0)) {
+          if (err) {
+            std::stringstream ss;
+            ss << "Failed parse `vw' line. joint_id is negative. "
+                  "line "
+               << line_num << ".)\n";
+            (*err) += ss.str();
+          }
+          return false;
+        }
+
+        joint_and_weight_t jw;
+
+        jw.joint_id = int(j);
+        jw.weight = w;
+
+        sw.weightValues.push_back(jw);
+
+        size_t n = strspn(token, " \t\r");
+        token += n;
+      }
+
+      vw.push_back(sw);
+
+      // The line is fully consumed; do not fall through to the other
+      // statement handlers (every other handler ends the same way).
+      continue;
+    }
+
+    // line
+    if (token[0] == 'l' && IS_SPACE((token[1]))) {
+      token += 2;
+
+      __line_t line;
+
+      while (!IS_NEW_LINE(token[0])) {
+        vertex_index_t vi;
+        if (!parseTriple(&token, static_cast<int>(v.size() / 3),
+                         static_cast<int>(vn.size() / 3),
+                         static_cast<int>(vt.size() / 2), &vi)) {
+          if (err) {
+            std::stringstream ss;
+            ss << "Failed parse `l' line(e.g. zero value for vertex index. "
+                  "line "
+               << line_num << ".)\n";
+            (*err) += ss.str();
+          }
+          return false;
+        }
+
+        line.vertex_indices.push_back(vi);
+
+        size_t n = strspn(token, " \t\r");
+        token += n;
+      }
+
+      prim_group.lineGroup.push_back(line);
+
+      continue;
+    }
+
+    // points
+    if (token[0] == 'p' && IS_SPACE((token[1]))) {
+      token += 2;
+
+      __points_t pts;
+
+      while (!IS_NEW_LINE(token[0])) {
+        vertex_index_t vi;
+        if (!parseTriple(&token, static_cast<int>(v.size() / 3),
+                         static_cast<int>(vn.size() / 3),
+                         static_cast<int>(vt.size() / 2), &vi)) {
+          if (err) {
+            std::stringstream ss;
+            ss << "Failed parse `p' line(e.g. zero value for vertex index. "
+                  "line "
+               << line_num << ".)\n";
+            (*err) += ss.str();
+          }
+          return false;
+        }
+
+        pts.vertex_indices.push_back(vi);
+
+        size_t n = strspn(token, " \t\r");
+        token += n;
+      }
+
+      prim_group.pointsGroup.push_back(pts);
+
+      continue;
+    }
+
+    // face
+    if (token[0] == 'f' && IS_SPACE((token[1]))) {
+      token += 2;
+      token += strspn(token, " \t");
+
+      face_t face;
+
+      face.smoothing_group_id = current_smoothing_id;
+      face.vertex_indices.reserve(3);
+
+      while (!IS_NEW_LINE(token[0])) {
+        vertex_index_t vi;
+        if (!parseTriple(&token, static_cast<int>(v.size() / 3),
+                         static_cast<int>(vn.size() / 3),
+                         static_cast<int>(vt.size() / 2), &vi)) {
+          if (err) {
+            std::stringstream ss;
+            ss << "Failed parse `f' line(e.g. zero value for face index. line "
+               << line_num << ".)\n";
+            (*err) += ss.str();
+          }
+          return false;
+        }
+
+        greatest_v_idx = greatest_v_idx > vi.v_idx ? greatest_v_idx : vi.v_idx;
+        greatest_vn_idx =
+            greatest_vn_idx > vi.vn_idx ? greatest_vn_idx : vi.vn_idx;
+        greatest_vt_idx =
+            greatest_vt_idx > vi.vt_idx ? greatest_vt_idx : vi.vt_idx;
+
+        face.vertex_indices.push_back(vi);
+        size_t n = strspn(token, " \t\r");
+        token += n;
+      }
+
+      // replace with emplace_back + std::move on C++11
+      prim_group.faceGroup.push_back(face);
+
+      continue;
+    }
+
+    // use mtl
+    if ((0 == strncmp(token, "usemtl", 6))) {
+      token += 6;
+      std::string namebuf = parseString(&token);
+
+      int newMaterialId = -1;
+      std::map<std::string, int>::const_iterator it =
+          material_map.find(namebuf);
+      if (it != material_map.end()) {
+        newMaterialId = it->second;
+      } else {
+        // { error!! material not found }
+        if (warn) {
+          (*warn) += "material [ '" + namebuf + "' ] not found in .mtl\n";
+        }
+      }
+
+      if (newMaterialId != material) {
+        // Create per-face material. Thus we don't add `shape` to `shapes` at
+        // this time.
+        // just clear `faceGroup` after `exportGroupsToShape()` call.
+        exportGroupsToShape(&shape, prim_group, tags, material, name,
+                            triangulate, v, warn);
+        prim_group.faceGroup.clear();
+        material = newMaterialId;
+      }
+
+      continue;
+    }
+
+    // load mtl
+    if ((0 == strncmp(token, "mtllib", 6)) && IS_SPACE((token[6]))) {
+      if (readMatFn) {
+        token += 7;
+
+        std::vector<std::string> filenames;
+        SplitString(std::string(token), ' ', '\\', filenames);
+
+        if (filenames.empty()) {
+          if (warn) {
+            std::stringstream ss;
+            ss << "Looks like empty filename for mtllib. Use default "
+                  "material (line "
+               << line_num << ".)\n";
+
+            (*warn) += ss.str();
+          }
+        } else {
+          bool found = false;
+          for (size_t s = 0; s < filenames.size(); s++) {
+            // Each material file is loaded at most once.
+            if (material_filenames.count(filenames[s]) > 0) {
+              found = true;
+              continue;
+            }
+
+            std::string warn_mtl;
+            std::string err_mtl;
+            bool ok = (*readMatFn)(filenames[s].c_str(), materials,
+                                   &material_map, &warn_mtl, &err_mtl);
+            if (warn && (!warn_mtl.empty())) {
+              (*warn) += warn_mtl;
+            }
+
+            if (err && (!err_mtl.empty())) {
+              (*err) += err_mtl;
+            }
+
+            if (ok) {
+              found = true;
+              material_filenames.insert(filenames[s]);
+              break;
+            }
+          }
+
+          if (!found) {
+            if (warn) {
+              (*warn) +=
+                  "Failed to load material file(s). Use default "
+                  "material.\n";
+            }
+          }
+        }
+      }
+
+      continue;
+    }
+
+    // group name
+    if (token[0] == 'g' && IS_SPACE((token[1]))) {
+      // flush previous face group.
+      bool ret = exportGroupsToShape(&shape, prim_group, tags, material, name,
+                                     triangulate, v, warn);
+      (void)ret;  // return value not used.
+
+      // Keep shapes that only carry lines or points as well, exactly like
+      // the `o` handler below; otherwise they are silently discarded when
+      // `shape` is reset.
+      if (shape.mesh.indices.size() > 0 || shape.lines.indices.size() > 0 ||
+          shape.points.indices.size() > 0) {
+        shapes->push_back(shape);
+      }
+
+      shape = shape_t();
+
+      // material = -1;
+      prim_group.clear();
+
+      std::vector<std::string> names;
+
+      while (!IS_NEW_LINE(token[0])) {
+        std::string str = parseString(&token);
+        names.push_back(str);
+        token += strspn(token, " \t\r");  // skip tag
+      }
+
+      // names[0] must be 'g'
+
+      if (names.size() < 2) {
+        // 'g' with empty names
+        if (warn) {
+          std::stringstream ss;
+          ss << "Empty group name. line: " << line_num << "\n";
+          (*warn) += ss.str();
+        }
+        // Reset the name whether or not the caller collects warnings, so the
+        // previous group's name does not leak into this group.
+        name = "";
+      } else {
+        std::stringstream ss;
+        ss << names[1];
+
+        // tinyobjloader does not support multiple groups for a primitive.
+        // Currently we concatenate multiple group names with a space to get
+        // single group name.
+
+        for (size_t i = 2; i < names.size(); i++) {
+          ss << " " << names[i];
+        }
+
+        name = ss.str();
+      }
+
+      continue;
+    }
+
+    // object name
+    if (token[0] == 'o' && IS_SPACE((token[1]))) {
+      // flush previous face group.
+      bool ret = exportGroupsToShape(&shape, prim_group, tags, material, name,
+                                     triangulate, v, warn);
+      (void)ret;  // return value not used.
+
+      if (shape.mesh.indices.size() > 0 || shape.lines.indices.size() > 0 ||
+          shape.points.indices.size() > 0) {
+        shapes->push_back(shape);
+      }
+
+      // material = -1;
+      prim_group.clear();
+      shape = shape_t();
+
+      // @todo { multiple object name? }
+      token += 2;
+      std::stringstream ss;
+      ss << token;
+      name = ss.str();
+
+      continue;
+    }
+
+    // tag
+    if (token[0] == 't' && IS_SPACE(token[1])) {
+      const int max_tag_nums = 8192;  // FIXME(syoyo): Parameterize.
+      tag_t tag;
+
+      token += 2;
+
+      tag.name = parseString(&token);
+
+      tag_sizes ts = parseTagTriple(&token);
+
+      // Clamp the declared counts to [0, max_tag_nums] before resizing.
+      if (ts.num_ints < 0) {
+        ts.num_ints = 0;
+      }
+      if (ts.num_ints > max_tag_nums) {
+        ts.num_ints = max_tag_nums;
+      }
+
+      if (ts.num_reals < 0) {
+        ts.num_reals = 0;
+      }
+      if (ts.num_reals > max_tag_nums) {
+        ts.num_reals = max_tag_nums;
+      }
+
+      if (ts.num_strings < 0) {
+        ts.num_strings = 0;
+      }
+      if (ts.num_strings > max_tag_nums) {
+        ts.num_strings = max_tag_nums;
+      }
+
+      tag.intValues.resize(static_cast<size_t>(ts.num_ints));
+
+      for (size_t i = 0; i < static_cast<size_t>(ts.num_ints); ++i) {
+        tag.intValues[i] = parseInt(&token);
+      }
+
+      tag.floatValues.resize(static_cast<size_t>(ts.num_reals));
+      for (size_t i = 0; i < static_cast<size_t>(ts.num_reals); ++i) {
+        tag.floatValues[i] = parseReal(&token);
+      }
+
+      tag.stringValues.resize(static_cast<size_t>(ts.num_strings));
+      for (size_t i = 0; i < static_cast<size_t>(ts.num_strings); ++i) {
+        tag.stringValues[i] = parseString(&token);
+      }
+
+      tags.push_back(tag);
+
+      continue;
+    }
+
+    if (token[0] == 's' && IS_SPACE(token[1])) {
+      // smoothing group id
+      token += 2;
+
+      // skip space.
+      token += strspn(token, " \t");  // skip space
+
+      if (token[0] == '\0') {
+        continue;
+      }
+
+      // Nothing but a line terminator follows: no id given.
+      if (token[0] == '\r' || token[0] == '\n') {
+        continue;
+      }
+
+      if (strlen(token) >= 3 && token[0] == 'o' && token[1] == 'f' &&
+          token[2] == 'f') {
+        current_smoothing_id = 0;
+      } else {
+        // assume number
+        int smGroupId = parseInt(&token);
+        if (smGroupId < 0) {
+          // parse error. force set to 0.
+          // FIXME(syoyo): Report warning.
+          current_smoothing_id = 0;
+        } else {
+          current_smoothing_id = static_cast<unsigned int>(smGroupId);
+        }
+      }
+
+      continue;
+    }  // smoothing group id
+
+    // Ignore unknown command.
+  }
+
+  // not all vertices have colors, no default colors desired? -> clear colors
+  if (!found_all_colors && !default_vcols_fallback) {
+    vc.clear();
+  }
+
+  if (greatest_v_idx >= static_cast<int>(v.size() / 3)) {
+    if (warn) {
+      std::stringstream ss;
+      ss << "Vertex indices out of bounds (line " << line_num << ".)\n\n";
+      (*warn) += ss.str();
+    }
+  }
+  if (greatest_vn_idx >= static_cast<int>(vn.size() / 3)) {
+    if (warn) {
+      std::stringstream ss;
+      ss << "Vertex normal indices out of bounds (line " << line_num << ".)\n\n";
+      (*warn) += ss.str();
+    }
+  }
+  if (greatest_vt_idx >= static_cast<int>(vt.size() / 2)) {
+    if (warn) {
+      std::stringstream ss;
+      ss << "Vertex texcoord indices out of bounds (line " << line_num << ".)\n\n";
+      (*warn) += ss.str();
+    }
+  }
+
+  bool ret = exportGroupsToShape(&shape, prim_group, tags, material, name,
+                                 triangulate, v, warn);
+  // exportGroupsToShape return false when `usemtl` is called in the last
+  // line.
+  // we also add `shape` to `shapes` when `shape.mesh` has already some
+  // faces(indices)
+  if (ret || shape.mesh.indices
+                 .size()) {  // FIXME(syoyo): Support other prims(e.g. lines)
+    shapes->push_back(shape);
+  }
+  prim_group.clear();  // for safety
+
+  attrib->vertices.swap(v);
+  attrib->normals.swap(vn);
+  attrib->texcoords.swap(vt);
+  attrib->colors.swap(vc);
+  attrib->skin_weights.swap(vw);
+
+  // This parser never collects per-vertex weights or texcoord `w` values.
+  // Empty them explicitly: swapping with `v`/`vt` again would hand the
+  // attrib's *previous* vertices/texcoords to these arrays when `attrib` is
+  // reused across calls.
+  attrib->vertex_weights.clear();
+  attrib->texcoord_ws.clear();
+
+  return true;
+}
+
+/// Parses Wavefront .obj text from `inStream` and reports each recognised
+/// statement through the function pointers in `callback`.
+///
+/// Handled statements: `v`, `vn`, `vt`, `f`, `usemtl`, `mtllib`, `g` and `o`;
+/// anything else is ignored. A callback that is NULL is simply not invoked.
+///
+/// @param user_data  Passed unchanged as the first argument of every callback.
+/// @param readMatFn  Optional material reader; `mtllib` lines are skipped when
+///                   it is NULL.
+/// @param warn       Optional; warnings are appended.
+/// @param err        Optional; error text produced by `readMatFn` is appended.
+/// @return always true.
+bool LoadObjWithCallback(std::istream &inStream, const callback_t &callback,
+                         void *user_data /*= NULL*/,
+                         MaterialReader *readMatFn /*= NULL*/,
+                         std::string *warn, /* = NULL*/
+                         std::string *err /*= NULL*/) {
+  // material
+  std::set<std::string> material_filenames;
+  std::map<std::string, int> material_map;
+  int material_id = -1;  // -1 = invalid
+
+  std::vector<index_t> indices;
+  std::vector<material_t> materials;
+  std::vector<std::string> names;
+  names.reserve(2);
+  std::vector<const char *> names_out;
+
+  std::string linebuf;
+  while (inStream.peek() != -1) {
+    safeGetline(inStream, linebuf);
+
+    // Trim newline '\r\n' or '\n'
+    if (linebuf.size() > 0) {
+      if (linebuf[linebuf.size() - 1] == '\n')
+        linebuf.erase(linebuf.size() - 1);
+    }
+    if (linebuf.size() > 0) {
+      if (linebuf[linebuf.size() - 1] == '\r')
+        linebuf.erase(linebuf.size() - 1);
+    }
+
+    // Skip if empty line.
+    if (linebuf.empty()) {
+      continue;
+    }
+
+    // Skip leading space.
+    const char *token = linebuf.c_str();
+    token += strspn(token, " \t");
+
+    assert(token);
+    if (token[0] == '\0') continue;  // empty line
+
+    if (token[0] == '#') continue;  // comment line
+
+    // vertex
+    if (token[0] == 'v' && IS_SPACE((token[1]))) {
+      token += 2;
+      // TODO(syoyo): Support parsing vertex color extension.
+      real_t x, y, z, w;  // w is optional. default = 1.0
+      parseV(&x, &y, &z, &w, &token);
+      if (callback.vertex_cb) {
+        callback.vertex_cb(user_data, x, y, z, w);
+      }
+      continue;
+    }
+
+    // normal
+    if (token[0] == 'v' && token[1] == 'n' && IS_SPACE((token[2]))) {
+      token += 3;
+      real_t x, y, z;
+      parseReal3(&x, &y, &z, &token);
+      if (callback.normal_cb) {
+        callback.normal_cb(user_data, x, y, z);
+      }
+      continue;
+    }
+
+    // texcoord
+    if (token[0] == 'v' && token[1] == 't' && IS_SPACE((token[2]))) {
+      token += 3;
+      real_t x, y, z;  // y and z are optional. default = 0.0
+      parseReal3(&x, &y, &z, &token);
+      if (callback.texcoord_cb) {
+        callback.texcoord_cb(user_data, x, y, z);
+      }
+      continue;
+    }
+
+    // face
+    if (token[0] == 'f' && IS_SPACE((token[1]))) {
+      token += 2;
+      token += strspn(token, " \t");
+
+      indices.clear();
+      while (!IS_NEW_LINE(token[0])) {
+        vertex_index_t vi = parseRawTriple(&token);
+
+        index_t idx;
+        idx.vertex_index = vi.v_idx;
+        idx.normal_index = vi.vn_idx;
+        idx.texcoord_index = vi.vt_idx;
+
+        indices.push_back(idx);
+        size_t n = strspn(token, " \t\r");
+        token += n;
+      }
+
+      if (callback.index_cb && indices.size() > 0) {
+        callback.index_cb(user_data, &indices.at(0),
+                          static_cast<int>(indices.size()));
+      }
+
+      continue;
+    }
+
+    // use mtl
+    if ((0 == strncmp(token, "usemtl", 6)) && IS_SPACE((token[6]))) {
+      token += 7;
+      std::stringstream ss;
+      ss << token;
+      std::string namebuf = ss.str();
+
+      int newMaterialId = -1;
+      std::map<std::string, int>::const_iterator it =
+          material_map.find(namebuf);
+      if (it != material_map.end()) {
+        newMaterialId = it->second;
+      } else {
+        // { warn!! material not found }
+        if (warn && (!callback.usemtl_cb)) {
+          (*warn) += "material [ " + namebuf + " ] not found in .mtl\n";
+        }
+      }
+
+      material_id = newMaterialId;
+
+      if (callback.usemtl_cb) {
+        callback.usemtl_cb(user_data, namebuf.c_str(), material_id);
+      }
+
+      continue;
+    }
+
+    // load mtl
+    if ((0 == strncmp(token, "mtllib", 6)) && IS_SPACE((token[6]))) {
+      if (readMatFn) {
+        token += 7;
+
+        std::vector<std::string> filenames;
+        SplitString(std::string(token), ' ', '\\', filenames);
+
+        if (filenames.empty()) {
+          if (warn) {
+            (*warn) +=
+                "Looks like empty filename for mtllib. Use default "
+                "material. \n";
+          }
+        } else {
+          bool found = false;
+          for (size_t s = 0; s < filenames.size(); s++) {
+            // Each material file is loaded at most once.
+            if (material_filenames.count(filenames[s]) > 0) {
+              found = true;
+              continue;
+            }
+
+            std::string warn_mtl;
+            std::string err_mtl;
+            bool ok = (*readMatFn)(filenames[s].c_str(), &materials,
+                                   &material_map, &warn_mtl, &err_mtl);
+
+            if (warn && (!warn_mtl.empty())) {
+              (*warn) += warn_mtl;  // This should be warn message.
+            }
+
+            if (err && (!err_mtl.empty())) {
+              (*err) += err_mtl;
+            }
+
+            if (ok) {
+              found = true;
+              material_filenames.insert(filenames[s]);
+              break;
+            }
+          }
+
+          if (!found) {
+            if (warn) {
+              (*warn) +=
+                  "Failed to load material file(s). Use default "
+                  "material.\n";
+            }
+          } else {
+            if (callback.mtllib_cb) {
+              // A reader may report success without adding any material;
+              // `materials.at(0)` would then throw std::out_of_range. Report
+              // an empty list instead, like the empty-group case below.
+              if (materials.empty()) {
+                callback.mtllib_cb(user_data, NULL, 0);
+              } else {
+                callback.mtllib_cb(user_data, &materials.at(0),
+                                   static_cast<int>(materials.size()));
+              }
+            }
+          }
+        }
+      }
+
+      continue;
+    }
+
+    // group name
+    if (token[0] == 'g' && IS_SPACE((token[1]))) {
+      names.clear();
+
+      while (!IS_NEW_LINE(token[0])) {
+        std::string str = parseString(&token);
+        names.push_back(str);
+        token += strspn(token, " \t\r");  // skip tag
+      }
+
+      // names[0] is the `g` keyword itself.
+      assert(names.size() > 0);
+
+      if (callback.group_cb) {
+        if (names.size() > 1) {
+          // create const char* array.
+          names_out.resize(names.size() - 1);
+          for (size_t j = 0; j < names_out.size(); j++) {
+            names_out[j] = names[j + 1].c_str();
+          }
+          callback.group_cb(user_data, &names_out.at(0),
+                            static_cast<int>(names_out.size()));
+
+        } else {
+          callback.group_cb(user_data, NULL, 0);
+        }
+      }
+
+      continue;
+    }
+
+    // object name
+    if (token[0] == 'o' && IS_SPACE((token[1]))) {
+      // @todo { multiple object name? }
+      token += 2;
+
+      std::stringstream ss;
+      ss << token;
+      std::string object_name = ss.str();
+
+      if (callback.object_cb) {
+        callback.object_cb(user_data, object_name.c_str());
+      }
+
+      continue;
+    }
+
+#if 0  // @todo
+    if (token[0] == 't' && IS_SPACE(token[1])) {
+      tag_t tag;
+
+      token += 2;
+      std::stringstream ss;
+      ss << token;
+      tag.name = ss.str();
+
+      token += tag.name.size() + 1;
+
+      tag_sizes ts = parseTagTriple(&token);
+
+      tag.intValues.resize(static_cast<size_t>(ts.num_ints));
+
+      for (size_t i = 0; i < static_cast<size_t>(ts.num_ints); ++i) {
+        tag.intValues[i] = atoi(token);
+        token += strcspn(token, "/ \t\r") + 1;
+      }
+
+      tag.floatValues.resize(static_cast<size_t>(ts.num_reals));
+      for (size_t i = 0; i < static_cast<size_t>(ts.num_reals); ++i) {
+        tag.floatValues[i] = parseReal(&token);
+        token += strcspn(token, "/ \t\r") + 1;
+      }
+
+      tag.stringValues.resize(static_cast<size_t>(ts.num_strings));
+      for (size_t i = 0; i < static_cast<size_t>(ts.num_strings); ++i) {
+        std::stringstream ss;
+        ss << token;
+        tag.stringValues[i] = ss.str();
+        token += tag.stringValues[i].size() + 1;
+      }
+
+      tags.push_back(tag);
+    }
+#endif
+
+    // Ignore unknown command.
+  }
+
+  return true;
+}
+
+/// Loads `filename` into this reader via LoadObj() and records the outcome in
+/// `valid_`.
+///
+/// Material files are searched in `config.mtl_search_path` when it is set;
+/// otherwise in the part of `filename` that precedes its last '/' or '\\'
+/// (left empty when `filename` has no such separator).
+bool ObjReader::ParseFromFile(const std::string &filename,
+                              const ObjReaderConfig &config) {
+  std::string mtl_search_path = config.mtl_search_path;
+
+  if (mtl_search_path.empty()) {
+    // Fall back to the directory that contains the .obj file. Both the
+    // unix-style and the windows-style separator are accepted.
+    const size_t last_sep = filename.find_last_of("/\\");
+    if (last_sep != std::string::npos) {
+      mtl_search_path.assign(filename, 0, last_sep);
+    }
+  }
+
+  valid_ = LoadObj(&attrib_, &shapes_, &materials_, &warning_, &error_,
+                   filename.c_str(), mtl_search_path.c_str(),
+                   config.triangulate, config.vertex_color);
+  return valid_;
+}
+
+/// Parses in-memory .obj text, resolving every material lookup against the
+/// in-memory `mtl_text`, and records the outcome in `valid_`.
+bool ObjReader::ParseFromString(const std::string &obj_text,
+                                const std::string &mtl_text,
+                                const ObjReaderConfig &config) {
+  // Read-only streams over the two texts.
+  std::istringstream obj_stream(obj_text);
+  std::istringstream mtl_stream(mtl_text);
+
+  MaterialStreamReader mtl_reader(mtl_stream);
+
+  valid_ = LoadObj(&attrib_, &shapes_, &materials_, &warning_, &error_,
+                   &obj_stream, &mtl_reader, config.triangulate,
+                   config.vertex_color);
+  return valid_;
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+}  // namespace tinyobj
+
+#endif
diff --git a/img/anti_alias.png b/img/anti_alias.png
new file mode 100644
index 00000000..3c4aad62
Binary files /dev/null and b/img/anti_alias.png differ
diff --git a/img/diff_metalness.png b/img/diff_metalness.png
new file mode 100644
index 00000000..47a4a467
Binary files /dev/null and b/img/diff_metalness.png differ
diff --git a/img/diff_roughness.png b/img/diff_roughness.png
new file mode 100644
index 00000000..0735fd60
Binary files /dev/null and b/img/diff_roughness.png differ
diff --git a/img/direct_lighting.png b/img/direct_lighting.png
new file mode 100644
index 00000000..2b74ba0b
Binary files /dev/null and b/img/direct_lighting.png differ
diff --git a/img/final_picture.png b/img/final_picture.png
new file mode 100644
index 00000000..aaf84066
Binary files /dev/null and b/img/final_picture.png differ
diff --git a/img/no_anti_alias.png b/img/no_anti_alias.png
new file mode 100644
index 00000000..72860141
Binary files /dev/null and b/img/no_anti_alias.png differ
diff --git a/scenes/TestObj.mtl b/scenes/TestObj.mtl
new file mode 100644
index 00000000..c924254d
--- /dev/null
+++ b/scenes/TestObj.mtl
@@ -0,0 +1,22 @@
+# Blender MTL File: 'TestObj.blend'
+# Material Count: 2
+
+newmtl InnerMat
+Ns 225.000000
+Ka 1.000000 1.000000 1.000000
+Kd 0.796 0.62 0.482
+Ks 0.500000 0.500000 0.500000
+Ke 0.000000 0.000000 0.000000
+Ni 1.000000
+d 1.000000
+illum 2
+
+newmtl OuterMat
+Ns 225.000000
+Ka 1.000000 1.000000 1.000000
+Kd 0.796 0.62 0.482
+Ks 0.500000 0.500000 0.500000
+Ke 0.000000 0.000000 0.000000
+Ni 1.000000
+d 1.000000
+illum 2
diff --git a/scenes/axe.mtl b/scenes/axe.mtl
new file mode 100644
index 00000000..bd36cc33
--- /dev/null
+++ b/scenes/axe.mtl
@@ -0,0 +1,12 @@
+# Blender MTL File: 'axe.blend'
+# Material Count: 1
+
+newmtl Axe
+Ns 96.078431
+Ka 1.000000 1.000000 1.000000
+Kd 0.640000 0.640000 0.640000
+Ks 0.500000 0.500000 0.500000
+Ke 0.000000 0.000000 0.000000
+Ni 1.000000
+d 1.000000
+illum 2
diff --git a/scenes/bunny.mtl b/scenes/bunny.mtl
new file mode 100644
index 00000000..f231bdf4
--- /dev/null
+++ b/scenes/bunny.mtl
@@ -0,0 +1,10 @@
+# Blender MTL File: 'None'
+# Material Count: 1
+
+newmtl None
+Ns 500
+Ka 0.8 0.8 0.8
+Kd 0.8 0.8 0.8
+Ks 0.8 0.8 0.8
+d 1
+illum 2
diff --git a/scenes/cornell.txt b/scenes/cornell.txt
index 83ff8202..bd0e203b 100644
--- a/scenes/cornell.txt
+++ b/scenes/cornell.txt
@@ -48,6 +48,17 @@ REFR        0
 REFRIOR     0
 EMITTANCE   0
 
+// Specular and refraction white
+MATERIAL 5
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        1
+REFRIOR     1.5
+EMITTANCE   0
+
+
 // Camera
 CAMERA
 RES         800 800
@@ -55,9 +66,11 @@ FOVY        45
 ITERATIONS  5000
 DEPTH       8
 FILE        cornell
-EYE         0.0 5 10.5
+EYE         0.0 5 7.5
 LOOKAT      0 5 0
 UP          0 1 0
+FOCAL       5	
+LENSE       2
 
 
 // Ceiling light
@@ -68,6 +81,8 @@ TRANS       0 10 0
 ROTAT       0 0 0
 SCALE       3 .3 3
 
+
+
 // Floor
 OBJECT 1
 cube
@@ -108,10 +123,16 @@ TRANS       5 5 0
 ROTAT       0 0 0
 SCALE       .01 10 10
 
-// Sphere
+// floor light
 OBJECT 6
-sphere
-material 4
-TRANS       -1 4 -1
+cube
+material 0
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .3 3 3
+
+OBJECT_obj ../scenes/bunny.obj
+TRANS       0 3 0
 ROTAT       0 0 0
-SCALE       3 3 3
+SCALE       20 20 20
+MATERIAL	6
diff --git a/scenes/cornell_box.mtl b/scenes/cornell_box.mtl
new file mode 100644
index 00000000..d3a1c7a6
--- /dev/null
+++ b/scenes/cornell_box.mtl
@@ -0,0 +1,24 @@
+newmtl white
+Ka 0 0 0
+Kd 1 1 1
+Ks 0 0 0
+
+newmtl red
+Ka 0 0 0
+Kd 1 0 0
+Ks 0 0 0
+
+newmtl green
+Ka 0 0 0
+Kd 0 1 0
+Ks 0 0 0
+
+newmtl blue
+Ka 0 0 0
+Kd 0 0 1
+Ks 0 0 0
+
+newmtl light
+Ka 20 20 20
+Kd 1 1 1
+Ks 0 0 0
diff --git a/scenes/cube.mtl b/scenes/cube.mtl
new file mode 100644
index 00000000..d3a1c7a6
--- /dev/null
+++ b/scenes/cube.mtl
@@ -0,0 +1,24 @@
+newmtl white
+Ka 0 0 0
+Kd 1 1 1
+Ks 0 0 0
+
+newmtl red
+Ka 0 0 0
+Kd 1 0 0
+Ks 0 0 0
+
+newmtl green
+Ka 0 0 0
+Kd 0 1 0
+Ks 0 0 0
+
+newmtl blue
+Ka 0 0 0
+Kd 0 0 1
+Ks 0 0 0
+
+newmtl light
+Ka 20 20 20
+Kd 1 1 1
+Ks 0 0 0
diff --git a/scenes/microfacet_test.txt b/scenes/microfacet_test.txt
new file mode 100644
index 00000000..c1786ff8
--- /dev/null
+++ b/scenes/microfacet_test.txt
@@ -0,0 +1,70 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Microfacet white
+MATERIAL 2
+RGB         0.2 0.2 0.2
+MICROFACET	1
+ROUGHNESS	0.2
+offset
+offset
+offset
+offset
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 7.5
+LOOKAT      0 5 0
+UP          0 1 0
+FOCAL       10	
+LENSE       0.3
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+
+
+// Sphere, microfacet (material 2)
+OBJECT 1
+sphere
+material 2
+TRANS       3 2 -1
+ROTAT       0 0 0
+SCALE       3 3 3
+
+// OBJ
+OBJECT_obj ../scenes/bunny.obj
+material	2
+TRANS       0 3 2
+ROTAT       0 0 0
+SCALE	      0.6 0.6 0.6
+TEXTURE	../scenes/wahoo.bmp
+
diff --git a/scenes/motion.txt b/scenes/motion.txt
new file mode 100644
index 00000000..13d3ca9c
--- /dev/null
+++ b/scenes/motion.txt
@@ -0,0 +1,133 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular and refraction white
+MATERIAL 5
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        1
+REFRIOR     1.5
+EMITTANCE   0
+
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 7.5
+LOOKAT      0 5 0
+UP          0 1 0
+FOCAL       5	
+LENSE       2
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere, diffuse white, moving (see ENDPOS)
+OBJECT 6
+sphere
+material 1
+TRANS       2 5 -1
+ROTAT       0 0 0
+SCALE       3 3 3
+ENDPOS      -2 4 1
diff --git a/scenes/obj_test.txt b/scenes/obj_test.txt
new file mode 100644
index 00000000..180752d6
--- /dev/null
+++ b/scenes/obj_test.txt
@@ -0,0 +1,236 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   3
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular and refraction white
+MATERIAL 5
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        1
+REFRIOR     1.5
+EMITTANCE   0
+
+// bunny 1
+MATERIAL 6
+REFL        1
+REFR        1
+REFRIOR     1.5
+EMITTANCE	0
+MICROFACET	1
+ROUGHNESS	0.2
+METALNESS	0.8	
+
+// bunny 2
+MATERIAL 7
+REFL        1
+REFR        1
+REFRIOR     1.5
+EMITTANCE	0
+MICROFACET	1
+ROUGHNESS	0.2
+METALNESS	0.6	
+
+// bunny 3
+MATERIAL 8
+REFL        1
+REFR        1
+REFRIOR     1.5
+EMITTANCE	0
+MICROFACET	1
+ROUGHNESS	0.2
+METALNESS	0.2	
+
+// Emissive material (light)
+MATERIAL 9
+RGB         1 0 0
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   3
+
+// Emissive material (light)
+MATERIAL 10
+RGB         0 1 0
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   3
+
+// Emissive material (light)
+MATERIAL 11
+RGB         0.35 0.45 0.55
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   2
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 5.5
+LOOKAT      0 5 0
+UP          0 1 0
+FOCAL       10	
+LENSE       0.3
+
+// Floor
+OBJECT 0
+cube
+material 1
+TRANS       0 0.5 0
+ROTAT       0 0 0
+SCALE       20 .01 10
+
+// Ceiling
+//OBJECT 3
+//cube
+//material 1
+//TRANS       0 10 0
+//ROTAT       0 0 90
+//SCALE       .01 20 10
+
+OBJECT 1
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 00
+SCALE       20 0.1 10
+
+// Back wall
+OBJECT 2
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 0 0
+SCALE       20 10 .1
+
+// Left wall
+OBJECT 3
+cube
+material 1
+TRANS       -10 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 4
+cube
+material 1
+TRANS       10 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+//diffuse ball
+OBJECT 5
+sphere
+material 1
+TRANS       -5 5 -3
+ROTAT       0 0 0
+SCALE       3 3 3
+
+//mirror
+OBJECT 6
+sphere
+material 4
+TRANS       5 5 -3
+ROTAT       0 0 0
+SCALE       3 3 3
+
+//reflective/refractive
+OBJECT 7
+sphere
+material 5
+TRANS       0 5 -3
+ROTAT       0 0 0
+SCALE       3 3 3
+
+// lights...
+// Ceiling light 1
+OBJECT 8
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 5
+
+// Ceiling light 2
+OBJECT 9
+cube
+material 9
+TRANS       -5 10 0
+ROTAT       0 0 0
+SCALE       3 .3 5
+
+// Ceiling light 3
+OBJECT 10
+cube
+material 10
+TRANS       5 10 0
+ROTAT       0 0 0
+SCALE       3 .3 5
+
+// OBJ 1
+OBJECT_obj ../scenes/bunny.obj
+TRANS       -2 0 0
+ROTAT       0 0 0
+SCALE       8 8 8
+MATERIAL	6
+
diff --git a/scenes/sphere.txt b/scenes/sphere.txt
index a74b5458..f82d3940 100644
--- a/scenes/sphere.txt
+++ b/scenes/sphere.txt
@@ -6,7 +6,58 @@ SPECRGB     0 0 0
 REFL        0
 REFR        0
 REFRIOR     0
-EMITTANCE   5
+EMITTANCE   7
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular and refraction white
+MATERIAL 5
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        1
+REFRIOR     1.5
+EMITTANCE   0
+
 
 // Camera
 CAMERA
@@ -14,15 +65,69 @@ RES         800 800
 FOVY        45
 ITERATIONS  5000
 DEPTH       8
-FILE        sphere
-EYE         0.0 5 10.5
+FILE        cornell
+EYE         0.0 5 7.5
 LOOKAT      0 5 0
 UP          0 1 0
+FOCAL       5	
+LENSE       2
 
-// Sphere
+
+// Ceiling light
 OBJECT 0
-sphere
+cube
 material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+
+
+// Floor
+OBJECT 1
+cube
+material 1
 TRANS       0 0 0
 ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere, diffuse white, moving (see ENDPOS)
+OBJECT 6
+sphere
+material 1
+TRANS       2 5 -1
+ROTAT       0 0 0
 SCALE       3 3 3
+ENDPOS      -2 4 1
\ No newline at end of file
diff --git a/scenes/wahoo.bmp b/scenes/wahoo.bmp
new file mode 100644
index 00000000..bf1598d9
Binary files /dev/null and b/scenes/wahoo.bmp differ
diff --git a/scenes/wahoo.mtl b/scenes/wahoo.mtl
new file mode 100644
index 00000000..c89145d9
--- /dev/null
+++ b/scenes/wahoo.mtl
@@ -0,0 +1,6 @@
+newmtl initialShadingGroup
+illum 4
+Kd 0.50 0.50 0.50
+Ka 0.00 0.00 0.00
+Tf 1.00 1.00 1.00
+Ni 1.00
diff --git a/src/interactions.h b/src/interactions.h
index f969e458..2686b407 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -66,6 +66,7 @@ glm::vec3 calculateRandomDirectionInHemisphere(
  *
  * You may need to change the parameter list for your purposes!
  */
+
 __host__ __device__
 void scatterRay(
         PathSegment & pathSegment,
@@ -76,4 +77,242 @@ void scatterRay(
     // TODO: implement this.
     // A basic implementation of pure-diffuse shading will just call the
     // calculateRandomDirectionInHemisphere defined above.
+
+    //pure diffuse: continue in a random direction of the hemisphere around the normal
+    if (!m.hasReflective && !m.hasRefractive) {
+        auto direction = glm::normalize(calculateRandomDirectionInHemisphere(normal, rng));
+        pathSegment.ray.direction = direction;
+        pathSegment.ray.origin = intersect + 0.0001f * normal; //nudge off the surface
+        pathSegment.color *= m.color;
+    }
+    //perfect reflective: mirror the incoming direction about the normal
+    else if (m.hasReflective && !m.hasRefractive) {
+        glm::vec3 reflection = glm::reflect(pathSegment.ray.direction, normal);
+        pathSegment.ray.direction = reflection;
+        pathSegment.ray.origin = intersect + 0.0001f * normal;
+        pathSegment.color *= m.color;
+    }
+    //both reflection and refraction: pick one per sample, weighted by Fresnel
+    //(a material with REFR but without REFL matches no branch and is left as is)
+    else if (m.hasReflective && m.hasRefractive) {
+        glm::vec3 incident = pathSegment.ray.direction;
+        float cos_theta = glm::dot(normal, -incident);
+        float n1 = 1.f; //vacuum to object unless the test below says otherwise
+        float n2 = m.indexOfRefraction;
+        if (cos_theta < 0) { //object to vacuum
+            //turn the normal against the ray and recompute the cosine so it is
+            //positive; Schlick's (1 - cos)^5 term is only meaningful for cos in [0, 1]
+            normal = glm::normalize(-normal);
+            cos_theta = glm::dot(normal, -incident);
+            n1 = m.indexOfRefraction;
+            n2 = 1.f;
+        }
+        //schlick's approximation
+        float R0 = (n1 - n2) / (n1 + n2);
+        R0 = R0 * R0;
+        float Fresnel_term = R0 + (1 - R0) * pow(1 - cos_theta, 5);
+        //glm::refract yields the zero vector on total internal reflection;
+        //normalizing that would give NaN, so such rays are always reflected
+        glm::vec3 refraction = glm::refract(incident, normal, n1 / n2);
+        bool totalInternal = glm::dot(refraction, refraction) == 0.f;
+        thrust::uniform_real_distribution<float> u01(0, 1);
+        if (totalInternal || u01(rng) < Fresnel_term) {//reflection
+            pathSegment.ray.direction = glm::reflect(incident, normal);
+            pathSegment.ray.origin = intersect + 0.0001f * normal;
+        }
+        else {//refraction
+            pathSegment.ray.direction = glm::normalize(refraction);
+            pathSegment.ray.origin = intersect + 0.001f * pathSegment.ray.direction;
+        }
+        pathSegment.color *= m.color;
+    }
+
+}
+
+__host__ __device__
+float Fresnel_Schlicks(float n1, float n2, float cos_theta) {
+    // Schlick: F = R0 + (1 - R0) * (1 - cos)^5 with R0 = ((n1 - n2) / (n1 + n2))^2.
+    float ratio = (n1 - n2) / (n1 + n2);
+    float reflectanceAtNormal = ratio * ratio;
+    return reflectanceAtNormal + (1 - reflectanceAtNormal) * pow(1 - cos_theta, 5);
+}
+
+// Helper functions for the microfacet reflection model
+// (Fresnel, G_Schlicks / Geometry_Smith, D_GGX).
+__device__
+glm::vec3 Fresnel(glm::vec3 R0, float cos_theta) {
+    // Schlick's approximation applied per colour channel: R0 + (1 - R0) * (1 - cos)^5.
+    float grazing = (float)pow(1 - cos_theta, 5);
+    return R0 + (1.f - R0) * grazing;
+}
+
+
+__device__
+float G_Schlicks(float roughness, glm::vec3 normal, glm::vec3 view)
+{
+    // Schlick-style geometry term c / (c * (1 - k) + k) with k = (roughness + 1)^2 / 8,
+    // where c is the cosine between the two (normalized) input vectors.
+    const float k = (roughness + 1) * (roughness + 1) / 8;
+    const float c = glm::dot(glm::normalize(normal), glm::normalize(view));
+    return c / (c * (1 - k) + k); }
+
+__device__
+float Geometry_Smith(float roughness, glm::vec3 light, glm::vec3 view, glm::vec3 half)
+{
+    // Product of the Schlick terms for light and view, each taken against `half`.
+    const glm::vec3 unitHalf = glm::normalize(half);
+    const float lightTerm = G_Schlicks(roughness, glm::normalize(light), unitHalf);
+    return lightTerm * G_Schlicks(roughness, glm::normalize(view), unitHalf);
+}
+
+
+
+__device__
+float D_GGX(float roughness, glm::vec3 normal, glm::vec3 half)
+{
+    // GGX normal distribution: a2 / (pi * ((n.h)^2 * (a2 - 1) + 1)^2).
+    // The squared width a2 is roughness^2, i.e. roughness itself plays the
+    // role of alpha (there is no additional squaring of the roughness).
+    const float alpha2 = roughness * roughness;
+    const float pi = 3.1415926;
+    const float cosNH = glm::dot(glm::normalize(normal), glm::normalize(half));
+    const float cosNH2 = cosNH * cosNH;
+    return alpha2 / (pi * pow(cosNH2 * (alpha2 - 1) + 1, 2));
+}
+
+__device__
+void scatterRay(
+    PathSegment& pathSegment,
+    ShadeableIntersection& intersection,
+    const Material& m,
+    thrust::default_random_engine& rng,
+    glm::vec3 camPos) {
+    // Scatters pathSegment at the hit described by `intersection`: chooses the
+    // origin and direction of the next ray and multiplies the surface response
+    // into pathSegment.color.
+    //   m      - material at the hit; selects the microfacet, diffuse, mirror
+    //            or reflective+refractive branch below
+    //   rng    - random engine behind every stochastic choice
+    //   camPos - not used; kept so existing callers keep compiling
+    // A material that is refractive but not reflective matches no branch and
+    // leaves pathSegment unchanged.
+
+    glm::vec3 intersect = getPointOnRay(pathSegment.ray, intersection.t);
+    glm::vec3 normal = intersection.surfaceNormal;
+
+    // A uv of (-1, -1) is treated as "this hit has no texture coordinates".
+    bool hasTexture = intersection.uv != glm::vec2(-1);
+
+    if (m.microfacet) {
+        float metalness = m.metalness;
+        float roughness = m.roughness;
+
+        auto wi = -pathSegment.ray.direction;
+        auto N = glm::normalize(normal);
+
+        // Sample the outgoing direction exactly once: a mirror bounce with
+        // probability `metalness`, a random hemisphere direction otherwise.
+        // The same wo feeds the BRDF and the continued ray, so the weight
+        // always belongs to the direction that is actually traced.
+        thrust::uniform_real_distribution<float> u01(0, 1);
+        glm::vec3 wo;
+        if (u01(rng) < metalness)
+            wo = glm::reflect(pathSegment.ray.direction, normal);
+        else
+            wo = glm::normalize(calculateRandomDirectionInHemisphere(N, rng));
+
+        auto H = glm::normalize((wi + wo) / 2.f);
+
+        // Base reflectance: 0.04, blended towards the albedo as metalness
+        // grows.
+        auto albedo = glm::vec3(1, 1, 1); //hardcode surface color to white
+        auto F0 = glm::vec3(0.04f);
+        F0 = glm::mix(F0, albedo, metalness);
+        float thetaIH = glm::dot(H, wi);
+        glm::vec3 fresnel = Fresnel(F0, thetaIH);
+
+        float D = D_GGX(roughness, N, H);
+        float G = Geometry_Smith(roughness, wi, wo, H);
+
+        // Specular term F * G * D / (4 (N.wi)(N.wo)); the denominator is
+        // clamped to at least 0.01.
+        float NdotL = max(dot(N, wi), 0.0f);
+        float NdotV = max(dot(N, wo), 0.0f);
+        glm::vec3 brdf = fresnel * G * D / max(4.0f * NdotL * NdotV, 0.01f);
+
+        // NOTE(review): the current throughput doubles as incoming radiance and
+        // the result is gamma-encoded on every bounce. That is unusual for a
+        // path tracer but is left untouched here to keep the rendered look.
+        auto radiance = pathSegment.color;
+        glm::vec3 Lo = (brdf) * radiance * NdotL;
+        glm::vec3 color = pow(Lo, glm::vec3(1.0 / 2.2));
+
+        pathSegment.color *= color;
+        // Continue along the direction the BRDF was evaluated for.
+        pathSegment.ray.direction = wo;
+        pathSegment.ray.origin = intersect + 0.0001f * normal;
+    }
+    else {
+        // Stand-in colour for textured hits (no texture lookup happens here).
+        glm::vec3 tex_color = glm::vec3(0.8, 0.8, 0.8);
+
+        //pure diffuse
+        if (!m.hasReflective && !m.hasRefractive) {
+            auto direction = glm::normalize(calculateRandomDirectionInHemisphere(normal, rng));
+            pathSegment.ray.direction = direction;
+            pathSegment.ray.origin = intersect + 0.0001f * normal;
+            if (hasTexture)
+                pathSegment.color *= tex_color;
+            else
+                pathSegment.color *= m.color;
+        }
+        //perfect reflective
+        else if (m.hasReflective && !m.hasRefractive) {
+            glm::vec3 reflection = glm::reflect(pathSegment.ray.direction, normal);
+            pathSegment.ray.direction = reflection;
+            pathSegment.ray.origin = intersect + 0.0001f * normal;
+            pathSegment.color *= m.color;
+        }
+        //both reflection and refraction
+        else if (m.hasReflective && m.hasRefractive) {
+            glm::vec3 incident = pathSegment.ray.direction;
+            float cos_theta = glm::dot(normal, -incident);
+            float n1 = 0.f;
+            float n2 = 0.f;
+            if (cos_theta >= 0) { //vacuum to object
+                n1 = 1.f;
+                n2 = m.indexOfRefraction;
+            }
+            else {//object to vacuum
+                // Turn the normal against the ray and recompute the cosine so
+                // it is positive: Schlick's (1 - cos)^5 term needs cos in
+                // [0, 1], otherwise the "probability" exceeds 1.
+                normal = glm::normalize(-normal);
+                cos_theta = glm::dot(normal, -incident);
+                n1 = m.indexOfRefraction;
+                n2 = 1.f;
+            }
+
+            //schlick's approximation
+            float Fresnel_term = Fresnel_Schlicks(n1, n2, cos_theta);
+
+            // glm::refract yields the zero vector on total internal
+            // reflection; normalizing that would give NaN, so such rays are
+            // always reflected.
+            glm::vec3 refraction = glm::refract(incident, normal, n1 / n2);
+            bool totalInternal = glm::dot(refraction, refraction) == 0.f;
+
+            thrust::uniform_real_distribution<float> u01(0, 1);
+            if (totalInternal || u01(rng) < Fresnel_term) {//reflection
+                glm::vec3 reflection = glm::reflect(incident, normal);
+                pathSegment.ray.direction = reflection;
+                pathSegment.ray.origin = intersect + 0.0001f * normal;
+                pathSegment.color *= m.color;
+            }
+            else {//refraction
+                pathSegment.ray.direction = glm::normalize(refraction);
+                pathSegment.ray.origin = intersect + 0.001f * pathSegment.ray.direction;
+                pathSegment.color *= m.color;
+            }
+        }
+    }
 }
diff --git a/src/intersections.h b/src/intersections.h
index b1504071..340a9285 100644
--- a/src/intersections.h
+++ b/src/intersections.h
@@ -5,6 +5,11 @@
 
 #include "sceneStructs.h"
 #include "utilities.h"
+// GLM helpers for building and inverting transform matrices (forward slashes keep the paths portable).
+#include <glm/gtc/matrix_transform.hpp>
+#include <glm/gtc/matrix_inverse.hpp>
+
+
 
 /**
  * Handy-dandy hash function that provides seeds for random number generation.
@@ -142,3 +147,143 @@ __host__ __device__ float sphereIntersectionTest(Geom sphere, Ray r,
 
     return glm::length(r.origin - intersectionPoint);
 }
+
+
+
+//Moller-Trumbore Algorithm for ray triangle intersection.
+//O, D: ray origin and direction; p0, p1, p2: triangle vertices. On a hit, returns
+//true and stores (t, b1, b2) in result with O + t*D = (1-b1-b2)*p0 + b1*p1 + b2*p2;
+//on a miss, returns false and leaves result untouched.
+__host__ __device__
+bool rayTriangleIntersectionTest(glm::vec3 &p0, glm::vec3 &p1, glm::vec3 &p2,
+    glm::vec3 &O, glm::vec3 &D, glm::vec3 &result)
+{
+    glm::vec3 E1 = p1 - p0;
+    glm::vec3 E2 = p2 - p0;
+    glm::vec3 S1 = glm::cross(D, E2);
+    float det = glm::dot(S1, E1);
+    //zero determinant: parallel ray or degenerate triangle; dividing would give inf/NaN
+    if (det == 0.f) return false;
+    glm::vec3 S = O - p0;
+    glm::vec3 S2 = glm::cross(S, E1);
+    glm::vec3 bary = 1 / det * glm::vec3(glm::dot(S2, E2), glm::dot(S1, S), glm::dot(S2, D));
+    //reject hits behind the ray origin (t < 0) and points outside the triangle
+    if (bary.x < 0 || bary.y < 0 || bary.z < 0 || (1 - bary.y - bary.z) < 0) return false;
+    result = bary;
+    return true;
+}
+
+
+__host__ __device__
+float triangleIntersectionTest(Geom triangle, Ray r,
+    glm::vec3& intersectionPoint, glm::vec3& normal, glm::vec2& uv, bool& outside)
+{
+    // Intersects world-space ray r with one triangle stored in a Geom.
+    // Returns the world-space distance to the hit, or -1 on a miss; on a hit the
+    // position, shading normal and uv are interpolated from the vertex data.
+    Ray q;  // the ray expressed in the triangle's object space
+    q.origin = multiplyMV(triangle.inverseTransform, glm::vec4(r.origin, 1.0f));
+    q.direction = glm::normalize(multiplyMV(triangle.inverseTransform, glm::vec4(r.direction, 0.0f)));
+
+    // glm fills (u, v, t): hit = (1 - u - v) * pos[0] + u * pos[1] + v * pos[2].
+    glm::vec3 barycentric;
+    bool inter = glm::intersectRayTriangle(q.origin, q.direction, triangle.pos[0], triangle.pos[1], triangle.pos[2], barycentric);
+    if (!inter) return -1;
+
+    float b1 = barycentric[0], b2 = barycentric[1], t = barycentric[2];
+    float b0 = 1 - b1 - b2;  // weight of vertex 0; b1 and b2 belong to vertices 1 and 2
+    normal = b0 * triangle.normal[0] + b1 * triangle.normal[1] + b2 * triangle.normal[2];
+    uv = b0 * triangle.uv[0] + b1 * triangle.uv[1] + b2 * triangle.uv[2];
+
+    intersectionPoint = multiplyMV(triangle.transform, glm::vec4(getPointOnRay(q, t), 1.f));
+    normal = glm::normalize(multiplyMV(triangle.invTranspose, glm::vec4(normal, 0.f)));
+    outside = glm::dot(normal, r.direction) < 0.f;  // true when the normal faces the ray; the normal is not flipped
+    return glm::length(r.origin - intersectionPoint);
+}
+
+__host__ __device__
+float meshIntersectionTest(Geom mesh, Ray r, Geom* triangle, int tri_size, bool aabb,
+    glm::vec3& intersectionPoint, glm::vec3& normal, glm::vec2& uv, bool& outside)
+{
+    // Finds the nearest hit of ray r among the triangles that belong to `mesh`.
+    //
+    //   mesh      - supplies the index range [obj_start_offset,
+    //               obj_start_offset + obj_end) and the bounding box
+    //   triangle  - array that the index range refers to
+    //   tri_size  - not used by this function
+    //   aabb      - when true, mesh.bbox is tested first and a miss there
+    //               skips every triangle
+    //
+    // Returns the distance reported by triangleIntersectionTest for the
+    // closest triangle, or -1 when nothing is hit. intersectionPoint, normal,
+    // uv and outside are written only on a hit.
+
+    // Bounding-box rejection is the only difference between the two modes.
+    if (aabb && !mesh.bbox.IntersectP(r)) return -1;
+
+    int start = mesh.obj_start_offset;
+    int end = start + mesh.obj_end;
+
+    float t_min = FLT_MAX;
+    bool found = false;
+
+    for (int i = start; i < end; i++) {
+        // Scratch outputs for this candidate, initialised so that nothing
+        // indeterminate can ever be copied out.
+        glm::vec3 t_intersection(0.f);
+        glm::vec3 t_normal(0.f);
+        glm::vec2 t_uv(0.f);
+        bool t_outside = true;
+
+        float temp_t = triangleIntersectionTest(triangle[i], r, t_intersection,
+            t_normal, t_uv, t_outside);
+
+        // A miss comes back as -1. Nothing here orders the triangles by
+        // depth, so all of them are visited and only the closest is kept;
+        // stopping at the first hit could return a surface that is hidden
+        // behind a nearer one. Written so that a NaN distance is skipped too.
+        bool closer = temp_t >= 0.f && temp_t < t_min;
+        if (!closer) continue;
+
+        t_min = temp_t;
+        found = true;
+        intersectionPoint = t_intersection;
+        normal = t_normal;
+        uv = t_uv;
+        outside = t_outside;
+    }
+
+    // Report a miss the same way the per-triangle test does (-1) rather than
+    // handing the FLT_MAX sentinel and unset outputs to the caller.
+    if (!found) {
+        return -1;
+    }
+    return t_min;
+}
+
+__host__ __device__
+float triangleIntersectionTest(Tri triangle, Ray r,
+    glm::vec3& intersectionPoint, glm::vec3& normal, glm::vec2& uv, bool& outside)
+{
+    // Intersects world-space ray r with one Tri (vertices p0..p2, normals n0..n2,
+    // texture coordinates t0..t2). Returns the world-space distance to the hit,
+    // or -1 on a miss; on a hit position, normal and uv are interpolated.
+    Ray q;  // the ray expressed in the triangle's object space
+    q.origin = multiplyMV(triangle.inverseTransform, glm::vec4(r.origin, 1.0f));
+    q.direction = glm::normalize(multiplyMV(triangle.inverseTransform, glm::vec4(r.direction, 0.0f)));
+
+    // glm fills (u, v, t): hit = (1 - u - v) * p0 + u * p1 + v * p2.
+    glm::vec3 barycentric;
+    bool inter = glm::intersectRayTriangle(q.origin, q.direction, triangle.p0, triangle.p1, triangle.p2, barycentric);
+    if (!inter) return -1;
+
+    float b1 = barycentric[0], b2 = barycentric[1], t = barycentric[2];
+    float b0 = 1 - b1 - b2;  // weight of vertex 0; b1 and b2 belong to vertices 1 and 2
+    normal = b0 * triangle.n0 + b1 * triangle.n1 + b2 * triangle.n2;
+    uv = b0 * triangle.t0 + b1 * triangle.t1 + b2 * triangle.t2;
+
+    intersectionPoint = multiplyMV(triangle.transform, glm::vec4(getPointOnRay(q, t), 1.f));
+    normal = glm::normalize(multiplyMV(triangle.invTranspose, glm::vec4(normal, 0.f)));
+    outside = glm::dot(normal, r.direction) < 0.f;  // true when the normal faces the ray; the normal is not flipped
+    return glm::length(r.origin - intersectionPoint);
+}
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 96127b6d..9e0ce96e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -2,6 +2,8 @@
 #include "preview.h"
 #include <cstring>
 
+#include <chrono>
+
 static std::string startTimeString;
 
 // For camera controls
@@ -50,6 +52,9 @@ int main(int argc, char** argv) {
 	// Set up camera stuff from loaded path tracer settings
 	iteration = 0;
 	renderState = &scene->state;
+
+	//renderState->iterations = 50;
+
 	Camera& cam = renderState->camera;
 	width = cam.resolution.x;
 	height = cam.resolution.y;
@@ -135,7 +140,9 @@ void runCuda() {
 		pathtraceInit(scene);
 	}
 
+	static const auto start = std::chrono::steady_clock::now(); // taken once, on the first call, so the final report covers the whole run instead of ~0 s (not reset when the iteration counter restarts)
 	if (iteration < renderState->iterations) {
+
 		uchar4* pbo_dptr = NULL;
 		iteration++;
 		cudaGLMapBufferObject((void**)&pbo_dptr, pbo);
@@ -148,6 +155,9 @@ void runCuda() {
 		cudaGLUnmapBufferObject(pbo);
 	}
 	else {
+		auto end = std::chrono::steady_clock::now();
+		std::chrono::duration<double> elapsed_seconds = end - start;
+		std::cout << "elapsed time to compute: " << elapsed_seconds.count() << "s\n";
 		saveImage();
 		pathtraceFree();
 		cudaDeviceReset();
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index fd2a4641..4a829c27 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -14,6 +14,19 @@
 #include "intersections.h"
 #include "interactions.h"
 
+#include "device_launch_parameters.h"
+#include <thrust/partition.h>
+
+#define DIRECT 0 // NOTE(review): use not visible here; presumably toggles direct lighting - confirm
+#define CACHE_FIRST_BOUNCE 0 // 1: compiles in the dev_firstBounce / dev_first_paths buffers
+#define SORT_MATERIAL 1 // NOTE(review): use not visible here; presumably sorts paths by material - confirm
+#define COMPACTION 1 // NOTE(review): use not visible here; presumably enables path stream compaction - confirm
+#define DEPTH_OF_FIELD 0 // 1: camera rays are offset on the lens (cam.lensRadius, cam.focalDistance)
+#define ANTI_ALIASING 0 // 1: camera rays are jittered by a random offset in [-0.5, 0.5) per axis
+#define BOUNDING_BOX 0 // NOTE(review): use not visible here; presumably enables the mesh bounding-box test - confirm
+
+#define MAX_INTERSECT_DIST 10000.f // NOTE(review): use not visible here; presumably an upper bound on hit distance - confirm
+
 #define ERRORCHECK 1
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
@@ -74,8 +87,20 @@ static Geom* dev_geoms = NULL;
 static Material* dev_materials = NULL;
 static PathSegment* dev_paths = NULL;
 static ShadeableIntersection* dev_intersections = NULL;
+
+//BVH: device copies of the scene's BVH nodes and of its sorted mesh triangles
+static BVHNode_GPU* dev_bvh_nodes = NULL; // uploaded from scene->bvh_nodes_gpu in pathtraceInit
+static Tri* dev_tris = NULL; // uploaded from scene->mesh_tris_sorted in pathtraceInit
+
 // TODO: static variables for device memory, any extra info you need, etc
-// ...
+//for caching first bounce (only compiled when CACHE_FIRST_BOUNCE is 1)
+#if CACHE_FIRST_BOUNCE
+static ShadeableIntersection* dev_firstBounce = NULL; // one entry per pixel, allocated in pathtraceInit
+static PathSegment* dev_first_paths = NULL; // one entry per pixel, allocated in pathtraceInit
+#endif
+//for tiny_obj: device copy of scene->Obj_geoms, uploaded in pathtraceInit
+//static Object* dev_objects = NULL;
+static Geom* dev_tinyobj = NULL;
 
 void InitDataContainer(GuiDataContainer* imGuiData)
 {
@@ -103,6 +128,24 @@ void pathtraceInit(Scene* scene) {
 	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
 	// TODO: initialize any extra device memeory you need
+#if CACHE_FIRST_BOUNCE
+	cudaMalloc(&dev_firstBounce, pixelcount * sizeof(ShadeableIntersection));
+	cudaMemset(dev_firstBounce, 0, pixelcount * sizeof(ShadeableIntersection));
+	cudaMalloc(&dev_first_paths, pixelcount * sizeof(PathSegment));	
+#endif
+	cudaMalloc(&dev_tinyobj, scene->Obj_geoms.size() * sizeof(Geom));
+	cudaMemcpy(dev_tinyobj, scene->Obj_geoms.data(), scene->Obj_geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+
+
+	//BVH
+	cudaMalloc(&dev_tris, scene->num_tris * sizeof(Tri));
+	cudaMemcpy(dev_tris, scene->mesh_tris_sorted.data(), scene->num_tris * sizeof(Tri), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_bvh_nodes, scene->bvh_nodes_gpu.size() * sizeof(BVHNode_GPU));
+	cudaMemcpy(dev_bvh_nodes, scene->bvh_nodes_gpu.data(), scene->bvh_nodes_gpu.size() * sizeof(BVHNode_GPU), cudaMemcpyHostToDevice);
+
+
+
 
 	checkCUDAError("pathtraceInit");
 }
@@ -114,10 +157,41 @@ void pathtraceFree() {
 	cudaFree(dev_materials);
 	cudaFree(dev_intersections);
 	// TODO: clean up any extra device memory you created
+#if CACHE_FIRST_BOUNCE
+	cudaFree(dev_firstBounce);
+	cudaFree(dev_first_paths);
+#endif
+	cudaFree(dev_tinyobj);
+
+
+	//BVH
+	cudaFree(dev_tris);
+	cudaFree(dev_bvh_nodes);
 
 	checkCUDAError("pathtraceFree");
 }
 
+__host__ __device__
+glm::vec2 ConcentricSampleDisk(const glm::vec2 &u)
+{
+	// Concentric square-to-disk mapping. NOTE(review): u is presumably a
+	// uniform sample in [0,1]^2, which makes the result lie in the unit disk.
+	const glm::vec2 centered = 2.f * u - glm::vec2(1, 1);
+	// The square's centre maps to the disk's centre; returning early also
+	// keeps the ratios below from evaluating 0/0.
+	if (centered.x == 0 && centered.y == 0)
+		return glm::vec2(0, 0);
+
+	const double pi = 3.14159265359;
+	// The larger-magnitude coordinate becomes the radius; the ratio of the
+	// other one to it selects the angle inside that wedge.
+	const bool xDominates = std::abs(centered.x) > std::abs(centered.y);
+	const float radius = xDominates ? centered.x : centered.y;
+	const float angle = xDominates ? pi / 4 * (centered.y / centered.x)
+	                               : pi / 2 - pi / 4 * (centered.x / centered.y);
+	return radius * glm::vec2(std::cos(angle), std::sin(angle));
+}
+
 /**
 * Generate PathSegments with rays from the camera through the screen into the
 * scene, which is the first bounce of rays.
@@ -131,6 +205,27 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
 	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
+
+	float jitter_x = 0.f, jitter_y = 0.f;
+#if ANTI_ALIASING
+	if (x < cam.resolution.x && y < cam.resolution.y) {
+		int index = x + (y * cam.resolution.x);
+		PathSegment& segment = pathSegments[index];
+
+		segment.ray.origin = cam.position;
+		segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+		
+		thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
+		thrust::uniform_real_distribution<float> u(-0.5, 0.5);
+		jitter_x = u(rng);
+		jitter_y = u(rng);
+
+		// TODO: implement antialiasing by jittering the ray
+		segment.ray.direction = glm::normalize(cam.view
+			- cam.right * cam.pixelLength.x * ((float)x + jitter_x - (float)cam.resolution.x * 0.5f)
+			- cam.up * cam.pixelLength.y * ((float)y + jitter_y - (float)cam.resolution.y * 0.5f)
+		);
+#else
 	if (x < cam.resolution.x && y < cam.resolution.y) {
 		int index = x + (y * cam.resolution.x);
 		PathSegment& segment = pathSegments[index];
@@ -143,7 +238,23 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 			- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
 			- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
 		);
+#endif
 
+#if DEPTH_OF_FIELD
+		//adapted from pbrt
+		if (cam.lensRadius > 0) {
+			thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
+			thrust::uniform_real_distribution<float> u101(0, 1);
+			thrust::uniform_real_distribution<float> u201(0, 1);
+			glm::vec2 rand(u101(rng), u201(rng));
+			glm::vec2 pLens = cam.lensRadius * ConcentricSampleDisk(rand);
+			float ft = cam.focalDistance / -segment.ray.direction.z;
+			glm::vec3 pFocus = ft * segment.ray.direction;
+
+			segment.ray.origin += glm::vec3(pLens.x, pLens.y, 0);
+			segment.ray.direction = glm::normalize(pFocus - glm::vec3(pLens.x, pLens.y, 0));
+		}
+#endif
 		segment.pixelIndex = index;
 		segment.remainingBounces = traceDepth;
 	}
@@ -154,12 +265,15 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 // Generating new rays is handled in your shader(s).
 // Feel free to modify the code below.
 __global__ void computeIntersections(
-	int depth
-	, int num_paths
-	, PathSegment* pathSegments
-	, Geom* geoms
-	, int geoms_size
-	, ShadeableIntersection* intersections
+	int depth,
+	int num_paths,
+	PathSegment* pathSegments,
+	Geom* geoms,
+	int geoms_size,
+	Geom* triangles,
+	int triangle_size,
+	ShadeableIntersection* intersections,
+	int iter
 )
 {
 	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -171,6 +285,7 @@ __global__ void computeIntersections(
 		float t;
 		glm::vec3 intersect_point;
 		glm::vec3 normal;
+		glm::vec2 uv = glm::vec2(-1, -1);
 		float t_min = FLT_MAX;
 		int hit_geom_index = -1;
 		bool outside = true;
@@ -178,6 +293,10 @@ __global__ void computeIntersections(
 		glm::vec3 tmp_intersect;
 		glm::vec3 tmp_normal;
 
+		glm::vec2 tmp_uv = glm::vec2(-1, -1);
+
+		int materialId;
+
 		// naive parse through global geoms
 
 		for (int i = 0; i < geoms_size; i++)
@@ -191,9 +310,22 @@ __global__ void computeIntersections(
 			else if (geom.type == SPHERE)
 			{
 				t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+
 			}
 			// TODO: add more intersection tests here... triangle? metaball? CSG?
+			else if (geom.type == TRIANGLE) {
+				t = triangleIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, tmp_uv, outside);
+			}
+			else if (geom.type == MESH) {
+#if BOUNDING_BOX
+				t = meshIntersectionTest(geom, pathSegment.ray, triangles, triangle_size, true,
+					tmp_intersect, tmp_normal, tmp_uv, outside);
+#else 
+				t = meshIntersectionTest(geom, pathSegment.ray, triangles, triangle_size, false,
+					tmp_intersect, tmp_normal, tmp_uv, outside);
+#endif 
 
+			}
 			// Compute the minimum t from the intersection tests to determine what
 			// scene geometry object was hit first.
 			if (t > 0.0f && t_min > t)
@@ -202,6 +334,9 @@ __global__ void computeIntersections(
 				hit_geom_index = i;
 				intersect_point = tmp_intersect;
 				normal = tmp_normal;
+
+
+				uv = tmp_uv;
 			}
 		}
 
@@ -215,10 +350,211 @@ __global__ void computeIntersections(
 			intersections[path_index].t = t_min;
 			intersections[path_index].materialId = geoms[hit_geom_index].materialid;
 			intersections[path_index].surfaceNormal = normal;
+			intersections[path_index].uv = uv;
+
+		}
+	}
+}
+
+/**
+ * BVH variant of computeIntersections. For each path it finds the nearest hit among
+ * the mesh triangles (iterative BVH walk) and the primitive geoms (linear scan) and
+ * writes the result to intersections[path_index].
+ *
+ * @param depth         not read by this kernel
+ * @param num_paths     number of paths to process; threads beyond it do nothing
+ * @param pathSegments  supplies the ray of each path
+ * @param geoms         primitive geometry; only CUBE and SPHERE entries are tested here
+ * @param geoms_size    number of entries in geoms
+ * @param tris          triangles referenced by BVH leaves through tri_index
+ * @param tris_size     number of triangles; when 0 the BVH walk is skipped
+ * @param intersections output: t is -1 on a miss; on a hit t, materialId, surfaceNormal
+ *                      and uv are written (uv stays (-1, -1), nothing here computes it)
+ * @param bvh_nodes     BVH walked from node 0: an interior node (tri_index == -1) has
+ *                      its first child at the next array slot and its second child at
+ *                      offset_to_second_child
+ */
+__global__ void computeIntersections(
+	int depth
+	, int num_paths
+	, PathSegment* pathSegments
+	, Geom* geoms
+	, int geoms_size
+	, Tri* tris
+	, int tris_size
+	, ShadeableIntersection* intersections
+	, BVHNode_GPU* bvh_nodes
+)
+{
+	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if (path_index < num_paths)
+	{
+		Ray r = pathSegments[path_index].ray;
+
+		// Closest hit found so far, shared by the triangle pass and the primitive pass.
+		glm::vec3 normal;
+		glm::vec2 uv = glm::vec2(-1, -1);
+		float t_min = FLT_MAX;
+		int hit_geom_index = -1;
+
+		if (tris_size != 0)
+		{
+			// Fixed-size stack of node indices that still have to be visited.
+			const int max_stack_depth = 128;
+			int node_stack[max_stack_depth];
+			int stack_pointer = 0;
+			int cur_node_index = 0;
+			// The ray does not change during the walk, so invert its direction once.
+			const glm::vec3 invDir = 1.f / r.direction;
+
+			while (true)
+			{
+				const BVHNode_GPU cur_node = bvh_nodes[cur_node_index];
+
+				// Slab test of the ray against the node's AABB, one axis at a time.
+				float t1 = (cur_node.AABB_min.x - r.origin.x) * invDir.x;
+				float t2 = (cur_node.AABB_max.x - r.origin.x) * invDir.x;
+				float tmin = glm::min(t1, t2);
+				float tmax = glm::max(t1, t2);
+				t1 = (cur_node.AABB_min.y - r.origin.y) * invDir.y;
+				t2 = (cur_node.AABB_max.y - r.origin.y) * invDir.y;
+				tmin = glm::max(tmin, glm::min(t1, t2));
+				tmax = glm::min(tmax, glm::max(t1, t2));
+				t1 = (cur_node.AABB_min.z - r.origin.z) * invDir.z;
+				t2 = (cur_node.AABB_max.z - r.origin.z) * invDir.z;
+				tmin = glm::max(tmin, glm::min(t1, t2));
+				tmax = glm::min(tmax, glm::max(t1, t2));
+				const bool box_hit = tmax >= tmin;
+
+				if (box_hit && cur_node.tri_index == -1) {
+					// Interior node: continue with the first child (the next node in the
+					// array) and remember the second child for later. If the stack is
+					// full, the second subtree is skipped rather than writing past the
+					// end of node_stack.
+					if (stack_pointer < max_stack_depth) {
+						node_stack[stack_pointer] = cur_node.offset_to_second_child;
+						stack_pointer++;
+					}
+					cur_node_index++;
+					continue;
+				}
+
+				if (box_hit) {
+					// Leaf node: intersect the ray with the triangle's plane, then check
+					// that the plane hit lies inside the triangle.
+					const Tri tri = tris[cur_node.tri_index];
+					const float t = glm::dot(tri.plane_normal, (tri.p0 - r.origin)) / glm::dot(tri.plane_normal, r.direction);
+					if (t >= -0.0001f) {
+						const glm::vec3 P = r.origin + t * r.direction;
+						// Barycentric weights: cross-product magnitudes of the three
+						// sub-triangles divided by tri.S (presumably the same measure for
+						// the whole triangle, precomputed at load time -- TODO confirm).
+						const glm::vec3 s = glm::vec3(
+							glm::length(glm::cross(P - tri.p1, P - tri.p2)),
+							glm::length(glm::cross(P - tri.p2, P - tri.p0)),
+							glm::length(glm::cross(P - tri.p0, P - tri.p1))) / tri.S;
+						const float s_sum = s.x + s.y + s.z;
+						const bool inside_tri =
+							s.x >= -0.0001f && s.x <= 1.0001f &&
+							s.y >= -0.0001f && s.y <= 1.0001f &&
+							s.z >= -0.0001f && s.z <= 1.0001f &&
+							s_sum <= 1.0001f && s_sum >= -0.0001f;
+
+						if (inside_tri && t_min > t) {
+							t_min = t;
+							// NOTE(review): every triangle hit is tagged with the fixed
+							// index 2, so its material is resolved through geoms[2] (or
+							// the literal material 1) at the end of this kernel.
+							hit_geom_index = 2;
+							normal = glm::normalize(s.x * tri.n0 + s.y * tri.n1 + s.z * tri.n2);
+						}
+					}
+				}
+
+				// Leaf processed or box missed: resume from the stack, or stop once it is empty.
+				if (stack_pointer == 0) {
+					break;
+				}
+				stack_pointer--;
+				cur_node_index = node_stack[stack_pointer];
+			}
+		}
+
+		// Primitive geoms are tested for every ray, so scenes without any triangles
+		// still get their intersections recorded.
+		glm::vec3 tmp_intersect;
+		glm::vec3 tmp_normal;
+		glm::vec2 tmp_uv = glm::vec2(-1, -1);
+		bool outside = true;
+
+		for (int i = 0; i < geoms_size; ++i)
+		{
+			Geom& geom = geoms[i];
+
+			// Start from "no hit" so a geom type without a test below cannot reuse the
+			// t left over from a previous geom.
+			float t = -1.0f;
+			if (geom.type == CUBE)
+			{
+				t = boxIntersectionTest(geom, r, tmp_intersect, tmp_normal, outside);
+			}
+			else if (geom.type == SPHERE)
+			{
+				t = sphereIntersectionTest(geom, r, tmp_intersect, tmp_normal, outside);
+			}
+
+			// Keep the nearest hit in front of the ray origin.
+			if (t > 0.0f && t_min > t)
+			{
+				t_min = t;
+				hit_geom_index = i;
+				normal = tmp_normal;
+				uv = tmp_uv;
+			}
+		}
+
+		if (hit_geom_index == -1)
+		{
+			intersections[path_index].t = -1.0f;
+		}
+		else
+		{
+			//The ray hits something
+			intersections[path_index].t = t_min;
+			// Fallback to material 1 when the recorded index is not a valid geom index
+			// (possible for the fixed triangle index when geoms_size <= 2).
+			if (hit_geom_index >= geoms_size)
+				intersections[path_index].materialId = 1;
+			else
+				intersections[path_index].materialId = geoms[hit_geom_index].materialid;
+			intersections[path_index].surfaceNormal = normal;
+			intersections[path_index].uv = uv;
 		}
 	}
 }
 
+
+
+
 // LOOK: "fake" shader demonstrating what you might do with the info in
 // a ShadeableIntersection, as well as how to use thrust's random number
 // generator. Observe that since the thrust random number generator basically
@@ -273,6 +609,134 @@ __global__ void shadeFakeMaterial(
 	}
 }
 
+//__global__ void kernSimpleShade(
+//	int iter, 
+//	int num_paths, 
+//	int depth,
+//	ShadeableIntersection* shadeableIntersections, 
+//	PathSegment* pathSegments,         
+//	Material* materials)
+//{
+//	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+//	if (idx < num_paths)
+//	{
+//		ShadeableIntersection &intersection = shadeableIntersections[idx];
+//		PathSegment &ps = pathSegments[idx];
+//		if (ps.remainingBounces <= 0) return;
+//
+//		if (intersection.t > 0.0f) { // if the intersection exists...
+//			thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, depth);
+//			thrust::uniform_real_distribution<float> u01(0, 1);
+//
+//			Material &material = materials[intersection.materialId];
+//			glm::vec3 materialColor = material.color;
+//
+//			if (material.emittance > 0.0f) {
+//				ps.remainingBounces = 0;
+//				ps.color *= (materialColor * material.emittance);
+//			}
+//			
+//			else{
+//				glm::vec3 intersect = getPointOnRay(ps.ray, intersection.t);
+//				scatterRay(ps, intersection, material, rng);
+//				//scatterRay(ps, intersect, intersection.surfaceNormal, material, rng);
+//				ps.remainingBounces--;
+//			}
+//		}
+//		else {
+//			ps.remainingBounces = 0;
+//			ps.color = glm::vec3(0);
+//		}
+//	}
+//}
+
+/**
+ * Shades one bounce for each path segment.
+ *  - Paths with no bounces left are skipped.
+ *  - A miss (intersection.t not greater than 0) turns the path black and terminates it.
+ *  - A hit on an emissive material multiplies the path colour by the material colour
+ *    times its emittance and terminates the path.
+ *  - Otherwise the path is scattered with scatterRay() and loses one bounce. When
+ *    DIRECT is enabled, a path on its last bounce is instead aimed at a randomly
+ *    sampled point on one of two hard-coded lights.
+ *
+ * @param iter                   iteration number, part of the random seed
+ * @param num_paths              number of paths to shade; threads beyond it do nothing
+ * @param depth                  bounce depth, part of the random seed
+ * @param shadeableIntersections intersection record of each path (same indexing as pathSegments)
+ * @param pathSegments           paths to update in place
+ * @param materials              material table indexed by intersection.materialId
+ * @param camPos                 camera position, forwarded to scatterRay()
+ */
+__global__ void kernSimpleShade(
+	int iter,
+	int num_paths,
+	int depth,
+	ShadeableIntersection* shadeableIntersections,
+	PathSegment* pathSegments,
+	Material* materials,
+	glm::vec3 camPos)
+{
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx >= num_paths) return;
+
+	PathSegment& ps = pathSegments[idx];
+	if (ps.remainingBounces <= 0) return;
+
+	ShadeableIntersection& intersection = shadeableIntersections[idx];
+	if (!(intersection.t > 0.0f)) {
+		// Nothing was hit: the path contributes no light.
+		ps.remainingBounces = 0;
+		ps.color = glm::vec3(0);
+		return;
+	}
+
+	Material& material = materials[intersection.materialId];
+	if (material.emittance > 0.0f) {
+		// Reached a light: pick up its emission and stop.
+		ps.remainingBounces = 0;
+		ps.color *= (material.color * material.emittance);
+		return;
+	}
+
+	thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, depth);
+
+#if DIRECT
+	// The decision must look at THIS path's bounce counter; reading it through the
+	// pathSegments pointer would inspect path 0 instead (and race with the thread
+	// that is updating it).
+	if (ps.remainingBounces == 1) {
+		//hardcode 2 lights, and randomly get 0/1
+		thrust::uniform_real_distribution<float> u02(0, 2);
+		thrust::uniform_real_distribution<float> u1(0, 3);
+
+		const int whichlight = int(u02(rng));
+		const float a = u1(rng);
+		const float b = u1(rng);
+		// Light 0 is sampled in x/z at y = 10, light 1 in y/z at x = 0.
+		const glm::vec3 lightPos = (whichlight == 0)
+			? glm::vec3(a, 10.f, b)
+			: glm::vec3(0.f, a, b);
+
+		const glm::vec3 intersect = getPointOnRay(ps.ray, intersection.t);
+		Ray r;
+		r.origin = intersect;
+		r.direction = glm::normalize(lightPos - intersect);
+		ps.ray = r;
+		ps.color *= material.color;
+		ps.remainingBounces--;
+		return;
+	}
+#endif
+
+	scatterRay(ps, intersection, material, rng, camPos);
+	ps.remainingBounces--;
+}
+
 // Add the current iteration's output to the overall image
 __global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iterationPaths)
 {
@@ -285,6 +749,21 @@ __global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iteration
 	}
 }
 
+//comparators
+// Predicate passed to thrust::stable_partition for stream compaction.
+// Despite its name, it yields true for a path that still has bounces left
+// (remainingBounces is non-zero) and false for a finished path.
+struct isZero {
+	__host__ __device__ bool operator()(const PathSegment &ps) {
+		const bool hasBouncesLeft = (ps.remainingBounces != 0);
+		return hasBouncesLeft;
+	}
+};
+
+// Ordering used as the comparator when sorting by material:
+// i1 comes before i2 when its materialId is strictly smaller.
+struct compareMaterial {
+	__host__ __device__ bool operator()(const ShadeableIntersection &i1, const ShadeableIntersection& i2) {
+		return i2.materialId > i1.materialId;
+	}
+};
+
 /**
  * Wrapper for the __global__ call that sets up the kernel calls and does a ton
  * of memory management
@@ -333,10 +812,16 @@ void pathtrace(uchar4* pbo, int frame, int iter) {
 	//   for you.
 
 	// TODO: perform one iteration of path tracing
-
+#if CACHE_FIRST_BOUNCE
+	if (iter == 1) {
+		generateRayFromCamera << <blocksPerGrid2d, blockSize2d >> > (cam, iter, traceDepth, dev_first_paths);
+		checkCUDAError("generate camera ray");
+	}
+	cudaMemcpy(dev_paths, dev_first_paths, pixelcount * sizeof(PathSegment), cudaMemcpyDeviceToDevice);
+#else
 	generateRayFromCamera << <blocksPerGrid2d, blockSize2d >> > (cam, iter, traceDepth, dev_paths);
 	checkCUDAError("generate camera ray");
-
+#endif
 	int depth = 0;
 	PathSegment* dev_path_end = dev_paths + pixelcount;
 	int num_paths = dev_path_end - dev_paths;
@@ -349,38 +834,120 @@ void pathtrace(uchar4* pbo, int frame, int iter) {
 
 		// clean shading chunks
 		cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
-
-		// tracing
+		//create blocks
 		dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+		// tracing
+#if CACHE_FIRST_BOUNCE
+		//if first intersection in iteration 1, compute intersection to dev_firstBounce
+		//and then copy dev_firstBounce to dev_intersections
+		if (iter == 1 && depth == 0) {
+			computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
+				depth, 
+				num_paths, 
+				dev_paths, 
+				dev_geoms, 
+				hst_scene->geoms.size(), 
+				dev_tinyobj,
+				hst_scene->Obj_geoms.size(),
+				dev_firstBounce
+				);
+			checkCUDAError("trace one bounce");
+			cudaDeviceSynchronize();
+			cudaMemcpy(dev_intersections, dev_firstBounce, num_paths * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+		}
+		//if not first iteration but first bounce
+		//just copy to dev_intersections
+		else if (iter != 1 && depth == 0) {
+			cudaMemcpy(dev_intersections, dev_firstBounce, num_paths * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+		}
+		else {
+			computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
+				depth,
+				num_paths,
+				dev_paths,
+				dev_geoms,
+				hst_scene->geoms.size(),
+				dev_tinyobj,
+				hst_scene->Obj_geoms.size(),
+				dev_intersections,
+				iter
+				);
+			checkCUDAError("trace one bounce");
+			cudaDeviceSynchronize();
+		}
+#else 
+		//computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
+		//	depth,
+		//	num_paths,
+		//	dev_paths,
+		//	dev_geoms,
+		//	hst_scene->geoms.size(),
+		//	dev_tinyobj,
+		//	hst_scene->Obj_geoms.size(),
+		//	dev_intersections,
+		//	iter
+		//	);
+
 		computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
-			depth
+			  depth
 			, num_paths
 			, dev_paths
 			, dev_geoms
 			, hst_scene->geoms.size()
+			, dev_tris
+			, hst_scene->num_tris
 			, dev_intersections
+			, dev_bvh_nodes
 			);
 		checkCUDAError("trace one bounce");
 		cudaDeviceSynchronize();
+#endif
+
 		depth++;
 
 		// TODO:
 		// --- Shading Stage ---
 		// Shade path segments based on intersections and generate new rays by
-	  // evaluating the BSDF.
-	  // Start off with just a big kernel that handles all the different
-	  // materials you have in the scenefile.
-	  // TODO: compare between directly shading the path segments and shading
-	  // path segments that have been reshuffled to be contiguous in memory.
+		// evaluating the BSDF.
+		// Start off with just a big kernel that handles all the different
+		// materials you have in the scenefile.
+		// TODO: compare between directly shading the path segments and shading
+		// path segments that have been reshuffled to be contiguous in memory.
 
-		shadeFakeMaterial << <numblocksPathSegmentTracing, blockSize1d >> > (
+		/*shadeFakeMaterial <<<numblocksPathSegmentTracing, blockSize1d >>> (
 			iter,
 			num_paths,
 			dev_intersections,
 			dev_paths,
 			dev_materials
-			);
-		iterationComplete = true; // TODO: should be based off stream compaction results.
+		);*/
+		auto pos = cam.position;
+		kernSimpleShade << <numblocksPathSegmentTracing, blockSize1d >> > (
+			iter,
+			num_paths,
+			depth,
+			dev_intersections,
+			dev_paths,
+			dev_materials,
+			pos
+		);
+
+		//stream compaction
+#if COMPACTION
+		//referring to the first element of the second partition
+		dev_path_end = thrust::stable_partition(thrust::device, dev_paths, dev_path_end, isZero());
+		num_paths = dev_path_end - dev_paths;
+#endif
+
+#if SORT_MATERIAL
+		//sort dev_intersections and dev_paths based on materialId
+		thrust::stable_sort_by_key(thrust::device, dev_intersections, dev_intersections+ num_paths, dev_paths, compareMaterial());
+#endif
+
+
+		if(depth >= traceDepth || num_paths == 0)
+			iterationComplete = true; // TODO: should be based off stream compaction results.
+
 
 		if (guiData != NULL)
 		{
@@ -390,7 +957,7 @@ void pathtrace(uchar4* pbo, int frame, int iter) {
 
 	// Assemble this iteration and apply it to the image
 	dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
-	finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
+	finalGather << <numBlocksPixels, blockSize1d >> > (pixelcount, dev_image, dev_paths);
 
 	///////////////////////////////////////////////////////////////////////////
 
diff --git a/src/scene.cpp b/src/scene.cpp
index 3fb6239a..ddf61955 100644
--- a/src/scene.cpp
+++ b/src/scene.cpp
@@ -4,6 +4,17 @@
 #include <glm/gtc/matrix_inverse.hpp>
 #include <glm/gtx/string_cast.hpp>
 
+#define TINYOBJLOADER_IMPLEMENTATION
+#include "tiny_obj_loader.h"
+#include <stb_image.h>
+#include <stb_image_write.h>
+#include <stack>
+
+
+// Transforms v as a point (homogeneous coordinate w = 1) by the matrix m and
+// returns the x, y, z components of the result.
+glm::vec3 multiplyMV(glm::mat4 m, glm::vec3 v) {
+    const glm::vec4 homogeneous(v, 1.0f);
+    const glm::vec4 transformed = m * homogeneous;
+    return glm::vec3(transformed);
+}
+
 Scene::Scene(string filename) {
     cout << "Reading scene from " << filename << " ..." << endl;
     cout << " " << endl;
@@ -13,6 +24,7 @@ Scene::Scene(string filename) {
         cout << "Error reading from file - aborting!" << endl;
         throw;
     }
+
     while (fp_in.good()) {
         string line;
         utilityCore::safeGetline(fp_in, line);
@@ -28,8 +40,22 @@ Scene::Scene(string filename) {
                 loadCamera();
                 cout << " " << endl;
             }
+            //loading OBJ files
+            else if (strcmp(tokens[0].c_str(), "OBJECT_obj") == 0) {
+                loadObj(tokens[1].c_str());
+                //loadMesh(tokens[1].c_str());
+                cout << " " << endl;
+            }
         }
     }
+
+    if (mesh_tris.size() > 0) {
+        root_node = buildBVH(0, mesh_tris.size());
+
+        reformatBVHToGPU();
+
+        std::cout << "num nodes: " << num_nodes << std::endl;
+    }
 }
 
 int Scene::loadGeom(string objectid) {
@@ -75,6 +101,9 @@ int Scene::loadGeom(string objectid) {
             } else if (strcmp(tokens[0].c_str(), "SCALE") == 0) {
                 newGeom.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
             }
+            else if (strcmp(tokens[0].c_str(), "ENDPOS") == 0) {
+                newGeom.endPos = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+            }
 
             utilityCore::safeGetline(fp_in, line);
         }
@@ -125,6 +154,12 @@ int Scene::loadCamera() {
         } else if (strcmp(tokens[0].c_str(), "UP") == 0) {
             camera.up = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
         }
+        else if (strcmp(tokens[0].c_str(), "FOCAL") == 0) {
+            camera.focalDistance = atof(tokens[1].c_str());
+        }
+        else if (strcmp(tokens[0].c_str(), "LENSE") == 0) {
+            camera.lensRadius = atof(tokens[1].c_str());
+        }
 
         utilityCore::safeGetline(fp_in, line);
     }
@@ -164,7 +199,7 @@ int Scene::loadMaterial(string materialid) {
             string line;
             utilityCore::safeGetline(fp_in, line);
             vector<string> tokens = utilityCore::tokenizeString(line);
-            if (strcmp(tokens[0].c_str(), "RGB") == 0) {
+            if (strcmp( tokens[0].c_str(), "RGB") == 0) {
                 glm::vec3 color( atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()) );
                 newMaterial.color = color;
             } else if (strcmp(tokens[0].c_str(), "SPECEX") == 0) {
@@ -181,8 +216,673 @@ int Scene::loadMaterial(string materialid) {
             } else if (strcmp(tokens[0].c_str(), "EMITTANCE") == 0) {
                 newMaterial.emittance = atof(tokens[1].c_str());
             }
+
+            else if (strcmp(tokens[0].c_str(), "MICROFACET") == 0) {
+                newMaterial.microfacet = atof(tokens[1].c_str());
+            }
+            else if (strcmp(tokens[0].c_str(), "ROUGHNESS") == 0) {
+                newMaterial.roughness = atof(tokens[1].c_str());
+            }
+            else if (strcmp(tokens[0].c_str(), "METALNESS") == 0) {
+                newMaterial.metalness = atof(tokens[1].c_str());
+            }
         }
         materials.push_back(newMaterial);
         return 1;
     }
 }
+
+//adapted from https://github.com/tinyobjloader/tinyobjloader/blob/master/loader_example.cc
+//also https://vkguide.dev/docs/chapter-3/obj_loading/
+//int Scene::loadObj(const char* fileName)
+//{
+//    printf("loading OBJ file: %s\n", fileName);
+//    tinyobj::attrib_t attrib;
+//    std::vector<tinyobj::shape_t> shapes;
+//    std::vector<tinyobj::material_t> m_materials;
+//
+//    std::string warn;
+//    std::string err;
+//
+//    char* material_dir = "../scenes";
+//    bool ret = tinyobj::LoadObj(&attrib, &shapes, &m_materials, &warn, &err, fileName, material_dir,
+//        NULL, true);
+//    if (!warn.empty())
+//        std::cout << "WARN: " << warn << std::endl;
+//    if (!err.empty())
+//        std::cerr << "ERR: " << err << std::endl;
+//    if (!ret) {
+//        printf("Failed to load/parse .obj.\n");
+//        return false;
+//    }
+//
+//    for (size_t i = 0; i < shapes.size(); i += 1) {
+//        printf("%s\n", shapes[i].name.c_str());
+//    }
+//
+//
+//    Geom geo;
+//    string line;
+//    utilityCore::safeGetline(fp_in, line);
+//    while (!line.empty() && fp_in.good()) {
+//        vector<string> tokens = utilityCore::tokenizeString(line);
+//
+//        //load tranformations
+//        if (strcmp(tokens[0].c_str(), "TRANS") == 0) {
+//            geo.translation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+//        }
+//        else if (strcmp(tokens[0].c_str(), "ROTAT") == 0) {
+//            geo.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+//        }
+//        else if (strcmp(tokens[0].c_str(), "SCALE") == 0) {
+//            geo.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+//        }
+//        else if (strcmp(tokens[0].c_str(), "TEXTURE") == 0) {
+//            geo.textureName = tokens[1].c_str();
+//            loadTexture(geo, tokens[1].c_str());
+//        }
+//
+//        utilityCore::safeGetline(fp_in, line);
+//    }
+//
+//    geo.transform = utilityCore::buildTransformationMatrix(
+//        geo.translation, geo.rotation, geo.scale);
+//    geo.inverseTransform = glm::inverse(geo.transform);
+//    geo.invTranspose = glm::inverseTranspose(geo.transform);
+//
+//
+//    Obj obj;
+//    //For each shape
+//    for (size_t i = 0; i < shapes.size(); i++) {
+//        size_t index_offset = 0;
+//        // For each face
+//        for (size_t f = 0; f < shapes[i].mesh.num_face_vertices.size(); f++) {
+//            //size_t fnum = shapes[i].mesh.num_face_vertices[f]; //should always be 3
+//            size_t fnum = 3; // hardcode loading to triangles
+//
+//            geo.type = TRIANGLE;
+//            geo.materialid = materials.size();// + shapes[i].mesh.material_ids[f];
+//            // For each vertex in the face
+//            for (size_t v = 0; v < fnum; v++) {
+//                tinyobj::index_t idx = shapes[i].mesh.indices[index_offset + v];
+//                auto ver = static_cast<size_t>(idx.vertex_index);
+//                auto x = attrib.vertices[3 * ver + 0];
+//                auto y = attrib.vertices[3 * ver + 1];
+//                auto z = attrib.vertices[3 * ver + 2];
+//                geo.pos[v] = glm::vec3(x, y, z);
+//
+//
+//                if (idx.normal_index >= 0) {// -1 means no data
+//                    auto nor = static_cast<size_t>(idx.normal_index);
+//                    auto nx = attrib.normals[3 * nor + 0];
+//                    auto ny = attrib.normals[3 * nor + 1];
+//                    auto nz = attrib.normals[3 * nor + 2];
+//                    geo.normal[v] = glm::vec3(nx, ny, nz);
+//                }
+//
+//                if (idx.texcoord_index >= 0) {
+//                    auto tex = static_cast<size_t>(idx.texcoord_index);
+//                    auto tx = attrib.texcoords[2 * tex + 0];
+//                    auto ty = attrib.texcoords[2 * tex + 1];
+//                    geo.uv[v] = glm::vec2(tx, ty);
+//                }
+//
+//            }
+//            index_offset += fnum;
+//
+//            geo.isObj = true;
+//            geoms.push_back(geo);
+//
+//            Obj_geoms.push_back(geo);
+//            obj.box = Union(obj.box, AABB(geo.pos[0], geo.pos[1], geo.pos[2]));
+//        }
+//    }
+//
+//    
+//
+//    //load materials
+//    printf("material size: %d\n", m_materials.size());
+//    for (size_t i = 0; i < m_materials.size(); i++) {
+//        Material temp;
+//
+//        //temp.name = m_materials[i].name.c_str();
+//        printf("material[%ld].name = %s\n", static_cast<long>(i),
+//            m_materials[i].name.c_str());
+//
+//        temp.color = glm::vec3(
+//            static_cast<const double>(m_materials[i].diffuse[0]),
+//            static_cast<const double>(m_materials[i].diffuse[1]),
+//            static_cast<const double>(m_materials[i].diffuse[2]));
+//        //temp.color = glm::vec3(1, 1, 1);
+//
+//        printf("  material.Kd = (%f, %f ,%f)\n",
+//            static_cast<const double>(m_materials[i].diffuse[0]),
+//            static_cast<const double>(m_materials[i].diffuse[1]),
+//            static_cast<const double>(m_materials[i].diffuse[2]));
+//
+//        temp.specular.color = glm::vec3(
+//            static_cast<const double>(m_materials[i].specular[0]),
+//            static_cast<const double>(m_materials[i].specular[1]),
+//            static_cast<const double>(m_materials[i].specular[2]));
+//        temp.specular.exponent = 10; //hardcode exponent
+//        printf("  material.Ks = (%f, %f ,%f)\n",
+//            static_cast<const double>(m_materials[i].specular[0]),
+//            static_cast<const double>(m_materials[i].specular[1]),
+//            static_cast<const double>(m_materials[i].specular[2]));
+//
+//        temp.hasReflective = 1;
+//        temp.hasRefractive = 1;
+//        temp.emittance = 0;
+//        temp.indexOfRefraction = 1.5;
+//        temp.microfacet = 1;
+//        temp.roughness = 0.5;
+//
+//        //just hardcode, only work for simgle image
+//        temp.textureName = geo.textureName;
+//        temp.img = geo.img;
+//        temp.channels = geo.channels;
+//        temp.texture_width = geo.texture_width;
+//        temp.texture_height = geo.texture_height;
+//        
+//        materials.push_back(temp);
+//        OBJ_materials.push_back(temp);
+//    }
+//
+//    return 1;
+//
+//}
+
+
+//refer to https://solarianprogrammer.com/2019/06/10/c-programming-reading-writing-images-stb_image-libraries/
+// Loads the image file `fileName` with stb_image and stores the pixel buffer together
+// with its width, height and channel count on `geo`.
+// Prints an error and terminates the process when the image cannot be loaded.
+// Returns (width, height, channels) of the loaded image.
+glm::vec3 Scene::loadTexture(Geom& geo, const char* fileName)
+{
+    int width = 0;
+    int height = 0;
+    int channels = 0;
+    geo.img = stbi_load(fileName, &width, &height, &channels, 0);
+    // Check the result before the dimensions are used: they are only meaningful on success.
+    if (geo.img == NULL) {
+        printf("Error in loading the image\n");
+        exit(1);
+    }
+    geo.texture_width = width;
+    geo.texture_height = height;
+    geo.channels = channels;
+    printf("Loaded image with a width of %dpx, a height of %dpx and %d channels\n", width, height, channels);
+
+    // The function is declared to return a glm::vec3; flowing off the end without a
+    // return statement is undefined behaviour, so report the image dimensions.
+    return glm::vec3(static_cast<float>(width), static_cast<float>(height), static_cast<float>(channels));
+}
+
+
+// Loads a Wavefront OBJ file as one MESH Geom plus one TRIANGLE Geom per face.
+//
+// The OBJ is parsed with tinyobj. The object's attribute lines (TRANS / ROTAT / SCALE /
+// TEXTURE / MATERIAL) are then read from fp_in until an empty line is reached.
+// Every face is appended to Obj_geoms with its untransformed vertex data and a copy of the
+// mesh transform matrices. The MESH Geom pushed to geoms records where its triangles start
+// in Obj_geoms (obj_start_offset), how many there are (obj_end) and their combined bounds.
+//
+// Returns 1 on success and 0 if the OBJ could not be parsed.
+int Scene::loadMesh(const char* fileName)
+{
+    printf("loading OBJ file: %s\n", fileName);
+    tinyobj::attrib_t attrib;
+    std::vector<tinyobj::shape_t> shapes;
+    std::vector<tinyobj::material_t> m_materials;
+
+    std::string warn;
+    std::string err;
+
+    // String literals are const; binding one to a plain char* is ill-formed in standard C++.
+    const char* material_dir = "../scenes";
+    // In the tinyobjloader overload that takes both warn and err, the arguments after
+    // mtl_basedir are `triangulate` and `default_vcols_fallback`. The previous call passed
+    // NULL in the triangulate position, which disabled triangulation even though the loops
+    // below read exactly three indices per face.
+    const bool triangulate = true;
+    const bool default_vcols_fallback = true;
+    bool ret = tinyobj::LoadObj(&attrib, &shapes, &m_materials, &warn, &err, fileName, material_dir,
+        triangulate, default_vcols_fallback);
+    if (!warn.empty())
+        std::cout << "WARN: " << warn << std::endl;
+    if (!err.empty())
+        std::cerr << "ERR: " << err << std::endl;
+    if (!ret) {
+        printf("Failed to load/parse .obj.\n");
+        return 0;
+    }
+
+    for (const tinyobj::shape_t& shape : shapes) {
+        printf("%s\n", shape.name.c_str());
+    }
+
+
+    Geom geo;
+    geo.obj_start_offset = Obj_geoms.size();
+    geo.type = MESH;
+    // Default material id; replaced below if the scene file has a MATERIAL line for this object.
+    geo.materialid = materials.size() + OBJ_materials.size();
+
+    string line;
+    utilityCore::safeGetline(fp_in, line);
+    while (!line.empty() && fp_in.good()) {
+        vector<string> tokens = utilityCore::tokenizeString(line);
+
+        // A line that yields no tokens carries no attribute; indexing tokens[0] would be out of range.
+        if (!tokens.empty()) {
+            const string& key = tokens[0];
+
+            //load transformations
+            if (key == "TRANS") {
+                geo.translation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+            }
+            else if (key == "ROTAT") {
+                geo.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+            }
+            else if (key == "SCALE") {
+                geo.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+            }
+            else if (key == "TEXTURE") {
+                // NOTE(review): this pointer refers to the token string, which is destroyed when
+                // `tokens` goes out of scope at the end of this iteration. textureName must not be
+                // dereferenced later unless Geom is changed to own the name.
+                geo.textureName = tokens[1].c_str();
+                loadTexture(geo, tokens[1].c_str());
+            }
+            else if (key == "MATERIAL") {
+                geo.materialid = atoi(tokens[1].c_str());
+            }
+        }
+
+        utilityCore::safeGetline(fp_in, line);
+    }
+
+    geo.transform = utilityCore::buildTransformationMatrix(
+        geo.translation, geo.rotation, geo.scale);
+    geo.inverseTransform = glm::inverse(geo.transform);
+    geo.invTranspose = glm::inverseTranspose(geo.transform);
+
+
+    // One Geom is reused as a template for every face; only the per-vertex data changes.
+    Geom triangle;
+    triangle.transform = geo.transform;
+    triangle.inverseTransform = geo.inverseTransform;
+    triangle.invTranspose = geo.invTranspose;
+    triangle.type = TRIANGLE;
+    // NOTE(review): triangles get materials.size() as their material id, not geo.materialid;
+    // per-face ids from the OBJ (mesh.material_ids) are not used. Confirm this is intended.
+    triangle.materialid = materials.size();
+
+    //For each shape
+    for (const tinyobj::shape_t& shape : shapes) {
+        size_t index_offset = 0;
+        // For each face
+        for (size_t f = 0; f < shape.mesh.num_face_vertices.size(); f++) {
+            const size_t fnum = 3; // faces are triangles because triangulation is requested above
+
+            // For each vertex in the face
+            for (size_t v = 0; v < fnum; v++) {
+                tinyobj::index_t idx = shape.mesh.indices[index_offset + v];
+                auto ver = static_cast<size_t>(idx.vertex_index);
+                auto x = attrib.vertices[3 * ver + 0];
+                auto y = attrib.vertices[3 * ver + 1];
+                auto z = attrib.vertices[3 * ver + 2];
+                triangle.pos[v] = glm::vec3(x, y, z);
+
+                if (idx.normal_index >= 0) {// -1 means no data
+                    auto nor = static_cast<size_t>(idx.normal_index);
+                    auto nx = attrib.normals[3 * nor + 0];
+                    auto ny = attrib.normals[3 * nor + 1];
+                    auto nz = attrib.normals[3 * nor + 2];
+                    triangle.normal[v] = glm::vec3(nx, ny, nz);
+                }
+
+                if (idx.texcoord_index >= 0) {
+                    auto tex = static_cast<size_t>(idx.texcoord_index);
+                    auto tx = attrib.texcoords[2 * tex + 0];
+                    auto ty = attrib.texcoords[2 * tex + 1];
+                    triangle.uv[v] = glm::vec2(tx, ty);
+                }
+
+            }
+            index_offset += fnum;
+            //push all triangles into Obj_geoms
+            Obj_geoms.push_back(triangle);
+            // Grow the mesh bounds so they also cover this triangle.
+            geo.bbox = Union(geo.bbox, AABB(triangle.pos[0], triangle.pos[1], triangle.pos[2]));
+        }
+    }
+
+    // TODO: the materials parsed from the .mtl file (m_materials) are not converted into
+    // Scene materials; neither `materials` nor `OBJ_materials` is modified here.
+
+    //push mesh to geoms
+    geo.obj_end = Obj_geoms.size() - geo.obj_start_offset;
+    geoms.push_back(geo);
+
+
+    return 1;
+
+}
+
+// Loads a Wavefront OBJ file as a list of transformed triangles for the BVH.
+//
+// The OBJ is parsed with tinyobj. The object's attribute lines (TRANS / ROTAT / SCALE /
+// TEXTURE / MATERIAL) are then read from fp_in until an empty line is reached.
+// For every triangle the positions are multiplied by the object's transform and the normals
+// by its inverse transpose. The triangle is appended to mesh_tris, and a TriBounds entry
+// (bounding box, centroid and the triangle's index) is appended to tri_bounds.
+// num_tris is incremented once per triangle.
+//
+// Returns 1 on success and 0 if the OBJ could not be parsed.
+int Scene::loadObj(const char* fileName)
+{
+    printf("loading OBJ file: %s\n", fileName);
+    tinyobj::attrib_t attrib;
+    std::vector<tinyobj::shape_t> shapes;
+    std::vector<tinyobj::material_t> m_materials;
+
+    std::string warn;
+    std::string err;
+
+    // String literals are const; binding one to a plain char* is ill-formed in standard C++.
+    const char* material_dir = "../scenes";
+    // In the tinyobjloader overload that takes both warn and err, the arguments after
+    // mtl_basedir are `triangulate` and `default_vcols_fallback`. The previous call passed
+    // NULL in the triangulate position, which disabled triangulation even though the loop
+    // below consumes the index list three entries at a time.
+    const bool triangulate = true;
+    const bool default_vcols_fallback = true;
+    bool ret = tinyobj::LoadObj(&attrib, &shapes, &m_materials, &warn, &err, fileName, material_dir,
+        triangulate, default_vcols_fallback);
+    if (!warn.empty())
+        std::cout << "WARN: " << warn << std::endl;
+    if (!err.empty())
+        std::cerr << "ERR: " << err << std::endl;
+    if (!ret) {
+        printf("Failed to load/parse .obj.\n");
+        return 0;
+    }
+
+    for (const tinyobj::shape_t& shape : shapes) {
+        printf("%s\n", shape.name.c_str());
+    }
+
+
+    Geom geo;
+    string line;
+    utilityCore::safeGetline(fp_in, line);
+    while (!line.empty() && fp_in.good()) {
+        vector<string> tokens = utilityCore::tokenizeString(line);
+
+        // A line that yields no tokens carries no attribute; indexing tokens[0] would be out of range.
+        if (!tokens.empty()) {
+            const string& key = tokens[0];
+
+            //load transformations
+            if (key == "TRANS") {
+                geo.translation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+            }
+            else if (key == "ROTAT") {
+                geo.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+            }
+            else if (key == "SCALE") {
+                geo.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+            }
+            else if (key == "TEXTURE") {
+                // NOTE(review): this pointer refers to the token string, which is destroyed at the
+                // end of this iteration; it must not be dereferenced afterwards.
+                geo.textureName = tokens[1].c_str();
+                loadTexture(geo, tokens[1].c_str());
+            }
+            else if (key == "MATERIAL") {
+                geo.materialid = atoi(tokens[1].c_str());
+            }
+        }
+
+        utilityCore::safeGetline(fp_in, line);
+    }
+    // NOTE(review): `geo` is local and is not stored anywhere, so the texture loaded above and
+    // the parsed MATERIAL id are dropped when this function returns, and Tri::mat_ID is never
+    // assigned here. Confirm whether the triangles are meant to carry geo.materialid.
+
+    geo.transform = utilityCore::buildTransformationMatrix(
+        geo.translation, geo.rotation, geo.scale);
+    geo.inverseTransform = glm::inverse(geo.transform);
+    geo.invTranspose = glm::inverseTranspose(geo.transform);
+
+    //For each shape
+    for (const tinyobj::shape_t& shape : shapes) {
+        const std::vector<tinyobj::index_t>& indices = shape.mesh.indices;
+
+        // every tri in the mesh; `i + 2 < size` keeps all three reads inside the index list
+        for (size_t i = 0; i + 2 < indices.size(); i += 3) {
+            Tri newTri;
+            newTri.transform = geo.transform;
+            newTri.inverseTransform = geo.inverseTransform;
+            newTri.invTranspose = geo.invTranspose;
+
+            // Address the three corners uniformly instead of branching on the corner number.
+            glm::vec3* positions[3] = { &newTri.p0, &newTri.p1, &newTri.p2 };
+            glm::vec3* normals[3] = { &newTri.n0, &newTri.n1, &newTri.n2 };
+            glm::vec2* uvs[3] = { &newTri.t0, &newTri.t1, &newTri.t2 };
+            bool has_normal[3] = { false, false, false };
+
+            for (int k = 0; k < 3; ++k) {
+                const tinyobj::index_t& idx = indices[i + k];
+
+                // Start every corner from zero so missing data is never inherited from the
+                // previous corner. An index of -1 means the OBJ has no data for that attribute.
+                *positions[k] = glm::vec3(0.0f);
+                *normals[k] = glm::vec3(0.0f);
+                *uvs[k] = glm::vec2(0.0f);
+
+                if (idx.vertex_index != -1) {
+                    *positions[k] = glm::vec3(attrib.vertices[3 * idx.vertex_index + 0],
+                        attrib.vertices[3 * idx.vertex_index + 1],
+                        attrib.vertices[3 * idx.vertex_index + 2]);
+                }
+
+                if (idx.texcoord_index != -1) {
+                    // The v coordinate is flipped (1 - v) when it is stored.
+                    *uvs[k] = glm::vec2(
+                        attrib.texcoords[2 * idx.texcoord_index + 0],
+                        1.0f - attrib.texcoords[2 * idx.texcoord_index + 1]
+                    );
+                }
+
+                if (idx.normal_index != -1) {
+                    *normals[k] = glm::vec3(
+                        attrib.normals[3 * idx.normal_index + 0],
+                        attrib.normals[3 * idx.normal_index + 1],
+                        attrib.normals[3 * idx.normal_index + 2]
+                    );
+                    has_normal[k] = true;
+                }
+            }
+
+            for (int k = 0; k < 3; ++k) {
+                *positions[k] = multiplyMV(newTri.transform, *positions[k]);
+            }
+
+            // The cross product of two edges gives the face normal direction; its length is
+            // stored in S.
+            const glm::vec3 edge_cross = glm::cross(newTri.p1 - newTri.p0, newTri.p2 - newTri.p1);
+            newTri.plane_normal = glm::normalize(edge_cross);
+            newTri.S = glm::length(edge_cross);
+
+            for (int k = 0; k < 3; ++k) {
+                if (has_normal[k]) {
+                    *normals[k] = glm::normalize(multiplyMV(newTri.invTranspose, *normals[k]));
+                }
+                else {
+                    // Normalizing a zero vector yields NaN, so corners without a normal in the
+                    // OBJ use the face normal instead.
+                    *normals[k] = newTri.plane_normal;
+                }
+            }
+
+
+            TriBounds newTriBounds;
+
+            // Index this triangle will have in mesh_tris once it is pushed below.
+            newTriBounds.tri_ID = num_tris;
+
+            newTriBounds.AABB_max = glm::max(glm::max(newTri.p0, newTri.p1), newTri.p2);
+            newTriBounds.AABB_min = glm::min(glm::min(newTri.p0, newTri.p1), newTri.p2);
+            newTriBounds.AABB_centroid = (newTri.p0 + newTri.p1 + newTri.p2) / 3.0f;
+
+            tri_bounds.push_back(newTriBounds);
+
+            mesh_tris.push_back(newTri);
+            num_tris++;
+        }
+    }
+    return 1;
+}
+
+// Recursively builds the BVH subtree that covers tri_bounds[start_index, end_index) and
+// returns its root.
+//
+// Each leaf references exactly one triangle: the triangle is appended to mesh_tris_sorted and
+// the leaf's tri_index is its position there. Interior nodes have tri_index == -1, two
+// children, the axis they were split on, and bounds equal to the union of their children's.
+// tri_bounds is reordered in place by the partition step, and num_nodes is incremented once
+// per node created.
+//
+// The range is expected to be non-empty (a leaf reads tri_bounds[start_index]).
+// Nodes are allocated with new; nothing in this function frees them.
+BVHNode* Scene::buildBVH(int start_index, int end_index) {
+    BVHNode* new_node = new BVHNode();
+    num_nodes++;
+    const int num_tris_in_node = end_index - start_index;
+
+    // leaf node (with 1 tri in it)
+    if (num_tris_in_node <= 1) {
+        // get the AABB bounds for this node (min and max of the triangles within).
+        // The +/-100000 seeds mean coordinates are assumed to lie inside that range.
+        glm::vec3 max_bounds = glm::vec3(-100000.0);
+        glm::vec3 min_bounds = glm::vec3(100000.0);
+        for (int i = start_index; i < end_index; ++i) {
+            max_bounds = glm::max(max_bounds, tri_bounds[i].AABB_max);
+            min_bounds = glm::min(min_bounds, tri_bounds[i].AABB_min);
+        }
+
+        mesh_tris_sorted.push_back(mesh_tris[tri_bounds[start_index].tri_ID]);
+        new_node->tri_index = mesh_tris_sorted.size() - 1;
+        new_node->AABB_max = max_bounds;
+        new_node->AABB_min = min_bounds;
+        return new_node;
+    }
+
+    // intermediate node (covering tris start_index through end_index)
+
+    // get the extent of the tri centroids in each direction x, y, and z
+    glm::vec3 centroid_max = glm::vec3(-100000.0);
+    glm::vec3 centroid_min = glm::vec3(100000.0);
+    for (int i = start_index; i < end_index; ++i) {
+        centroid_max = glm::max(centroid_max, tri_bounds[i].AABB_centroid);
+        centroid_min = glm::min(centroid_min, tri_bounds[i].AABB_centroid);
+    }
+    const glm::vec3 centroid_extent = centroid_max - centroid_min;
+
+    // choose dimension to split along (dimension with largest extent)
+    int dimension_to_split = 2;
+    if (centroid_extent.x >= centroid_extent.y && centroid_extent.x >= centroid_extent.z) {
+        dimension_to_split = 0;
+    }
+    else if (centroid_extent.y >= centroid_extent.x && centroid_extent.y >= centroid_extent.z) {
+        dimension_to_split = 1;
+    }
+
+    // Fallback split position: the median index. With at least two triangles in the range it
+    // is always strictly between start_index and end_index, so both children are non-empty.
+    int mid_point = (start_index + end_index) / 2;
+
+    // When every centroid coincides on the split axis there is nothing to partition by. The
+    // node used to become a leaf holding only the first triangle, which silently dropped the
+    // rest of the range; the median split keeps every triangle in the tree.
+    if (centroid_min[dimension_to_split] != centroid_max[dimension_to_split]) {
+        const float centroid_midpoint = (centroid_min[dimension_to_split] + centroid_max[dimension_to_split]) / 2;
+
+        // partition triangles in bounding box, ones with centroids less than the midpoint go before ones with greater than
+        // using std::partition for partition algorithm
+        // https://en.cppreference.com/w/cpp/algorithm/partition
+        TriBounds* range_begin = &tri_bounds[start_index];
+        TriBounds* range_end = range_begin + num_tris_in_node;
+        TriBounds* pointer_to_partition_point = std::partition(range_begin, range_end,
+            [dimension_to_split, centroid_midpoint](const TriBounds& triangle_AABB) {
+                return triangle_AABB.AABB_centroid[dimension_to_split] < centroid_midpoint;
+            });
+
+        // get the partition point as an index into tri_bounds
+        const int partition_index = start_index + static_cast<int>(pointer_to_partition_point - range_begin);
+
+        // Floating-point rounding of the midpoint can leave one side empty; recursing on an
+        // unchanged range would never terminate, so keep the median in that case.
+        if (partition_index > start_index && partition_index < end_index) {
+            mid_point = partition_index;
+        }
+    }
+
+    // create two children nodes each for one side of the partitioned node
+    new_node->child_nodes[0] = buildBVH(start_index, mid_point);
+    new_node->child_nodes[1] = buildBVH(mid_point, end_index);
+
+    new_node->split_axis = dimension_to_split;
+    new_node->tri_index = -1;
+
+    // An interior node's box is the union of its children's boxes.
+    new_node->AABB_max = glm::max(new_node->child_nodes[0]->AABB_max, new_node->child_nodes[1]->AABB_max);
+    new_node->AABB_min = glm::min(new_node->child_nodes[0]->AABB_min, new_node->child_nodes[1]->AABB_min);
+    return new_node;
+}
+
+// Flattens the pointer-based BVH rooted at root_node into bvh_nodes_gpu in depth-first order.
+//
+// A node's first child is stored directly after the node itself. The position of its second
+// child is written into the node's offset_to_second_child once that child is reached; the
+// stored value is an index into bvh_nodes_gpu. Leaves keep their tri_index; interior nodes get
+// tri_index == -1 and the axis they were split on. Nodes are appended to bvh_nodes_gpu, which
+// is not cleared first.
+void Scene::reformatBVHToGPU() {
+    // One entry per node still to be emitted. parent_to_patch is the index of the node whose
+    // offset_to_second_child must point at this node, or -1 when nothing needs patching
+    // (the root and every first child).
+    struct PendingNode {
+        BVHNode* node;
+        int parent_to_patch;
+    };
+
+    std::stack<PendingNode> pending;
+    pending.push(PendingNode{ root_node, -1 });
+
+    while (!pending.empty()) {
+        const PendingNode current = pending.top();
+        pending.pop();
+
+        // Index this node will occupy once it is pushed at the end of the iteration.
+        const int current_index = static_cast<int>(bvh_nodes_gpu.size());
+        if (current.parent_to_patch != -1) {
+            bvh_nodes_gpu[current.parent_to_patch].offset_to_second_child = current_index;
+        }
+
+        // Value-initialize so that fields a leaf never sets (axis, offset_to_second_child)
+        // are zero instead of indeterminate.
+        BVHNode_GPU new_gpu_node{};
+        new_gpu_node.AABB_min = current.node->AABB_min;
+        new_gpu_node.AABB_max = current.node->AABB_max;
+        if (current.node->tri_index != -1) {
+            // leaf node
+            new_gpu_node.tri_index = current.node->tri_index;
+        }
+        else {
+            // intermediate node
+            new_gpu_node.axis = current.node->split_axis;
+            new_gpu_node.tri_index = -1;
+            // Push the second child first so the first child is popped (and stored) next.
+            pending.push(PendingNode{ current.node->child_nodes[1], current_index });
+            pending.push(PendingNode{ current.node->child_nodes[0], -1 });
+        }
+        bvh_nodes_gpu.push_back(new_gpu_node);
+    }
+}
\ No newline at end of file
diff --git a/src/scene.h b/src/scene.h
index f29a9171..8884cadd 100644
--- a/src/scene.h
+++ b/src/scene.h
@@ -16,11 +16,34 @@ class Scene {
     int loadMaterial(string materialid);
     int loadGeom(string objectid);
     int loadCamera();
+    int loadObj(const char* fileName);
+    int loadMesh(const char* fileName);
+    glm::vec3 loadTexture(Geom &geo, const char* fileName);
 public:
     Scene(string filename);
     ~Scene();
 
     std::vector<Geom> geoms;
     std::vector<Material> materials;
+
+    //std::vector<Material> m_materials;
+
+    std::vector<Material> OBJ_materials;
+
     RenderState state;
+
+    std::vector<Geom> Obj_geoms;
+    
+    BVHNode* buildBVH(int start_index, int end_index); // builds the BVH subtree over tri_bounds[start_index, end_index) and returns its root
+    void reformatBVHToGPU(); // flattens the tree at root_node into bvh_nodes_gpu
+
+    int num_tris = 0; // triangles appended to mesh_tris by loadObj; also used as the next TriBounds::tri_ID
+    int num_geoms = 0; // NOTE(review): not touched in the visible part of this change
+    std::vector<Tri> mesh_tris; // triangles in load order (filled by loadObj)
+    std::vector<Tri> mesh_tris_sorted; // triangles in the order BVH leaves were created (filled by buildBVH)
+    BVHNode* root_node; // tree that reformatBVHToGPU starts from; has no initializer here
+    int num_nodes = 0; // incremented once per node created by buildBVH
+    std::vector<BVHNode_GPU> bvh_nodes_gpu; // flattened nodes appended by reformatBVHToGPU
+    std::vector<TriBounds> tri_bounds; // per-triangle bounds from loadObj; reordered in place by buildBVH
+
 };
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index da4dbf30..a2f3e59f 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -4,12 +4,15 @@
 #include <vector>
 #include <cuda_runtime.h>
 #include "glm/glm.hpp"
+#include <array>
 
 #define BACKGROUND_COLOR (glm::vec3(0.0f))
 
 enum GeomType {
     SPHERE,
     CUBE,
+    MESH,
+    TRIANGLE
 };
 
 struct Ray {
@@ -17,6 +20,51 @@ struct Ray {
     glm::vec3 direction;
 };
 
+// Axis-aligned bounding box described by its minimum corner (pMin) and maximum corner (pMax).
+struct AABB {
+    glm::vec3 pMin, pMax;
+
+    // Creates an empty box: pMin is the largest float and pMax the most negative one, so the
+    // first Union with a real box yields exactly that box.
+    // (FLT_MIN is the smallest positive float, not the most negative value, so it cannot serve
+    // as a lower bound.)
+    AABB()
+    {
+        pMin = glm::vec3(FLT_MAX);
+        pMax = glm::vec3(-FLT_MAX);
+    }
+
+    // Creates the tightest box around three points (for example the corners of a triangle).
+    AABB(glm::vec3 p1, glm::vec3 p2, glm::vec3 p3) {
+        pMin = glm::vec3(fmin(p1.x, p2.x), fmin(p1.y, p2.y), fmin(p1.z, p2.z));
+        pMax = glm::vec3(fmax(p1.x, p2.x), fmax(p1.y, p2.y), fmax(p1.z, p2.z));
+
+        pMin = glm::vec3(fmin(pMin.x, p3.x), fmin(pMin.y, p3.y), fmin(pMin.z, p3.z));
+        pMax = glm::vec3(fmax(pMax.x, p3.x), fmax(pMax.y, p3.y), fmax(pMax.z, p3.z));
+
+    }
+
+    // Returns true when the ray hits the box at some parameter t >= 0.
+    // For every axis the ray parameters at the two bounding planes are computed; the ray is
+    // inside the box between the largest entry value (t0) and the smallest exit value (t1).
+    bool IntersectP(const Ray& ray) const
+    {
+        const glm::vec3 invDir = glm::vec3(1 / ray.direction.x, 1 / ray.direction.y, 1 / ray.direction.z);
+
+        const glm::vec3 ttop = (pMax - ray.origin) * invDir;
+        const glm::vec3 tbot = (pMin - ray.origin) * invDir;
+
+        // Per axis, the smaller parameter is where the ray enters and the larger where it leaves.
+        const glm::vec3 tmin = glm::min(ttop, tbot);
+        const glm::vec3 tmax = glm::max(ttop, tbot);
+
+        const float t0 = glm::max(tmin.x, glm::max(tmin.y, tmin.z));
+        const float t1 = glm::min(tmax.x, glm::min(tmax.y, tmax.z));
+        return t0 <= t1 && t1 >= 0;
+    }
+};
+
+
 struct Geom {
     enum GeomType type;
     int materialid;
@@ -26,6 +74,23 @@ struct Geom {
     glm::mat4 transform;
     glm::mat4 inverseTransform;
     glm::mat4 invTranspose;
+
+    glm::vec3 pos[3];
+    glm::vec3 normal[3];
+    glm::vec2 uv[3];
+    bool isObj{ false };
+
+    const char* textureName;
+    unsigned char* img;
+    int texture_width;
+    int texture_height;
+    int channels;
+
+    AABB bbox;
+    int obj_start_offset;
+    int obj_end;
+
+    glm::vec3 endPos;
 };
 
 struct Material {
@@ -34,10 +99,19 @@ struct Material {
         float exponent;
         glm::vec3 color;
     } specular;
-    float hasReflective;
-    float hasRefractive;
-    float indexOfRefraction;
-    float emittance;
+    float hasReflective{0};
+    float hasRefractive{0};
+    float indexOfRefraction{0};
+    float emittance{0};
+    float microfacet{0};
+    float roughness{0};
+    float metalness{ 0 };
+
+    const char* textureName;
+    unsigned char* img;
+    int texture_width;
+    int texture_height;
+    int channels;
 };
 
 struct Camera {
@@ -49,6 +123,8 @@ struct Camera {
     glm::vec3 right;
     glm::vec2 fov;
     glm::vec2 pixelLength;
+    float lensRadius;
+    float focalDistance;
 };
 
 struct RenderState {
@@ -73,4 +149,76 @@ struct ShadeableIntersection {
   float t;
   glm::vec3 surfaceNormal;
   int materialId;
+  glm::vec2 uv;
 };
+
+struct Triangle { // per-vertex data for one triangle plus a material index
+    glm::vec3 pos[3]; // one position per vertex
+    glm::vec3 normal[3]; // one normal per vertex
+    glm::vec2 uv[3]; // one texture coordinate per vertex
+    int materialId; // NOTE(review): presumably an index into the scene's materials; this struct is not used in the visible part of the change
+};
+
+
+// Returns the box that encloses both b1 and b2: the component-wise minimum of the two
+// minimum corners and the component-wise maximum of the two maximum corners.
+static AABB Union(const AABB& b1, const AABB& b2) {
+    AABB merged;
+    merged.pMin = glm::min(b1.pMin, b2.pMin);
+    merged.pMax = glm::max(b1.pMax, b2.pMax);
+    return merged;
+}
+
+struct Obj { // pairs a bounding box with a pointer to a Geom
+    AABB box; // bounding box
+    Geom* data; // NOTE(review): ownership and lifetime of the pointee are not visible here
+};
+
+
+struct TriBounds { // bounding data for one loaded triangle; filled by Scene::loadObj and consumed by Scene::buildBVH
+    glm::vec3 AABB_min; // component-wise minimum of the triangle's three transformed positions
+    glm::vec3 AABB_max; // component-wise maximum of the triangle's three transformed positions
+    glm::vec3 AABB_centroid; // average of the three transformed positions; buildBVH partitions on it
+    int tri_ID; // index of the triangle in Scene::mesh_tris
+};
+
+struct BVHNode { // node of the pointer-based BVH built by Scene::buildBVH
+    glm::vec3 AABB_min; // minimum corner of the node's bounding box
+    glm::vec3 AABB_max; // maximum corner of the node's bounding box
+    BVHNode* child_nodes[2]; // assigned by buildBVH for interior nodes only
+    int split_axis; // 0 = x, 1 = y, 2 = z; assigned by buildBVH for interior nodes only
+    int tri_index; // leaf: index into Scene::mesh_tris_sorted; interior node: -1
+};
+
+struct BVHNode_GPU { // pointer-free BVH node stored in Scene::bvh_nodes_gpu by Scene::reformatBVHToGPU
+    glm::vec3 AABB_min; // copied from the corresponding BVHNode
+    glm::vec3 AABB_max; // copied from the corresponding BVHNode
+    int tri_index; // leaf: the BVHNode's tri_index; interior node: -1
+    int offset_to_second_child; // index in bvh_nodes_gpu of the second child, written when that child is emitted
+    int axis; // split axis copied from BVHNode::split_axis; assigned for interior nodes only
+};
+
+struct Tri { // one mesh triangle as produced by Scene::loadObj
+    // positions (stored after being multiplied by `transform` in loadObj)
+    glm::vec3 p0;
+    glm::vec3 p1;
+    glm::vec3 p2;
+    // normals (multiplied by `invTranspose` and normalized in loadObj)
+    glm::vec3 n0;
+    glm::vec3 n1;
+    glm::vec3 n2;
+    // uvs (loadObj stores the v coordinate as 1 - v)
+    glm::vec2 t0;
+    glm::vec2 t1;
+    glm::vec2 t2;
+    //transforms (copied from the owning object's Geom in loadObj)
+    glm::mat4 transform;
+    glm::mat4 inverseTransform;
+    glm::mat4 invTranspose;
+    // plane normal: normalized cross product of two edges of the transformed triangle
+    glm::vec3 plane_normal;
+    float S; // length of that same cross product before normalization
+    int mat_ID; // NOTE(review): not assigned in the visible loading code
+};
\ No newline at end of file