pinterf · ignus2 · Aug 23, 2016 · Aug 23, 2016 · Aug 23, 2016 · Aug 24, 2016
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -36,6 +36,7 @@ IF( MSVC_IDE )  # Check for Visual Studio
   # Enable C++ with SEH exceptions
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /EHa")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHa")
+  # Alternative: add_compile_options(/EHa), which is available from CMake 2.8.12 onwards.
 
   # Prevent VC++ from complaining about not using MS-specific functions
   add_definitions("/D _CRT_SECURE_NO_WARNINGS /D _SECURE_SCL=0")
@@ -48,8 +49,9 @@ IF( MSVC_IDE )  # Check for Visual Studio
   if(CMAKE_SIZEOF_VOID_P EQUAL 4)
     # VC++ enables the SSE2 instruction set by default even on 32-bits. Step back a bit.
     add_definitions("/arch:SSE")
+    #add_definitions("/arch:SSE2") # Better use this one, it's 2016 now
   endif() 
-  
+
   # Set additional optimization flags
   set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Oy /Ot /GS-")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oy /Ot /GS-")

diff --git a/avs_core/CMakeLists.txt b/avs_core/CMakeLists.txt
@@ -25,6 +25,14 @@ foreach(FILE ${AvsCore_Sources})
   source_group("${GROUP}" FILES "${FILE}")
 endforeach()
 
+# Per-file SIMD flags for *_avx.cpp / *_avx2.cpp; "/arch:" is MSVC syntax, so apply it for MSVC only
+file(GLOB_RECURSE SRCS_AVX "*_avx.cpp")
+file(GLOB_RECURSE SRCS_AVX2 "*_avx2.cpp")
+if(MSVC)
+  set_source_files_properties(${SRCS_AVX} PROPERTIES COMPILE_FLAGS " /arch:AVX ")
+  set_source_files_properties(${SRCS_AVX2} PROPERTIES COMPILE_FLAGS " /arch:AVX2 ")
+endif()
+
 # Specify include directories
 target_include_directories("AvsCore" PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 # Specify preprocessor definitions

diff --git a/avs_core/convert/convert.cpp b/avs_core/convert/convert.cpp
diff --git a/avs_core/convert/convert.h b/avs_core/convert/convert.h
@@ -37,7 +37,7 @@
 
 #include "../core/internal.h"
 
-enum {Rec601=0, Rec709=1, PC_601=2, PC_709=3, AVERAGE=4 };
+enum {Rec601=0, Rec709=1, PC_601=2, PC_709=3, AVERAGE=4, Rec2020=5 }; // matrix ids; presumably the values getMatrix() returns - TODO confirm
 int getMatrix( const char* matrix, IScriptEnvironment* env);
 
 /*****************************************************
@@ -66,7 +66,7 @@ inline int RGB2YUV(int rgb)
  *******   Colorspace GenericVideoFilter Classes   ******
  *******************************************************/
 
-
+// YUY2 only
 class ConvertToRGB : public GenericVideoFilter 
 /**
   * Class to handle conversion to RGB & RGBA
@@ -80,15 +80,14 @@ class ConvertToRGB : public GenericVideoFilter
     return cachehints == CACHE_GET_MTMODE ? MT_NICE_FILTER : 0;
   }
 
-  static AVSValue __cdecl Create(AVSValue args, void*, IScriptEnvironment* env);  
-  static AVSValue __cdecl Create32(AVSValue args, void*, IScriptEnvironment* env);
-  static AVSValue __cdecl Create24(AVSValue args, void*, IScriptEnvironment* env);
+  static AVSValue __cdecl Create(AVSValue args, void* user_data, IScriptEnvironment* env);
 
 private:
   int theMatrix;
-  enum {Rec601=0, Rec709=1, PC_601=2, PC_709=3 };	
+  enum {Rec601=0, Rec709=1, PC_601=2, PC_709=3};	
 };
 
+// YUY2 only
 class ConvertToYV12 : public GenericVideoFilter 
 /**
   * Class for conversions to YV12

diff --git a/avs_core/convert/convert_avx.cpp b/avs_core/convert/convert_avx.cpp
@@ -0,0 +1,121 @@
+// Avisynth v2.5.  Copyright 2002-2009 Ben Rudiak-Gould et al.
+// http://www.avisynth.org
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+
+#include "convert.h"
+#include "convert_planar.h"
+#include "convert_rgb.h"
+#include "convert_yv12.h"
+#include "convert_yuy2.h"
+#include <avs/alignment.h>
+#include <avs/win.h>
+#include <avs/minmax.h>
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <tuple>
+#include <map>
+
+#include "convert_avx.h"
+
+template<typename pixel_t, uint8_t targetbits>
+void convert_32_to_uintN_c_avx(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range)
+{
+  // Converts 32-bit float samples to pixel_t integers with targetbits significant bits:
+  // scale by max/float_range, add 0.5, clamp to [0, max]. Pitches and rowsize are in bytes.
+  src_pitch /= sizeof(float);   // from here on counted in float elements
+  dst_pitch /= sizeof(pixel_t); // from here on counted in pixel_t elements
+  const int pixels_per_row = src_rowsize / sizeof(float);
+
+  const float dst_max = static_cast<float>((1 << targetbits) - 1); // 255, 1023, 4095, 16383 or 65535
+  const float scale = 1.0f / float_range * dst_max;
+
+  const float *src_row = reinterpret_cast<const float *>(srcp);
+  pixel_t *dst_row = reinterpret_cast<pixel_t *>(dstp);
+
+  for (int row = 0; row < src_height; ++row, dst_row += dst_pitch, src_row += src_pitch)
+  {
+    for (int col = 0; col < pixels_per_row; ++col)
+    {
+      // + 0.5f: keeps the neutral grey level of float 0.5
+      const float scaled = src_row[col] * scale + 0.5f;
+      // out-of-range values are clamped before the narrowing conversion
+      dst_row[col] = pixel_t(clamp(scaled, 0.0f, dst_max));
+    }
+  }
+  _mm256_zeroupper(); // explicit vzeroupper before returning
+}
+// Explicit instantiations: uint8_t for 8-bit output, uint16_t for 10/12/14/16-bit output.
+template void convert_32_to_uintN_c_avx<uint8_t, 8>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_32_to_uintN_c_avx<uint16_t, 10>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_32_to_uintN_c_avx<uint16_t, 12>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_32_to_uintN_c_avx<uint16_t, 14>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_32_to_uintN_c_avx<uint16_t, 16>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+
+
+// YUV bit depth change between 10/12/14/16 bits stored in uint16_t, done by a plain bit shift:
+// left by shiftbits when expandrange is true, right by shiftbits otherwise. float_range is unused.
+template<bool expandrange, uint8_t shiftbits>
+void convert_uint16_to_uint16_c_avx(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range)
+{
+    // byte-based pitches and rowsize become uint16_t element counts
+    src_pitch /= sizeof(uint16_t);
+    dst_pitch /= sizeof(uint16_t);
+    const int pixels_per_row = src_rowsize / sizeof(uint16_t);
+
+    const uint16_t *src_row = reinterpret_cast<const uint16_t *>(srcp);
+    uint16_t *dst_row = reinterpret_cast<uint16_t *>(dstp);
+
+    // walk the frame row by row; each row pointer advances by its own pitch
+    for (int row = 0; row < src_height; ++row, dst_row += dst_pitch, src_row += src_pitch)
+    {
+        for (int col = 0; col < pixels_per_row; ++col)
+        {
+            // expandrange is a compile-time constant of the instantiation.
+            // Expanding does not clamp first: the source is assumed to have a valid range.
+            dst_row[col] = expandrange ? (src_row[col] << shiftbits)   // expand range
+                                       : (src_row[col] >> shiftbits);  // reduce range
+        }
+    }
+
+    _mm256_zeroupper(); // explicit vzeroupper before returning
+}
+
+// Explicit instantiations: right shifts (false) and left shifts (true) by 2, 4 and 6 bits.
+template void convert_uint16_to_uint16_c_avx<false, 2>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx<false, 4>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx<false, 6>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx<true, 2>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx<true, 4>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx<true, 6>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+
diff --git a/avs_core/convert/convert_avx.h b/avs_core/convert/convert_avx.h
@@ -0,0 +1,46 @@
+// Avisynth v2.5.  Copyright 2002 Ben Rudiak-Gould et al.
+// http://www.avisynth.org
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+#ifndef __Convert_AVX_H__
+#define __Convert_AVX_H__
+
+#include "../core/internal.h"
+// Shifts 16-bit samples left (expandrange) or right by shiftbits; defined in convert_avx.cpp.
+template<bool expandrange, uint8_t shiftbits>
+void convert_uint16_to_uint16_c_avx(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+// Converts 32-bit float samples to pixel_t with targetbits bits; defined in convert_avx.cpp.
+template<typename pixel_t, uint8_t targetbits>
+void convert_32_to_uintN_c_avx(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+
+#endif  // __Convert_AVX_H__
diff --git a/avs_core/convert/convert_avx2.cpp b/avs_core/convert/convert_avx2.cpp
@@ -0,0 +1,120 @@
+// Avisynth v2.5.  Copyright 2002-2009 Ben Rudiak-Gould et al.
+// http://www.avisynth.org
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// Linking Avisynth statically or dynamically with other modules is making a
+// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
+// General Public License cover the whole combination.
+//
+// As a special exception, the copyright holders of Avisynth give you
+// permission to link Avisynth with independent modules that communicate with
+// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
+// terms of these independent modules, and to copy and distribute the
+// resulting combined work under terms of your choice, provided that
+// every copy of the combined work is accompanied by a complete copy of
+// the source code of Avisynth (the version of Avisynth used to produce the
+// combined work), being distributed under the terms of the GNU General
+// Public License plus this exception.  An independent module is a module
+// which is not derived from or based on Avisynth, such as 3rd-party filters,
+// import and export plugins, or graphical user interfaces.
+
+
+#include "convert.h"
+#include "convert_planar.h"
+#include "convert_rgb.h"
+#include "convert_yv12.h"
+#include "convert_yuy2.h"
+#include <avs/alignment.h>
+#include <avs/win.h>
+#include <avs/minmax.h>
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <tuple>
+#include <map>
+
+#include "convert_avx2.h"
+
+template<typename pixel_t, uint8_t targetbits>
+void convert_32_to_uintN_c_avx2(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range)
+{
+  // 32-bit float -> pixel_t integer samples with targetbits significant bits.
+  // Each sample is scaled by max/float_range, offset by 0.5 and clamped to [0, max].
+  const float dst_max = static_cast<float>((1 << targetbits) - 1); // 255, 1023, 4095, 16383 or 65535
+  const float scale = 1.0f / float_range * dst_max;
+
+  // incoming pitches and rowsize are byte counts; work in elements from here on
+  src_pitch /= sizeof(float);
+  dst_pitch /= sizeof(pixel_t);
+  const int pixels_per_row = src_rowsize / sizeof(float);
+
+  const float *src_row = reinterpret_cast<const float *>(srcp);
+  pixel_t *dst_row = reinterpret_cast<pixel_t *>(dstp);
+  for (int row = 0; row < src_height; ++row, dst_row += dst_pitch, src_row += src_pitch)
+  {
+    for (int col = 0; col < pixels_per_row; ++col)
+    {
+      // + 0.5f: keeps the neutral grey level of float 0.5
+      const float scaled = src_row[col] * scale + 0.5f;
+      // clamp first, then narrow to the destination pixel type
+      dst_row[col] = pixel_t(clamp(scaled, 0.0f, dst_max));
+    }
+  }
+}
+// Explicit instantiations: uint8_t for 8-bit output, uint16_t for 10/12/14/16-bit output.
+template void convert_32_to_uintN_c_avx2<uint8_t, 8>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_32_to_uintN_c_avx2<uint16_t, 10>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_32_to_uintN_c_avx2<uint16_t, 12>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_32_to_uintN_c_avx2<uint16_t, 14>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_32_to_uintN_c_avx2<uint16_t, 16>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+
+// YUV bit depth change between 10/12/14/16 bits stored in uint16_t, done by a plain bit shift:
+// left by shiftbits when expandrange is true, right by shiftbits otherwise. float_range is unused.
+template<bool expandrange, uint8_t shiftbits>
+void convert_uint16_to_uint16_c_avx2(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range)
+{
+    // byte-based pitches and rowsize become uint16_t element counts
+    src_pitch /= sizeof(uint16_t);
+    dst_pitch /= sizeof(uint16_t);
+    const int pixels_per_row = src_rowsize / sizeof(uint16_t);
+
+    const uint16_t *src_row = reinterpret_cast<const uint16_t *>(srcp);
+    uint16_t *dst_row = reinterpret_cast<uint16_t *>(dstp);
+
+    // walk the frame row by row; each row pointer advances by its own pitch
+    for (int row = 0; row < src_height; ++row, dst_row += dst_pitch, src_row += src_pitch)
+    {
+        for (int col = 0; col < pixels_per_row; ++col)
+        {
+            // expandrange is a compile-time constant of the instantiation.
+            // Expanding does not clamp first: the source is assumed to have a valid range.
+            dst_row[col] = expandrange ? (src_row[col] << shiftbits)   // expand range
+                                       : (src_row[col] >> shiftbits);  // reduce range
+        }
+    }
+
+    // No explicit _mm256_zeroupper(): per the original note, vzeroupper is placed here automatically if ymm registers are used
+}
+
+// Explicit instantiations: right shifts (false) and left shifts (true) by 2, 4 and 6 bits.
+template void convert_uint16_to_uint16_c_avx2<false, 2>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx2<false, 4>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx2<false, 6>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx2<true, 2>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx2<true, 4>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+template void convert_uint16_to_uint16_c_avx2<true, 6>(const BYTE *srcp, BYTE *dstp, int src_rowsize, int src_height, int src_pitch, int dst_pitch, float float_range);
+
+