diff --git a/cmake/BuildSettings.cmake b/cmake/BuildSettings.cmake
index d0c4312..67babd3 100644
--- a/cmake/BuildSettings.cmake
+++ b/cmake/BuildSettings.cmake
@@ -3,7 +3,7 @@ if(${MSVC})
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
     set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /debug")
-elseif(${UNIX})
+else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
     if(NOT ${CYGWIN})    # Don't specify -pthread on Cygwin
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
diff --git a/common/sema.h b/common/sema.h
index 62d17ba..39fa827 100644
--- a/common/sema.h
+++ b/common/sema.h
@@ -8,6 +8,7 @@
 
 #include <atomic>
 #include <cassert>
+#include <type_traits>
 
 
 #if defined(_WIN32)
@@ -15,14 +16,23 @@
 // Semaphore (Windows)
 //---------------------------------------------------------
 
-#include <windows.h>
-#undef min
-#undef max
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+    struct _SECURITY_ATTRIBUTES;
+    __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+    __declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+    __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+    __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
 
 class Semaphore
 {
 private:
-    HANDLE m_hSema;
+    void* m_hSema;
 
     Semaphore(const Semaphore& other) = delete;
     Semaphore& operator=(const Semaphore& other) = delete;
@@ -31,7 +41,8 @@ class Semaphore
     Semaphore(int initialCount = 0)
     {
         assert(initialCount >= 0);
-        m_hSema = CreateSemaphore(NULL, initialCount, MAXLONG, NULL);
+        const long maxLong = 0x7fffffff;
+        m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
     }
 
     ~Semaphore()
@@ -41,12 +52,13 @@ class Semaphore
 
     void wait()
    {
-        WaitForSingleObject(m_hSema, INFINITE);
+        const unsigned long infinite = 0xffffffff;
+        WaitForSingleObject(m_hSema, infinite);
     }
 
     void signal(int count = 1)
     {
-        ReleaseSemaphore(m_hSema, count, NULL);
+        ReleaseSemaphore(m_hSema, count, nullptr);
     }
 };
 
@@ -164,13 +176,22 @@ class Semaphore
 //---------------------------------------------------------
 class LightweightSemaphore
 {
+public:
+    // The underlying semaphores are limited to int-sized counts,
+    // but there's no reason we can't scale higher on platforms with
+    // a wider size_t than int -- the only counts we pass on to the
+    // underlying semaphores are the number of waiting threads, which
+    // will always fit in an int for all platforms regardless of our
+    // high-level count.
+    typedef std::make_signed<std::size_t>::type ssize_t;
+
 private:
-    std::atomic<int> m_count;
+    std::atomic<ssize_t> m_count;
     Semaphore m_sema;
 
     void waitWithPartialSpinning()
     {
-        int oldCount;
+        ssize_t oldCount;
         // Is there a better way to set the initial spin count?
         // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC,
         // as threads start hitting the kernel semaphore.
@@ -178,7 +199,7 @@ class LightweightSemaphore
         while (spin--)
         {
             oldCount = m_count.load(std::memory_order_relaxed);
-            if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire))
+            if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
                 return;
             std::atomic_signal_fence(std::memory_order_acquire);     // Prevent the compiler from collapsing the loop.
         }
@@ -189,16 +210,45 @@
         }
     }
 
+    ssize_t waitManyWithPartialSpinning(ssize_t max)
+    {
+        assert(max > 0);
+        ssize_t oldCount;
+        int spin = 10000;
+        while (spin--)
+        {
+            oldCount = m_count.load(std::memory_order_relaxed);
+            if (oldCount > 0)
+            {
+                ssize_t newCount = oldCount > max ? oldCount - max : 0;
+                if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
+                    return oldCount - newCount;
+            }
+            std::atomic_signal_fence(std::memory_order_acquire);
+        }
+        oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
+        if (oldCount <= 0)
+            m_sema.wait();
+        if (max > 1)
+            return 1 + tryWaitMany(max - 1);
+        return 1;
+    }
+
 public:
-    LightweightSemaphore(int initialCount = 0) : m_count(initialCount)
+    LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount)
     {
         assert(initialCount >= 0);
     }
 
     bool tryWait()
     {
-        int oldCount = m_count.load(std::memory_order_relaxed);
-        return (oldCount > 0 && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire));
+        ssize_t oldCount = m_count.load(std::memory_order_relaxed);
+        while (oldCount > 0)
+        {
+            if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
+                return true;
+        }
+        return false;
     }
 
     void wait()
@@ -206,16 +256,47 @@
         if (!tryWait())
             waitWithPartialSpinning();
     }
+
+    // Acquires between 0 and (greedily) max, inclusive
+    ssize_t tryWaitMany(ssize_t max)
+    {
+        assert(max >= 0);
+        ssize_t oldCount = m_count.load(std::memory_order_relaxed);
+        while (oldCount > 0)
+        {
+            ssize_t newCount = oldCount > max ? oldCount - max : 0;
+            if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
+                return oldCount - newCount;
+        }
+        return 0;
+    }
+
+    // Acquires at least one, and (greedily) at most max
+    ssize_t waitMany(ssize_t max)
+    {
+        assert(max >= 0);
+        ssize_t result = tryWaitMany(max);
+        if (result == 0 && max > 0)
+            result = waitManyWithPartialSpinning(max);
+        return result;
+    }
 
-    void signal(int count = 1)
+    void signal(ssize_t count = 1)
     {
-        int oldCount = m_count.fetch_add(count, std::memory_order_release);
-        int toRelease = -oldCount < count ? -oldCount : count;
+        assert(count >= 0);
+        ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release);
+        ssize_t toRelease = -oldCount < count ? -oldCount : count;
         if (toRelease > 0)
         {
-            m_sema.signal(toRelease);
+            m_sema.signal((int)toRelease);
         }
     }
+
+    ssize_t availableApprox() const
+    {
+        ssize_t count = m_count.load(std::memory_order_relaxed);
+        return count > 0 ? count : 0;
+    }
 };
 
 
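Note for reviewers: below is a minimal usage sketch of the batch API this patch adds to LightweightSemaphore (waitMany(), tryWaitMany(), availableApprox()). It is illustrative only and not part of the change; it assumes the patch is applied and that sema.h is reachable on the include path, and the ITEMS/BATCH_MAX constants and thread bodies are made up for the example.

// Hypothetical example, not part of the patch: one producer signals the
// semaphore once per item, one consumer drains counts in batches with the
// new waitMany(). Adjust the include path ("sema.h") to your layout.
#include <cstdio>
#include <thread>
#include "sema.h"

int main()
{
    LightweightSemaphore itemsReady;                       // count starts at 0
    const int ITEMS = 100;                                 // made-up workload size
    const LightweightSemaphore::ssize_t BATCH_MAX = 32;    // made-up batch limit

    std::thread producer([&] {
        for (int i = 0; i < ITEMS; i++)
            itemsReady.signal();                           // one count per "item"
    });

    std::thread consumer([&] {
        int consumed = 0;
        while (consumed < ITEMS)
        {
            // Blocks until at least one count is available, then greedily
            // takes up to BATCH_MAX; returns how many counts were acquired.
            LightweightSemaphore::ssize_t n = itemsReady.waitMany(BATCH_MAX);
            consumed += (int)n;
            std::printf("acquired a batch of %d\n", (int)n);
        }
    });

    producer.join();
    consumer.join();
    // Approximate leftover count; 0 here since every signal was consumed.
    std::printf("leftover (approx): %d\n", (int)itemsReady.availableApprox());
    return 0;
}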