Closed
Changes from 5 of 67 commits
0f4caa1
[flamingo] Update preproc imports (#5160)
lucylq Sep 9, 2024
2dee34e
Refactor namespace usage in module tests.
shoumikhin Sep 9, 2024
647bfd4
Add an overload to skip dtype and sizes.
shoumikhin Sep 9, 2024
b52d4b6
Enable Llama3 Multi-turn conversation
cmodi-meta Sep 9, 2024
cd9d536
Make convert to linear an export pass
mcr229 Sep 9, 2024
b69ae0c
Hide and simplify operator registry internals
dbort Sep 9, 2024
6b1e328
[ExecuTorch] Support BFloat16 in CPUBlas gemm
swolchok Sep 9, 2024
aee0d82
RFC: quantized fast hadamard transform
swolchok Sep 9, 2024
eca9ed5
q to s start ops | add dim order sanity check
Gasoonjia Sep 9, 2024
85410e4
Qualcomm AI Engine Direct - Optimization and fix mutable buffer issue…
shewu-quic Sep 9, 2024
58a9f17
Update base for Update on "RFC: quantized fast hadamard transform"
swolchok Sep 9, 2024
449860f
Update on "RFC: quantized fast hadamard transform"
swolchok Sep 9, 2024
d2014e3
Add a target rule for ops_registrations (#5083)
LeeOHzzZ Sep 9, 2024
b23ee01
Register LLM prefill native method in JNI
kirklandsign Sep 9, 2024
28beeff
Clean up devtools/etdump
dbort Sep 9, 2024
90eaa1f
Update base for Update on "RFC: quantized fast hadamard transform"
swolchok Sep 9, 2024
e7fffa4
Update on "RFC: quantized fast hadamard transform"
swolchok Sep 9, 2024
6ce9f52
t to z start ops | add dim order sanity check
Gasoonjia Sep 9, 2024
542ecb5
Add Echo parameter to multimodal runner (llava) and jni layer (#5181)
cmodi-meta Sep 9, 2024
59d9bad
Use c++17 for size test
lucylq Sep 9, 2024
7650667
Add a default delegate time scale converter
Olivia-liu Sep 10, 2024
f412630
Qualcomm AI Engine Direct - Uplevel QNN version for ci test (#5174)
shewu-quic Sep 10, 2024
c5a385e
Update schema to include infinity for double values
lucylq Sep 10, 2024
f471556
Partition Mutable Buffer as Core ML State (#5165)
YifanShenSZ Sep 10, 2024
67ae762
Qualcomm AI Engine Direct - Add the argument to specify soc model (#5…
shewu-quic Sep 10, 2024
63e794a
Add pass to convert special case of mean.dim to averagepool2d
per Sep 10, 2024
370f304
Add slice_scatter test: large end value
manuelcandales Sep 10, 2024
083b9e6
[ET-VK] Fix gpuinfo CI
junpi3 Sep 10, 2024
1eeded1
Let the app check "aatp/data" subdir for AWS.
shoumikhin Sep 10, 2024
126abb5
Update the API of registering fake kernels to new standard (#5084)
LeeOHzzZ Sep 10, 2024
657789e
Qualcomm AI Engine Direct - Apply spin quant R1 and R2 (#5175)
shewu-quic Sep 10, 2024
549f14b
Restore constant segment
lucylq Sep 10, 2024
e826de3
Add Half/BFloat16 tests for op_mul
manuelcandales Sep 10, 2024
43e2f2d
Qualcomm AI Engine Direct - support skip quantization (#5070)
haowhsu-quic Sep 10, 2024
30acae5
Switch over backend tests to export_for_training
tarun292 Sep 10, 2024
db34239
[LLava] Fix stats for C++ runner
digantdesai Sep 10, 2024
02304d7
Update bundled_program to use new namespace
dbort Sep 10, 2024
c76b22f
Qualcomm AI Engine Direct - Fixed the order of the transforms for lla…
shewu-quic Sep 10, 2024
d38ca81
Android refactor cmake build
kirklandsign Sep 10, 2024
a4d67e2
Android: Leverage prefillPrompt and prefillImage on Llava
Riandy Sep 10, 2024
b54206d
Update the minimum C++ version to C++17
dbort Sep 10, 2024
4ce0f9d
Introduce PlatformMemoryAllocator
manuelcandales Sep 10, 2024
2b50c76
Use dynamic bound by default.
shoumikhin Sep 10, 2024
ced40f4
Fix models in benchinfra (#5226)
guangy10 Sep 10, 2024
e245590
App side change
kirklandsign Sep 10, 2024
4cce620
Minor fix: Create root dir when it doesn't exist. (#5075)
freddan80 Sep 10, 2024
ab6d91c
Fix internal executorch_llama_jni
kirklandsign Sep 10, 2024
f07e4d5
Update setup-with-qnn.sh with runner util flag (#5210)
WuhanMonkey Sep 10, 2024
cac2c05
[ET-VK] Integrate axis mapping into optimized matrix multiplication s…
SS-JIA Sep 10, 2024
cba5bee
fbshipit-source-id: f63634ba171da01328849d84552b125b829403e8
facebook-github-bot Sep 11, 2024
ca889fb
Minibench use model_dir instead (#5250)
kirklandsign Sep 11, 2024
e4d72ce
Update setup.sh for LlamaDemo (#5235)
kirklandsign Sep 11, 2024
d423131
Android app UI/flow improvements (#5241)
Riandy Sep 11, 2024
7942d2c
Allow core aten op exception list (#5237)
larryliu0820 Sep 11, 2024
69aed24
link whole quantized_ops_lib (#5253)
kirklandsign Sep 11, 2024
41bc1ce
spinquant in eager mode (#5125)
Sep 11, 2024
d7a7ec6
Updated the workflow to upload models to S3 (#5232)
Sep 11, 2024
7e374d7
Add model execution scripts and runner (#5217)
neuropilot-captain Sep 11, 2024
af80804
Debug event populates event name (#5142)
Olivia-liu Sep 11, 2024
68397af
Optimized op_mm using CPUBlas gemm (#5242)
swolchok Sep 11, 2024
d73a653
Add optimized op_linear (#5243)
swolchok Sep 11, 2024
3171ede
Add scalar tensor tests. (#5260)
shoumikhin Sep 11, 2024
4da3c5d
Add CoreML Quantize (#5228)
Sep 11, 2024
d6b800b
Add helper function to create empty, full, ones and zeros tensors. (#…
shoumikhin Sep 11, 2024
75a56a2
Add helper function to create random tensors. (#5266)
shoumikhin Sep 11, 2024
315abf8
Update base for Update on "RFC: quantized fast hadamard transform"
swolchok Sep 11, 2024
dd0db34
Update on "RFC: quantized fast hadamard transform"
swolchok Sep 11, 2024
87 changes: 82 additions & 5 deletions extension/llm/custom_ops/spinquant/fast_hadamard_transform.h
@@ -1,9 +1,18 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

// (c) Meta Platforms, Inc. and affiliates.
#pragma once

#include <cassert>
#include <cmath>
#include <cstdint>
#include <memory>

#include "fast_hadamard_transform_special.h"

@@ -24,9 +33,7 @@ T fast_sqrt_of_power_of_2(int log2_n) {
}
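The hunk above reformats `fast_sqrt_of_power_of_2`, which computes sqrt(2^n) as 2^(n/2) times an extra factor of sqrt(2) when n is odd. A minimal Python sketch of that identity (illustrative only, not the shipped C++):

```python
import math

def fast_sqrt_of_power_of_2(log2_n: int) -> float:
    # sqrt(2**n) == 2**(n // 2) when n is even; an odd n leaves one
    # residual factor of sqrt(2).
    result = float(1 << (log2_n // 2))
    if log2_n % 2 != 0:
        result *= math.sqrt(2.0)
    return result

assert fast_sqrt_of_power_of_2(4) == 4.0  # sqrt(16)
assert abs(fast_sqrt_of_power_of_2(5) - math.sqrt(32.0)) < 1e-12
```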

template <typename T>
void normalize_after_fht(
T* out,
int log2_vec_size) {
void normalize_after_fht(T* out, int log2_vec_size) {
const T inv_sqrt = T(1) / fast_sqrt_of_power_of_2<T>(log2_vec_size);
const int vec_size = 1 << log2_vec_size;
for (int ii = 0; ii < vec_size; ++ii) {
@@ -35,7 +42,7 @@ void normalize_after_fht(
}

template <typename T>
void fast_hadamard_transform_simple_impl(
void fast_hadamard_transform_unnormalized_simple_impl(
T* vec,
int log2_vec_size) {
if (log2_vec_size == 0) {
@@ -55,7 +62,11 @@ void fast_hadamard_transform_simple_impl(
}
step *= 2;
}
}

template <typename T>
void fast_hadamard_transform_simple_impl(T* vec, int log2_vec_size) {
fast_hadamard_transform_unnormalized_simple_impl(vec, log2_vec_size);
normalize_after_fht(vec, log2_vec_size);
}

@@ -66,7 +77,73 @@ void fast_hadamard_transform_simple_impl(
// of vec, which must be of length (1 << log2_vec_size).
template <typename T>
void fast_hadamard_transform(T* vec, int log2_vec_size) {
internal::fast_hadamard_transform_simple_impl(vec, log2_vec_size);
internal::fast_hadamard_transform_simple_impl(vec, log2_vec_size);
}
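The renamed `fast_hadamard_transform_unnormalized_simple_impl` performs log2_vec_size rounds of add/subtract butterflies with doubling stride; the `_simple_impl` wrapper then rescales by 1/sqrt(N). A rough Python model of the unnormalized pass (a sketch under the assumption that the C++ loop structure matches the visible hunk):

```python
def fht_unnormalized(vec, log2_vec_size):
    # In-place fast Walsh-Hadamard transform: log2_vec_size rounds of
    # (a, b) -> (a + b, a - b) butterflies, doubling the stride each round.
    step = 1
    for _ in range(log2_vec_size):
        for start in range(0, len(vec), step * 2):
            for i in range(start, start + step):
                x, y = vec[i], vec[i + step]
                vec[i], vec[i + step] = x + y, x - y
        step *= 2

v = [1.0, 0.0, 0.0, 0.0]
fht_unnormalized(v, 2)
assert v == [1.0, 1.0, 1.0, 1.0]  # impulse -> all-ones before 1/sqrt(N)
```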

// Compute a quantized fast Walsh-Hadamard transform of vec, which
// must be of length (1 << log2_vec_size) and symmetrically quantized.
//
// Note that we do not need to know the quantization scale, because
// the Fast Hadamard transform is a series of additions and
// subtractions with a final multiplication step, and we have the
// following trivial identities:
//
// scale * a + scale * b = scale * (a + b) (addition doesn't need the scale)
// alpha * (scale * a) = scale * (alpha * a) (multiplication doesn't need the
// scale)
void fast_hadamard_transform_symmetric_quantized_s16(
int16_t* vec,
int log2_vec_size) {
if (log2_vec_size == 0) {
return;
}

const int vec_size = 1 << log2_vec_size;
// We perform log2_vec_size rounds where each round's maximum output
// is at most double the maximum input, so we can at most multiply
// the maximum input by vec_size. Performing intermediate arithmetic
// in 32-bit precision should prevent overflow, since 16 +
// log2_vec_size should be much less than 32.
auto tmp = std::make_unique<int32_t[]>(vec_size);
std::copy(vec, vec + vec_size, tmp.get());

// Per the function-level comment above, we can ignore the
// quantization scale, so we just delegate to the usual unnormalized
// implementation.
// NOTE: if we need this to be fast on CPU, we can use FFHT to
// generate fht_uint32 similar to fht_float.
internal::fast_hadamard_transform_unnormalized_simple_impl(
tmp.get(), log2_vec_size);

  // Normalization step: divide by sqrt(1 << log2_vec_size). Similar
  // to fast_sqrt above, if log2_vec_size is even, then the
  // maximum-precision way to do this is right-shift by
  // log2_vec_size / 2. If log2_vec_size is odd, we still do the
  // right-shift, and then we have an extra division by sqrt(2) that
  // we perform by making use of a sufficiently accurate rational
  // approximation. Our initial idea was to divide by sqrt(2) by
  // adjusting the quantization scale, but that would cause this
  // function to tend to increase the magnitude of the elements of
  // vec, which would result in clipping and therefore accuracy
  // loss, especially compounded over 30+ transformer layers.
const int log2_sqrt_vec_size = log2_vec_size / 2;
constexpr int32_t qmin = -(1 << 15) + 1;
constexpr int32_t qmax = -qmin;
if (log2_vec_size % 2 != 0) {
    // 408 / 577 - 1.0 / sqrt(2) ~= -1.06e-06, which should be close enough.
static const int32_t inv_sqrt_2_numerator = 408;
static const int32_t inv_sqrt_2_denominator = 577;
for (int ii = 0; ii < vec_size; ++ii) {
const auto val_over_sqrt_vec_size =
(tmp[ii] * inv_sqrt_2_numerator / inv_sqrt_2_denominator) >>
log2_sqrt_vec_size;
vec[ii] = std::clamp(val_over_sqrt_vec_size, qmin, qmax);
}
} else {
for (int ii = 0; ii < vec_size; ++ii) {
vec[ii] = std::clamp(tmp[ii] >> log2_sqrt_vec_size, qmin, qmax);
}
}
return;
}
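The normalization in `fast_hadamard_transform_symmetric_quantized_s16` replaces a floating-point divide by sqrt(N) with a right shift, plus an integer 408/577 multiply-divide for the residual 1/sqrt(2) when log2_vec_size is odd. A hedged Python sketch of that step (C's truncating division and arithmetic shift agree with Python's floor-based operators only for non-negative values, so this sketch assumes values >= 0):

```python
import math

QMIN, QMAX = -(1 << 15) + 1, (1 << 15) - 1

def normalize_quantized(tmp, log2_vec_size):
    # Divide by sqrt(2**log2_vec_size): right-shift by log2_vec_size // 2,
    # plus a 408/577 ~ 1/sqrt(2) factor when log2_vec_size is odd, then
    # clamp back into the symmetric int16 range.
    shift = log2_vec_size // 2
    out = []
    for x in tmp:
        if log2_vec_size % 2 != 0:
            x = (x * 408 // 577) >> shift
        else:
            x >>= shift
        out.append(max(QMIN, min(QMAX, x)))
    return out

assert abs(408 / 577 - 1 / math.sqrt(2)) < 2e-6  # ~1.06e-06 error
assert normalize_quantized([1024], 4) == [256]   # >> 2 for N = 16
```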

// Like fast_hadamard_transform, but vec must be of length 28 * (1 <<
@@ -1,5 +1,4 @@

// This file is auto-generated. See "special_hadamard_code_gen.py"
// @generated by special_hadamard_code_gen.py strided_cpu


#pragma once
59 changes: 40 additions & 19 deletions extension/llm/custom_ops/spinquant/special_hadamard_code_gen.py
Original file line number Diff line number Diff line change
@@ -176,12 +176,12 @@
had_strings = [had_12, had_20_will, had_28_will, had_40_tpal]

header = """
// This file is auto-generated. See "special_hadamard_code_gen.py"\n

#pragma once

"""


TEMPLATE = """
__device__ __forceinline__ void hadamard_mult_thread_{N}(float x[{N}]) {{
float out[{N}];
@@ -220,8 +220,13 @@

def string_to_array(string):
# Convert strings of + and - to bool arrays
string = string.strip().replace('+', '1').replace('-', '-1').split()
return np.stack([np.fromstring(" ".join(string[i]), dtype=np.int32, sep=' ') for i in range(len(string))])
string = string.strip().replace("+", "1").replace("-", "-1").split()
return np.stack(
[
np.fromstring(" ".join(string[i]), dtype=np.int32, sep=" ")
for i in range(len(string))
]
)
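The reformatted `string_to_array` turns rows of `+`/`-` characters into a matrix of ±1 entries. A pure-Python equivalent of the parse, without NumPy, as a sketch:

```python
def string_to_matrix(s: str):
    # Each whitespace-separated token is a row; '+' -> 1, '-' -> -1.
    return [[1 if ch == "+" else -1 for ch in row] for row in s.split()]

assert string_to_matrix("++\n+-") == [[1, 1], [1, -1]]  # 2x2 Hadamard
```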


def strided_load_code_gen(N):
@@ -233,28 +238,44 @@ def array_code_gen(arr, template):
assert arr.shape[0] == arr.shape[1]
out = []
for i in range(N):
out.append(f"out[{i}] = " + " ".join([f"{'+' if arr[i, j] == 1 else '-'} x[{j}]" for j in range(N)]) + ";")
return template.format(N=str(N), code='\n '.join(out), strided_load_code = strided_load_code_gen(N))


def main(template = TEMPLATE):
output_dir = Path(__file__).parent / "fast_hadamard_transform_special.h"
output_dir.write_text(header + ''.join(array_code_gen(string_to_array(s), template) for s in had_strings))
out.append(
f"out[{i}] = "
+ " ".join([f"{'+' if arr[i, j] == 1 else '-'} x[{j}]" for j in range(N)])
+ ";"
)
return template.format(
N=str(N), code="\n ".join(out), strided_load_code=strided_load_code_gen(N)
)


OPTION_TO_TEMPLATE = {
'cuda': TEMPLATE,
'cpu': CPU_TEMPLATE,
'strided_cpu': STRIDED_CPU_TEMPLATE,
"cuda": TEMPLATE,
"cpu": CPU_TEMPLATE,
"strided_cpu": STRIDED_CPU_TEMPLATE,
}


if __name__ == '__main__':
def main(option="cuda"):
try:
template = OPTION_TO_TEMPLATE[option]
except KeyError:
raise Exception(
f"bad target option {option}; options are {', '.join(OPTION_TO_TEMPLATE.keys())}"
)
output_dir = Path(__file__).parent / "fast_hadamard_transform_special.h"
generated_line = f"// @{'generated'} by special_hadamard_code_gen.py {option}\n"

output_dir.write_text(
generated_line
+ header
+ "".join(array_code_gen(string_to_array(s), template) for s in had_strings)
)


if __name__ == "__main__":
import sys
template = TEMPLATE

option = "cuda"
if len(sys.argv) > 1:
option = sys.argv[1]
if option not in OPTION_TO_TEMPLATE:
raise Exception(f"bad target option {option}; options are {', '.join(OPTION_TO_TEMPLATE.keys())}")
template = OPTION_TO_TEMPLATE[option]
main(template)
main(option)
3 changes: 2 additions & 1 deletion extension/llm/custom_ops/spinquant/targets.bzl
@@ -8,8 +8,9 @@ def define_common_targets():
"""
runtime.cxx_library(
name = "fast_hadamard_transform",
headers = [
exported_headers = [
"fast_hadamard_transform.h",
"fast_hadamard_transform_special.h",
],
visibility = ["@EXECUTORCH_CLIENTS"],
)