Skip to content

Commit dfd4f5f

Browse files
committed
Add COMPLEX_RETSTYLE_FNDA for Windows x64
Windows x64 automatically forces return values onto the stack if they are larger than 64 bits wide [0]. This causes return values from e.g. `zdotc` to be returned through a hidden first argument, but not the return values from e.g. `cdotc`. To address this, we add a new complex return style, "Float Normal, Double Argument" (FNDA), to specify that `complex float`-returning functions use the normal return style, whereas `complex double`-returning functions use the argument return style. This should fix JuliaLinearAlgebra/BLISBLAS.jl#15. [0]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
1 parent 07c3509 commit dfd4f5f

8 files changed

+151
-51
lines changed

src/autodetection.c

Lines changed: 64 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,9 @@ int32_t autodetect_complex_return_style(void * handle, const char * suffix) {
214214
if (env_lowercase_match("LBT_FORCE_RETSTYLE", "argument")) {
215215
return LBT_COMPLEX_RETSTYLE_ARGUMENT;
216216
}
217+
if (env_lowercase_match("LBT_FORCE_RETSTYLE", "fnda")) {
218+
return LBT_COMPLEX_RETSTYLE_FNDA;
219+
}
217220
char symbol_name[MAX_SYMBOL_LEN];
218221

219222
build_symbol_name(symbol_name, "zdotc_", suffix);
@@ -222,37 +225,84 @@ int32_t autodetect_complex_return_style(void * handle, const char * suffix) {
222225
return LBT_COMPLEX_RETSTYLE_UNKNOWN;
223226
}
224227

228+
build_symbol_name(symbol_name, "cdotc_", suffix);
229+
void * cdotc_addr = lookup_symbol(handle, symbol_name);
230+
if (cdotc_addr == NULL) {
231+
return LBT_COMPLEX_RETSTYLE_UNKNOWN;
232+
}
233+
225234
// Typecast to function pointer for easier usage below
226235
double complex (*zdotc_normal)( int64_t *, double complex *, int64_t *, double complex *, int64_t *) = zdotc_addr;
227236
void (*zdotc_retarg)(double complex *, int64_t *, double complex *, int64_t *, double complex *, int64_t *) = zdotc_addr;
228237

238+
// Typecast to function pointer for easier usage below
239+
float complex (*cdotc_normal)( int64_t *, float complex *, int64_t *, float complex *, int64_t *) = cdotc_addr;
240+
void (*cdotc_retarg)(float complex *, int64_t *, float complex *, int64_t *, float complex *, int64_t *) = cdotc_addr;
241+
229242
/*
230243
* First, check to see if `zdotc` zeros out the first argument if all arguments are zero.
231244
* Supposedly, most well-behaved implementations will return `0 + 0*I` if the length of
232245
* the inputs is zero; so if it is using a "return argument", that's a good way to find out.
233246
*
234-
* We detect this by setting `retval` to an initial value of `0.0 + 1.0*I`. This has the
235-
* added benefit of being interpretable as `0` if looked at as an `int{32,64}_t *`, which
236-
* makes this invocation safe across the full normal-return/argument-return vs. lp64/ilp64
237-
* compatibility square.
247+
* We detect this by setting the return slot to a sentinel before each call: `0.0 + 1.0*I`
248+
* for `zdotc`, and `-1` typecast to a complex value for `cdotc`. The floating-point values
249+
* are unimportant as they will be written to, but if the sentinel is interpreted as an
250+
* `int{32,64}_t`, it will be zero or negative (which should end the routine immediately).
251+
* This makes these invocations safe across the full normal/argument, lp64/ilp64, cdotc/zdotc compatibility cube.
238252
*/
239-
double complex retval = 0.0 + 1.0*I;
253+
double complex retval_double = 0.0 + 1.0*I;
240254
int64_t zero = 0;
241-
double complex zeroc = 0.0 + 0.0*I;
242-
zdotc_retarg(&retval, &zero, &zeroc, &zero, &zeroc, &zero);
255+
double complex zeroc_double = 0.0 + 0.0*I;
256+
zdotc_retarg(&retval_double, &zero, &zeroc_double, &zero, &zeroc_double, &zero);
243257

244-
if (creal(retval) == 0.0 && cimag(retval) == 0.0) {
245-
return LBT_COMPLEX_RETSTYLE_ARGUMENT;
258+
/*
259+
* Next, do the same with `cdotc`, in order to detect situations where the ABI is
260+
* automatically inserting an extra argument to return 128-bit-wide values.
261+
* We call this `FNDA` for "Float Normal, Double Argument" style.
262+
*/
263+
int64_t neg1 = -1;
264+
float complex retval_float = *(complex float *)(&neg1);
265+
float complex zeroc_float = 0.0f + 0.0f*I;
266+
cdotc_retarg(&retval_float, &zero, &zeroc_float, &zero, &zeroc_float, &zero);
267+
268+
if (creal(retval_double) == 0.0 && cimag(retval_double) == 0.0) {
269+
// If the double values were reset, and the float values were also,
270+
// this is easy, we're just always argument-style:
271+
if (creal(retval_float) == 0.0f && cimag(retval_float) == 0.0f) {
272+
return LBT_COMPLEX_RETSTYLE_ARGUMENT;
273+
}
274+
275+
// If the float values were not, let's try the normal return style:
276+
retval_float = 0.0f + 1.0f*I;
277+
retval_float = cdotc_normal(&zero, &zeroc_float, &zero, &zeroc_float, &zero);
278+
279+
280+
// If this works, we are in FNDA style (currently only observed on Windows x64)
281+
if (creal(retval_float) == 0.0f && cimag(retval_float) == 0.0f) {
282+
return LBT_COMPLEX_RETSTYLE_FNDA;
283+
}
284+
285+
// Otherwise, cdotc is throwing a fit and we don't know what's up.
286+
return LBT_COMPLEX_RETSTYLE_UNKNOWN;
246287
}
247288

248-
// If it was _not_ reset, let's hazard a guess that we're dealing with a normal return style:
249-
retval = 0.0 + 1.0*I;
250-
retval = zdotc_normal(&zero, &zeroc, &zero, &zeroc, &zero);
251-
if (creal(retval) == 0.0 && cimag(retval) == 0.0) {
289+
// If our double values were _not_ reset, let's hazard a guess that
290+
// we're dealing with a normal return style and test both types again:
291+
retval_double = 0.0 + 1.0*I;
292+
retval_double = zdotc_normal(&zero, &zeroc_double, &zero, &zeroc_double, &zero);
293+
retval_float = 0.0f + 1.0f*I;
294+
retval_float = cdotc_normal(&zero, &zeroc_float, &zero, &zeroc_float, &zero);
295+
296+
297+
// We only test for both working; we don't have a retstyle for float
298+
// being argument style and double being normal style.
299+
if ((creal(retval_double) == 0.0 && cimag(retval_double) == 0.0) &&
300+
(creal(retval_float) == 0.0f && cimag(retval_float) == 0.0f)) {
252301
return LBT_COMPLEX_RETSTYLE_NORMAL;
253302
}
254303

255-
// If that was not reset either, we have no idea what's going on.
304+
// If we get here, zdotc and cdotc are being uncooperative and we
305+
// do not appreciate it at all, no we don't, my precious.
256306
return LBT_COMPLEX_RETSTYLE_UNKNOWN;
257307
}
258308
#endif // COMPLEX_RETSTYLE_AUTODETECTION

src/cblas_adapters.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ void lbt_cblas_cdotc_sub(const int32_t N,
7373
}
7474

7575
extern float complex cdotc_64_(const int64_t *,
76-
const float complex *, const int64_t *,
77-
const float complex *, const int64_t *);
76+
const float complex *, const int64_t *,
77+
const float complex *, const int64_t *);
7878
void lbt_cblas_cdotc_sub64_(const int64_t N,
7979
const float complex *X, const int64_t incX,
8080
const float complex *Y, const int64_t incY,

src/complex_return_style_adapters.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ extern void (*cmplxret_cdotc__addr)(float complex * z,
7272
const float complex *, const int32_t *,
7373
const float complex *, const int32_t *);
7474
float complex cmplxret_cdotc_(const int32_t * N,
75-
const float complex *X, const int32_t * incX,
76-
const float complex *Y, const int32_t * incY)
75+
const float complex *X, const int32_t * incX,
76+
const float complex *Y, const int32_t * incY)
7777
{
7878
float complex c;
7979
cmplxret_cdotc__addr(&c, N, X, incX, Y, incY);
@@ -85,8 +85,8 @@ extern void (*cmplxret_cdotc_64__addr)(float complex * z,
8585
const float complex *, const int64_t *,
8686
const float complex *, const int64_t *);
8787
float complex cmplxret_cdotc_64_(const int64_t * N,
88-
const float complex *X, const int64_t * incX,
89-
const float complex *Y, const int64_t * incY)
88+
const float complex *X, const int64_t * incX,
89+
const float complex *Y, const int64_t * incY)
9090
{
9191
float complex c;
9292
cmplxret_cdotc_64__addr(&c, N, X, incX, Y, incY);
@@ -100,8 +100,8 @@ extern void (*cmplxret_cdotu__addr)(float complex * z,
100100
const float complex *, const int32_t *,
101101
const float complex *, const int32_t *);
102102
float complex cmplxret_cdotu_(const int32_t * N,
103-
const float complex *X, const int32_t * incX,
104-
const float complex *Y, const int32_t * incY)
103+
const float complex *X, const int32_t * incX,
104+
const float complex *Y, const int32_t * incY)
105105
{
106106
float complex c;
107107
cmplxret_cdotu__addr(&c, N, X, incX, Y, incY);
@@ -113,8 +113,8 @@ extern void (*cmplxret_cdotu_64__addr)(float complex * z,
113113
const float complex *, const int64_t *,
114114
const float complex *, const int64_t *);
115115
float complex cmplxret_cdotu_64_(const int64_t * N,
116-
const float complex *X, const int64_t * incX,
117-
const float complex *Y, const int64_t * incY)
116+
const float complex *X, const int64_t * incX,
117+
const float complex *Y, const int64_t * incY)
118118
{
119119
float complex c;
120120
cmplxret_cdotu_64__addr(&c, N, X, incX, Y, incY);

src/libblastrampoline.c

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -70,26 +70,29 @@ int32_t set_forward_by_index(int32_t symbol_idx, const void * addr, int32_t inte
7070
}
7171

7272
#ifdef COMPLEX_RETSTYLE_AUTODETECTION
73-
if (complex_retstyle == LBT_COMPLEX_RETSTYLE_ARGUMENT) {
74-
// Check to see if this symbol is one of the complex-returning functions
75-
for (int complex_symbol_idx=0; cmplxret_func_idxs[complex_symbol_idx] != -1; ++complex_symbol_idx) {
76-
// Skip any symbols that aren't ours
77-
if (cmplxret_func_idxs[complex_symbol_idx] != symbol_idx)
78-
continue;
79-
80-
// Report to the user that we're cblas-wrapping this one
81-
if (verbose) {
82-
char exported_name[MAX_SYMBOL_LEN];
83-
build_symbol_name(exported_name, exported_func_names[symbol_idx], interface == LBT_INTERFACE_ILP64 ? "64_" : "");
84-
printf(" - [%04d] complex(%s)\n", symbol_idx, exported_name);
85-
}
73+
for (int array_idx=0; array_idx < sizeof(cmplxret_func_idxs)/sizeof(int *); ++array_idx) {
74+
if ((complex_retstyle == LBT_COMPLEX_RETSTYLE_ARGUMENT) ||
75+
((complex_retstyle == LBT_COMPLEX_RETSTYLE_FNDA) && array_idx == 1)) {
76+
// Check to see if this symbol is one of the complex-returning functions
77+
for (int complex_symbol_idx=0; cmplxret_func_idxs[array_idx][complex_symbol_idx] != -1; ++complex_symbol_idx) {
78+
// Skip any symbols that aren't ours
79+
if (cmplxret_func_idxs[array_idx][complex_symbol_idx] != symbol_idx)
80+
continue;
81+
82+
// Report to the user that we're cmplxret-wrapping this one
83+
if (verbose) {
84+
char exported_name[MAX_SYMBOL_LEN];
85+
build_symbol_name(exported_name, exported_func_names[symbol_idx], interface == LBT_INTERFACE_ILP64 ? "64_" : "");
86+
printf(" - [%04d] complex(%s)\n", symbol_idx, exported_name);
87+
}
8688

87-
if (interface == LBT_INTERFACE_LP64) {
88-
(*cmplxret_func32_addrs[complex_symbol_idx]) = (*exported_func32_addrs[symbol_idx]);
89-
(*exported_func32_addrs[symbol_idx]) = cmplxret32_func_wrappers[complex_symbol_idx];
90-
} else {
91-
(*cmplxret_func64_addrs[complex_symbol_idx]) = (*exported_func64_addrs[symbol_idx]);
92-
(*exported_func64_addrs[symbol_idx]) = cmplxret64_func_wrappers[complex_symbol_idx];
89+
if (interface == LBT_INTERFACE_LP64) {
90+
(*cmplxret_func32_addrs[array_idx][complex_symbol_idx]) = (*exported_func32_addrs[symbol_idx]);
91+
(*exported_func32_addrs[symbol_idx]) = cmplxret_func32_wrappers[array_idx][complex_symbol_idx];
92+
} else {
93+
(*cmplxret_func64_addrs[array_idx][complex_symbol_idx]) = (*exported_func64_addrs[symbol_idx]);
94+
(*exported_func64_addrs[symbol_idx]) = cmplxret_func64_wrappers[array_idx][complex_symbol_idx];
95+
}
9396
}
9497
}
9598
}

src/libblastrampoline.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,13 @@ typedef struct {
8585
// Possible values for `retstyle` in `lbt_library_info_t`
8686
// These describe whether a library is using "normal" return value passing (e.g. through
8787
// the `XMM{0,1}` registers on x86_64, or the `ST{0,1}` floating-point registers on i686)
88+
// This is further complicated by the fact that on certain platforms (such as Windows x64)
89+
// this is dependent on the size of the value being returned, e.g. a complex64 value will
90+
// be returned through registers, but a complex128 value will not. We therefore have a
91+
// special value that denotes this situation.
8892
#define LBT_COMPLEX_RETSTYLE_NORMAL 0
8993
#define LBT_COMPLEX_RETSTYLE_ARGUMENT 1
94+
#define LBT_COMPLEX_RETSTYLE_FNDA 2 // "Float Normal, Double Argument"
9095
#define LBT_COMPLEX_RETSTYLE_UNKNOWN -1
9196

9297
// Possible values for `cblas` in `lbt_library_info_t`

src/libblastrampoline_complex_retdata.h

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,35 @@ COMPLEX128_FUNCS(XX_64)
1111
// Build mapping from cmplxret-index to `_addr` instance
1212
#define XX(name, index) &cmplxret_##name##_addr,
1313
#define XX_64(name, index) &cmplxret_##name##64__addr,
14-
const void ** cmplxret_func32_addrs[] = {
14+
const void ** cmplx64ret_func32_addrs[] = {
1515
COMPLEX64_FUNCS(XX)
16+
NULL
17+
};
18+
const void ** cmplx128ret_func32_addrs[] = {
1619
COMPLEX128_FUNCS(XX)
1720
NULL
1821
};
19-
const void ** cmplxret_func64_addrs[] = {
22+
const void ** cmplx64ret_func64_addrs[] = {
2023
COMPLEX64_FUNCS(XX_64)
24+
NULL
25+
};
26+
const void ** cmplx128ret_func64_addrs[] = {
2127
COMPLEX128_FUNCS(XX_64)
2228
NULL
2329
};
2430
#undef XX
2531
#undef XX_64
2632

33+
const void *** cmplxret_func32_addrs[] = {
34+
cmplx64ret_func32_addrs,
35+
cmplx128ret_func32_addrs
36+
};
37+
const void *** cmplxret_func64_addrs[] = {
38+
cmplx64ret_func64_addrs,
39+
cmplx128ret_func64_addrs
40+
};
41+
42+
2743

2844
// Forward-declare some functions
2945
#define XX(name, index) extern const void * cmplxret_##name ;
@@ -40,24 +56,49 @@ COMPLEX128_FUNCS(XX_64)
4056
// locations, allowing a cblas index -> function lookup
4157
#define XX(name, index) &cmplxret_##name,
4258
#define XX_64(name, index) &cmplxret_##name##64_,
43-
const void ** cmplxret32_func_wrappers[] = {
59+
const void ** cmplx64ret_func32_wrappers[] = {
4460
COMPLEX64_FUNCS(XX)
61+
NULL
62+
};
63+
const void ** cmplx128ret_func32_wrappers[] = {
4564
COMPLEX128_FUNCS(XX)
4665
NULL
4766
};
48-
const void ** cmplxret64_func_wrappers[] = {
67+
const void ** cmplx64ret_func64_wrappers[] = {
4968
COMPLEX64_FUNCS(XX_64)
69+
NULL
70+
};
71+
const void ** cmplx128ret_func64_wrappers[] = {
5072
COMPLEX128_FUNCS(XX_64)
5173
NULL
5274
};
5375
#undef XX
5476
#undef XX_64
5577

56-
// Finally, an array that maps cblas index -> exported symbol index
78+
const void *** cmplxret_func32_wrappers[] = {
79+
cmplx64ret_func32_wrappers,
80+
cmplx128ret_func32_wrappers
81+
};
82+
const void *** cmplxret_func64_wrappers[] = {
83+
cmplx64ret_func64_wrappers,
84+
cmplx128ret_func64_wrappers
85+
};
86+
87+
88+
89+
// Finally, an array that maps cmplxret index -> exported symbol index
5790
#define XX(name, index) index,
58-
const int cmplxret_func_idxs[] = {
91+
const int cmplx64ret_func_idxs[] = {
5992
COMPLEX64_FUNCS(XX)
93+
-1
94+
};
95+
const int cmplx128ret_func_idxs[] = {
6096
COMPLEX128_FUNCS(XX)
6197
-1
6298
};
63-
#undef XX
99+
#undef XX
100+
101+
const int * cmplxret_func_idxs[] = {
102+
cmplx64ret_func_idxs,
103+
cmplx128ret_func_idxs
104+
};

test/direct.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ lbt_handle = dlopen("$(lbt_prefix)/$(binlib)/lib$(lbt_link_name).$(shlib_ext)",
7171
@test libs[1].f2c == LBT_F2C_PLAIN
7272
if Sys.ARCH (:x86_64, :aarch64)
7373
if Sys.iswindows()
74-
@test libs[1].complex_retstyle == LBT_COMPLEX_RETSTYLE_ARGUMENT
74+
@test libs[1].complex_retstyle == LBT_COMPLEX_RETSTYLE_FNDA
7575
else
7676
@test libs[1].complex_retstyle == LBT_COMPLEX_RETSTYLE_NORMAL
7777
end

test/utils.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ const LBT_INTERFACE_ILP64 = 64
147147
const LBT_F2C_PLAIN = 0
148148
const LBT_COMPLEX_RETSTYLE_NORMAL = 0
149149
const LBT_COMPLEX_RETSTYLE_ARGUMENT = 1
150+
const LBT_COMPLEX_RETSTYLE_FNDA = 2
150151
const LBT_COMPLEX_RETSTYLE_UNKNOWN = -1
151152
const LBT_CBLAS_CONFORMANT = 0
152153
const LBT_CBLAS_DIVERGENT = 1

0 commit comments

Comments
 (0)