|
16 | 16 | #include <limits>
|
17 | 17 | #include <sstream>
|
18 | 18 | #include <string>
|
| 19 | +#include <math.h> |
19 | 20 |
|
20 | 21 | #define CHECK(ID, S, CMP) \
|
21 | 22 | { \
|
@@ -370,6 +371,141 @@ __device__ int shr() {
|
370 | 371 | return 0;
|
371 | 372 | }
|
372 | 373 |
|
| 374 | +template <typename T> |
| 375 | +__device__ T deg2rad(T val) { |
| 376 | + constexpr auto PI = 3.14159265358979323846f; |
| 377 | + return val * PI / 180.0f; |
| 378 | +} |
| 379 | + |
| 380 | +#define FLOAT_CMP(X, Y) ((X - Y) < 1e-4) |
| 381 | +#define POWF2(X) (pow(2.0f, X)) |
| 382 | + |
| 383 | +__device__ int asm_copysign() { |
| 384 | + float f32 = 0.0f; |
| 385 | + double f64 = 0.0; |
| 386 | + CHECK(1, asm("copysign.f32 %0, %1, %2;" : "=f"(f32) : "f"(-10.0f), "f"(100.0f)), FLOAT_CMP(f32, -100.0f)); |
| 387 | + CHECK(2, asm("copysign.f64 %0, %1, %2;" : "=d"(f64) : "d"(-10.0), "d"(100.0)), FLOAT_CMP(f64, -100.0)); |
| 388 | + return 0; |
| 389 | +} |
| 390 | + |
| 391 | +__device__ int asm_cos() { |
| 392 | + float f32 = 0.0f; |
| 393 | + CHECK(1, asm("cos.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(90.0f))), FLOAT_CMP(f32, 0.0f)); |
| 394 | + CHECK(2, asm("cos.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(0.0f))), FLOAT_CMP(f32, 1.0f)); |
| 395 | + CHECK(3, asm("cos.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(60.0f))), FLOAT_CMP(f32, 0.5f)); |
| 396 | + CHECK(4, asm("cos.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(180.0f))), FLOAT_CMP(f32, -1.0f)); |
| 397 | + CHECK(5, asm("cos.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(deg2rad(90.0f))), FLOAT_CMP(f32, 0.0f)); |
| 398 | + CHECK(6, asm("cos.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(deg2rad(0.0f))), FLOAT_CMP(f32, 1.0f)); |
| 399 | + CHECK(7, asm("cos.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(deg2rad(60.0f))), FLOAT_CMP(f32, 0.5f)); |
| 400 | + CHECK(8, asm("cos.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(deg2rad(180.0f))), FLOAT_CMP(f32, -1.0f)); |
| 401 | + return 0; |
| 402 | +} |
| 403 | + |
| 404 | +__device__ int asm_sin() { |
| 405 | + float f32 = 0.0f; |
| 406 | + CHECK(1, asm("sin.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(90.0f))), FLOAT_CMP(f32, 1.0f)); |
| 407 | + CHECK(2, asm("sin.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(0.0f))), FLOAT_CMP(f32, 0.0f)); |
| 408 | + CHECK(3, asm("sin.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(30.0f))), FLOAT_CMP(f32, 0.5f)); |
| 409 | + CHECK(4, asm("sin.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(180.0f))), FLOAT_CMP(f32, 0.0f)); |
| 410 | + CHECK(5, asm("sin.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(deg2rad(90.0f))), FLOAT_CMP(f32, 1.0f)); |
| 411 | + CHECK(6, asm("sin.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(deg2rad(0.0f))), FLOAT_CMP(f32, 0.0f)); |
| 412 | + CHECK(7, asm("sin.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(deg2rad(30.0f))), FLOAT_CMP(f32, 0.5f)); |
| 413 | + CHECK(8, asm("sin.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(deg2rad(180.0f))), FLOAT_CMP(f32, 0.0f)); |
| 414 | + return 0; |
| 415 | +} |
| 416 | + |
| 417 | +__device__ int asm_tanh() { |
| 418 | + float f32 = 0.0f; |
| 419 | + CHECK(1, asm("tanh.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(45.0f))), FLOAT_CMP(f32, tanh(deg2rad(45.0f)))); |
| 420 | + CHECK(2, asm("tanh.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(0.0f))), FLOAT_CMP(f32, tanh(deg2rad(0.0f)))); |
| 421 | + CHECK(3, asm("tanh.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(180.0f))), FLOAT_CMP(f32, tanh(deg2rad(180.0f)))); |
| 422 | + CHECK(4, asm("tanh.approx.f32 %0, %1;" : "=f"(f32) : "f"(deg2rad(90.0f))), FLOAT_CMP(f32, tanh(deg2rad(90.0f)))); |
| 423 | + return 0; |
| 424 | +} |
| 425 | + |
| 426 | +__device__ int asm_ex2() { |
| 427 | + float f32 = 0.0f; |
| 428 | + CHECK(1, asm("ex2.approx.f32 %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, POWF2(2.1f))); |
| 429 | + CHECK(2, asm("ex2.approx.f32 %0, %1;" : "=f"(f32) : "f"(3.4f)), FLOAT_CMP(f32, POWF2(3.4f))); |
| 430 | + CHECK(3, asm("ex2.approx.f32 %0, %1;" : "=f"(f32) : "f"(9.7f)), FLOAT_CMP(f32, POWF2(9.7f))); |
| 431 | + CHECK(4, asm("ex2.approx.f32 %0, %1;" : "=f"(f32) : "f"(6.4f)), FLOAT_CMP(f32, POWF2(6.4f))); |
| 432 | + CHECK(5, asm("ex2.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, POWF2(2.1f))); |
| 433 | + CHECK(6, asm("ex2.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(3.4f)), FLOAT_CMP(f32, POWF2(3.4f))); |
| 434 | + CHECK(7, asm("ex2.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(9.7f)), FLOAT_CMP(f32, POWF2(9.7f))); |
| 435 | + CHECK(8, asm("ex2.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(6.4f)), FLOAT_CMP(f32, POWF2(6.4f))); |
| 436 | + return 0; |
| 437 | +} |
| 438 | + |
| 439 | +__device__ int asm_lg2() { |
| 440 | + float f32 = 0.0f; |
| 441 | + CHECK(1, asm("lg2.approx.f32 %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, log2(2.1f))); |
| 442 | + CHECK(2, asm("lg2.approx.f32 %0, %1;" : "=f"(f32) : "f"(3.4f)), FLOAT_CMP(f32, log2(3.4f))); |
| 443 | + CHECK(3, asm("lg2.approx.f32 %0, %1;" : "=f"(f32) : "f"(9.7f)), FLOAT_CMP(f32, log2(9.7f))); |
| 444 | + CHECK(4, asm("lg2.approx.f32 %0, %1;" : "=f"(f32) : "f"(6.4f)), FLOAT_CMP(f32, log2(6.4f))); |
| 445 | + CHECK(5, asm("lg2.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, log2(2.1f))); |
| 446 | + CHECK(6, asm("lg2.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(3.4f)), FLOAT_CMP(f32, log2(3.4f))); |
| 447 | + CHECK(7, asm("lg2.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(9.7f)), FLOAT_CMP(f32, log2(9.7f))); |
| 448 | + CHECK(8, asm("lg2.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(6.4f)), FLOAT_CMP(f32, log2(6.4f))); |
| 449 | + return 0; |
| 450 | +} |
| 451 | + |
| 452 | +__device__ int sad() { |
| 453 | + int16_t s16; |
| 454 | + uint16_t u16; |
| 455 | + int32_t s32; |
| 456 | + uint32_t u32; |
| 457 | + int64_t s64; |
| 458 | + uint64_t u64; |
| 459 | + CHECK(1, asm("sad.s16 %0, %1, %2, %3;" : "=h"(s16) : "h"((int16_t)-1), "h"((int16_t)3), "h"((int16_t)5)), s16 == 9); |
| 460 | + CHECK(2, asm("sad.u16 %0, %1, %2, %3;" : "=h"(u16) : "h"((int16_t)1), "h"((int16_t)3), "h"((int16_t)5)), u16 == 7); |
| 461 | + CHECK(3, asm("sad.s32 %0, %1, %2, %3;" : "=r"(s32) : "r"(-1), "r"(3), "r"(5)), s32 == 9); |
| 462 | + CHECK(4, asm("sad.u32 %0, %1, %2, %3;" : "=r"(u32) : "r"(1), "r"(3), "r"(5)), u32 == 7); |
| 463 | + CHECK(5, asm("sad.s64 %0, %1, %2, %3;" : "=l"(s64) : "l"(-1ll), "l"(3ll), "l"(5ll)), s64 == 9); |
| 464 | + CHECK(6, asm("sad.u64 %0, %1, %2, %3;" : "=l"(u64) : "l"(1ll), "l"(3ll), "l"(5ll)), u64 == 7); |
| 465 | + return 0; |
| 466 | +} |
| 467 | + |
| 468 | +__device__ int asm_rsqrt() { |
| 469 | + float f32; |
| 470 | + double f64; |
| 471 | + CHECK(1, asm("rsqrt.approx.f32 %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, rsqrt(2.1f))); |
| 472 | + CHECK(2, asm("rsqrt.approx.f64 %0, %1;" : "=d"(f64) : "d"(2.1)), FLOAT_CMP(f64, rsqrt(2.1))); |
| 473 | + return 0; |
| 474 | +} |
| 475 | + |
| 476 | +__device__ int asm_sqrt() { |
| 477 | + float f32; |
| 478 | + double f64; |
| 479 | + CHECK(1, asm("sqrt.approx.f32 %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, sqrt(2.1f))); |
| 480 | + CHECK(2, asm("sqrt.approx.f32.ftz %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, sqrt(2.1f))); |
| 481 | + CHECK(3, asm("sqrt.rn.f32 %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, sqrt(2.1f))); |
| 482 | + CHECK(4, asm("sqrt.rz.f32 %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, sqrt(2.1f))); |
| 483 | + CHECK(5, asm("sqrt.rm.f32 %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, sqrt(2.1f))); |
| 484 | + CHECK(6, asm("sqrt.rp.f32 %0, %1;" : "=f"(f32) : "f"(2.1f)), FLOAT_CMP(f32, sqrt(2.1f))); |
| 485 | + CHECK(7, asm("sqrt.rn.f64 %0, %1;" : "=d"(f64) : "d"(2.1)), FLOAT_CMP(f64, sqrt(2.1))); |
| 486 | + CHECK(8, asm("sqrt.rz.f64 %0, %1;" : "=d"(f64) : "d"(2.1)), FLOAT_CMP(f64, sqrt(2.1))); |
| 487 | + CHECK(9, asm("sqrt.rm.f64 %0, %1;" : "=d"(f64) : "d"(2.1)), FLOAT_CMP(f64, sqrt(2.1))); |
| 488 | + CHECK(10, asm("sqrt.rp.f64 %0, %1;" : "=d"(f64) : "d"(2.1)), FLOAT_CMP(f64, sqrt(2.1))); |
| 489 | + return 0; |
| 490 | +} |
| 491 | + |
| 492 | +__device__ int testp() { |
| 493 | + int pred = 0; |
| 494 | + { asm(".reg .pred p1; testp.finite.f32 p1, %1; @p1 mov.s32 %0, 1;" : "=r"(pred) : "f"(0.1f)); if (!pred) { return 1; } }; |
| 495 | + { asm(".reg .pred p2; testp.infinite.f32 p2, %1; @p2 mov.s32 %0, 1;" : "=r"(pred) : "f"(std::numeric_limits<float>::infinity())); if (!pred) { return 2; } }; |
| 496 | + { asm(".reg .pred p3; testp.number.f32 p3, %1; @p3 mov.s32 %0, 1;" : "=r"(pred) : "f"(9.7f)); if (!pred) { return 3; } }; |
| 497 | + { asm(".reg .pred p4; testp.notanumber.f32 p4, %1; @p4 mov.s32 %0, 1;" : "=r"(pred) : "f"(NAN)); if (!pred) { return 4; } }; |
| 498 | + { asm(".reg .pred p5; testp.normal.f32 p5, %1; @p5 mov.s32 %0, 1;" : "=r"(pred) : "f"(9.5f)); if (!pred) { return 5; } }; |
| 499 | + { asm(".reg .pred p6; testp.subnormal.f32 p6, %1; @p6 mov.s32 %0, 1;" : "=r"(pred) : "f"(0.1e-300f)); if (!pred) { return 6; } }; |
| 500 | + { asm(".reg .pred p7; testp.finite.f64 p7, %1; @p7 mov.s32 %0, 1;" : "=r"(pred) : "d"(0.1)); if (!pred) { return 1; } }; |
| 501 | + { asm(".reg .pred p8; testp.infinite.f64 p8, %1; @p8 mov.s32 %0, 1;" : "=r"(pred) : "d"(std::numeric_limits<double>::infinity())); if (!pred) { return 2; } }; |
| 502 | + { asm(".reg .pred p9; testp.number.f64 p9, %1; @p9 mov.s32 %0, 1;" : "=r"(pred) : "d"(9.7)); if (!pred) { return 3; } }; |
| 503 | + { asm(".reg .pred p10; testp.notanumber.f64 p10, %1; @p10 mov.s32 %0, 1;" : "=r"(pred) : "d"(double(NAN))); if (!pred) { return 4; } }; |
| 504 | + { asm(".reg .pred p11; testp.normal.f64 p11, %1; @p11 mov.s32 %0, 1;" : "=r"(pred) : "d"(9.5)); if (!pred) { return 5; } }; |
| 505 | + { asm(".reg .pred p12; testp.subnormal.f64 p12, %1; @p12 mov.s32 %0, 1;" : "=r"(pred) : "d"(0.1e-400)); if (!pred) { return 6; } }; |
| 506 | + return 0; |
| 507 | +} |
| 508 | + |
373 | 509 | __device__ int dp2a() {
|
374 | 510 | int32_t i32;
|
375 | 511 | uint32_t u32;
|
@@ -431,6 +567,16 @@ __global__ void test(int *ec) {
|
431 | 567 | TEST(cnot);
|
432 | 568 | TEST(shl);
|
433 | 569 | TEST(shr);
|
| 570 | + TEST(asm_copysign); |
| 571 | + TEST(asm_cos); |
| 572 | + TEST(asm_sin); |
| 573 | + TEST(asm_tanh); |
| 574 | + TEST(asm_ex2); |
| 575 | + TEST(asm_lg2); |
| 576 | + TEST(sad); |
| 577 | + TEST(asm_rsqrt); |
| 578 | + TEST(asm_sqrt); |
| 579 | + TEST(testp); |
434 | 580 | TEST(brev);
|
435 | 581 | TEST(dp2a);
|
436 | 582 | TEST(dp4a);
|
|
0 commit comments