@@ -318,10 +318,11 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride)
 void
 mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
 {
+    const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
+    const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
+
     if (IS_BINARY_CONT(@type@, @type@)) {
-        if (dimensions[0] > VML_ASM_THRESHOLD &&
-            DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) &&
-            DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
+        if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) {
             CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]);
             /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
         }
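Note on the hoisted guard above: the VML fast path needs the output buffer to be either exactly one of the input buffers (in-place is fine) or to not overlap them at all; any partial overlap falls through to the scalar loop. DISJOINT_OR_SAME is a macro defined elsewhere in this source, so the helper below is only an assumed sketch of the check it performs, with disjoint_or_same as a hypothetical stand-in name.

#include <stddef.h>

/* Assumed illustration of the aliasing test behind DISJOINT_OR_SAME:
 * two buffers of n elements are safe for an out-of-place vector call
 * when they are the very same pointer or do not overlap at all;
 * partial overlap forces the scalar fallback. */
static int
disjoint_or_same(const char *a, const char *b, size_t n, size_t elem_size)
{
    const char *a_end = a + n * elem_size;
    const char *b_end = b + n * elem_size;

    if (a == b) {
        return 1;                          /* identical buffers: in-place is allowed */
    }
    return (a_end <= b) || (b_end <= a);   /* ranges are fully disjoint */
}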
@@ -371,8 +372,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp
         }
     }
     else if (IS_BINARY_CONT_S1(@type@, @type@)) {
-        if (dimensions[0] > VML_ASM_THRESHOLD &&
-            DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
+        if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same2) {
             CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], @OP@1.0, *(@type@*)args[0], 0.0, 1.0);
             /* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], @OP@1.0, *(@type@*)args[0], 0.0, 1.0, (@type@*) args[2]); */
         }
@@ -412,8 +412,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp
         }
     }
     else if (IS_BINARY_CONT_S2(@type@, @type@)) {
-        if (dimensions[0] > VML_ASM_THRESHOLD &&
-            DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@))) {
+        if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1) {
             CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], 1.0, @OP@(*(@type@*)args[1]), 0.0, 1.0);
             /* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], 1.0, @OP@(*(@type@*)args[1]), 0.0, 1.0, (@type@*) args[2]); */
         }
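Both scalar-operand branches ride on MKL's LinearFrac kernel, which computes y[i] = (scalea*a[i] + shifta) / (scaleb*b[i] + shiftb). With scaleb = 0.0 and shiftb = 1.0 the denominator is constant 1, so the call degenerates to the affine map scalea*a[i] + shifta: the scalar-first branch passes @OP@1.0 as scalea and the scalar as shifta, while the scalar-second branch passes 1.0 and @OP@(scalar). A standalone sketch of the same trick against plain MKL (assumes an MKL build environment; the data and sizes are made up):

#include <stdio.h>
#include <mkl.h>

int main(void)
{
    double x[4] = {1.0, 2.0, 3.0, 4.0};
    double y[4];
    const double s = 10.0;

    /* y[i] = (scalea*x[i] + shifta) / (scaleb*x[i] + shiftb);
     * scaleb = 0.0 and shiftb = 1.0 reduce this to scalea*x[i] + shifta. */
    vdLinearFrac(4, x, x, -1.0, s, 0.0, 1.0, y);   /* y[i] = s - x[i] */
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);

    vdLinearFrac(4, x, x, s, 0.0, 0.0, 1.0, y);    /* y[i] = s * x[i] */
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);
    return 0;
}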
@@ -478,10 +477,11 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp
 void
 mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
 {
+    const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
+    const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
+
     if (IS_BINARY_CONT(@type@, @type@)) {
-        if (dimensions[0] > VML_ASM_THRESHOLD &&
-            DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) &&
-            DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
+        if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) {
             CHUNKED_VML_CALL3(v@s@Mul, dimensions[0], @type@, args[0], args[1], args[2]);
             /* v@s@Mul(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
         }
@@ -531,8 +531,7 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int
         }
     }
     else if (IS_BINARY_CONT_S1(@type@, @type@)) {
-        if (dimensions[0] > VML_ASM_THRESHOLD &&
-            DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
+        if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same2) {
             CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[1], args[2], *(@type@*)args[0], 0.0, 0.0, 1.0);
             /* v@s@LinearFrac(dimensions[0], (@type@*) args[1], (@type@*) args[1], *(@type@*)args[0], 0.0, 0.0, 1.0, (@type@*) args[2]); */
         }
@@ -572,8 +571,7 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int
         }
     }
     else if (IS_BINARY_CONT_S2(@type@, @type@)) {
-        if (dimensions[0] > VML_ASM_THRESHOLD &&
-            DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@))) {
+        if (dimensions[0] > VML_ASM_THRESHOLD && disjoint_or_same1) {
             CHUNKED_VML_LINEARFRAC_CALL(v@s@LinearFrac, dimensions[0], @type@, args[0], args[2], *(@type@*)args[1], 0.0, 0.0, 1.0);
             /* v@s@LinearFrac(dimensions[0], (@type@*) args[0], (@type@*) args[0], *(@type@*)args[1], 0.0, 0.0, 1.0, (@type@*) args[2]); */
         }
@@ -630,10 +628,11 @@ mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_int
 void
 mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
 {
+    const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
+    const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
+
     if (IS_BINARY_CONT(@type@, @type@)) {
-        if (dimensions[0] > VML_D_THRESHOLD &&
-            DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@)) &&
-            DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@))) {
+        if (dimensions[0] > VML_D_THRESHOLD && disjoint_or_same1 && disjoint_or_same2) {
             CHUNKED_VML_CALL3(v@s@Div, dimensions[0], @type@, args[0], args[1], args[2]);
             /* v@s@Div(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
         }
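The divide branch gates on VML_D_THRESHOLD rather than VML_ASM_THRESHOLD but keeps the same dispatch shape: once the arrays are contiguous, long enough, and alias-safe, the loop body is replaced by a chunked VML call. CHUNKED_VML_CALL3 is this project's wrapper around the plain calls kept in the comments (vdAdd, vdMul, vdDiv, ...); its definition and block size live elsewhere in the repository, so the helper below is only an assumed sketch of what a chunked dispatch can look like, with the name chunked_vdAdd and the 8192-element block invented for the example.

#include <stddef.h>
#include <mkl.h>

#define SKETCH_BLOCK 8192   /* hypothetical block size, not the project's value */

/* Assumed illustration of chunked dispatch: feed the arrays to MKL in
 * bounded slices so each VML call operates on at most SKETCH_BLOCK elements. */
static void
chunked_vdAdd(size_t n, const double *a, const double *b, double *y)
{
    size_t done = 0;
    while (done < n) {
        size_t remaining = n - done;
        MKL_INT blk = (MKL_INT)(remaining < SKETCH_BLOCK ? remaining : SKETCH_BLOCK);

        vdAdd(blk, a + done, b + done, y + done);   /* elementwise y = a + b on the slice */
        done += (size_t)blk;
    }
}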
@@ -1365,83 +1364,114 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, npy_intp st
     }
 }

-/* TODO: USE MKL */
 /**begin repeat1
  * #kind = add, subtract#
  * #OP = +, -#
  * #PW = 1, 0#
+ * #VML = Add, Sub#
  */
 void
 mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
 {
-    if (IS_BINARY_REDUCE && @PW@) {
-        npy_intp n = dimensions[0];
-        @ftype@ * or = ((@ftype@ *)args[0]);
-        @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
-        @ftype@ rr, ri;
+    const int contig = IS_BINARY_CONT(@type@, @type@);
+    const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
+    const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
+    const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2;

-        pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
-        *or @OP@= rr;
-        *oi @OP@= ri;
-        return;
+    if (can_vectorize && dimensions[0] > VML_ASM_THRESHOLD) {
+        CHUNKED_VML_CALL3(v@s@@VML@, dimensions[0], @type@, args[0], args[1], args[2]);
+        /* v@s@@VML@(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
     }
-    else {
-        BINARY_LOOP {
-            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
-            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
-            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
-            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
-            ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
-            ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
+    else {
+        if (IS_BINARY_REDUCE && @PW@) {
+            npy_intp n = dimensions[0];
+            @ftype@ * or = ((@ftype@ *)args[0]);
+            @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
+            @ftype@ rr, ri;
+
+            pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
+            *or @OP@= rr;
+            *oi @OP@= ri;
+            return;
+        }
+        else {
+            BINARY_LOOP {
+                const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+                const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+                const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+                const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+                ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
+                ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
+            }
         }
     }
 }
 /**end repeat1**/

-/* TODO: USE MKL */
 void
 mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
 {
-    BINARY_LOOP {
-        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
-        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
-        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
-        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
-        ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
-        ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
+    const int contig = IS_BINARY_CONT(@type@, @type@);
+    const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
+    const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
+    const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2;
+
+    if (can_vectorize && dimensions[0] > VML_ASM_THRESHOLD) {
+        CHUNKED_VML_CALL3(v@s@Mul, dimensions[0], @type@, args[0], args[1], args[2]);
+        /* v@s@Mul(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
+    }
+    else {
+        BINARY_LOOP {
+            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+            ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
+            ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
+        }
     }
 }

-/* TODO: USE MKL */
 void
 mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func))
 {
-    BINARY_LOOP {
-        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
-        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
-        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
-        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
-        const @ftype@ in2r_abs = fabs@c@(in2r);
-        const @ftype@ in2i_abs = fabs@c@(in2i);
-        if (in2r_abs >= in2i_abs) {
-            if (in2r_abs == 0 && in2i_abs == 0) {
-                /* divide by zero should yield a complex inf or nan */
-                ((@ftype@ *)op1)[0] = in1r/in2r_abs;
-                ((@ftype@ *)op1)[1] = in1i/in2i_abs;
+    const int contig = IS_BINARY_CONT(@type@, @type@);
+    const int disjoint_or_same1 = DISJOINT_OR_SAME(args[0], args[2], dimensions[0], sizeof(@type@));
+    const int disjoint_or_same2 = DISJOINT_OR_SAME(args[1], args[2], dimensions[0], sizeof(@type@));
+    const int can_vectorize = contig && disjoint_or_same1 && disjoint_or_same2;
+
+    if (can_vectorize && dimensions[0] > VML_D_THRESHOLD) {
+        CHUNKED_VML_CALL3(v@s@Div, dimensions[0], @type@, args[0], args[1], args[2]);
+        /* v@s@Div(dimensions[0], (@type@*) args[0], (@type@*) args[1], (@type@*) args[2]); */
+    }
+    else {
+        BINARY_LOOP {
+            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
+            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
+            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
+            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
+            const @ftype@ in2r_abs = fabs@c@(in2r);
+            const @ftype@ in2i_abs = fabs@c@(in2i);
+            if (in2r_abs >= in2i_abs) {
+                if (in2r_abs == 0 && in2i_abs == 0) {
+                    /* divide by zero should yield a complex inf or nan */
+                    ((@ftype@ *)op1)[0] = in1r/in2r_abs;
+                    ((@ftype@ *)op1)[1] = in1i/in2i_abs;
+                }
+                else {
+                    const @ftype@ rat = in2i/in2r;
+                    const @ftype@ scl = 1.0@c@/(in2r + in2i*rat);
+                    ((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl;
+                    ((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl;
+                }
             }
             else {
-                const @ftype@ rat = in2i/in2r;
-                const @ftype@ scl = 1.0@c@/(in2r + in2i*rat);
-                ((@ftype@ *)op1)[0] = (in1r + in1i*rat)*scl;
-                ((@ftype@ *)op1)[1] = (in1i - in1r*rat)*scl;
+                const @ftype@ rat = in2r/in2i;
+                const @ftype@ scl = 1.0@c@/(in2i + in2r*rat);
+                ((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl;
+                ((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl;
             }
         }
-        else {
-            const @ftype@ rat = in2r/in2i;
-            const @ftype@ scl = 1.0@c@/(in2i + in2r*rat);
-            ((@ftype@ *)op1)[0] = (in1r*rat + in1i)*scl;
-            ((@ftype@ *)op1)[1] = (in1i*rat - in1r)*scl;
-        }
     }
 }

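For the complex kernels the fallback loops are unchanged in substance: multiply expands (a+bi)(c+di) directly, and divide keeps the Smith-style scaling, dividing through by whichever denominator component has the larger magnitude so the intermediate products do not overflow. The fast path instead hands whole contiguous arrays to VML's complex routines. A minimal standalone use of those routines (assumes MKL is available; the sample values are arbitrary):

#include <stdio.h>
#include <mkl.h>

int main(void)
{
    /* MKL_Complex16 is a pair of doubles: {real, imag}. */
    MKL_Complex16 a[2] = {{1.0, 2.0}, {3.0, -1.0}};
    MKL_Complex16 b[2] = {{0.5, 0.0}, {0.0, 2.0}};
    MKL_Complex16 prod[2], quot[2];

    vzMul(2, a, b, prod);   /* prod[i] = a[i] * b[i] */
    vzDiv(2, a, b, quot);   /* quot[i] = a[i] / b[i] */

    printf("prod[0] = %g%+gi\n", prod[0].real, prod[0].imag);
    printf("quot[0] = %g%+gi\n", quot[0].real, quot[0].imag);
    return 0;
}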