214
214
N = maximum (lengths)
215
215
Dlen = vecunrolllen (D)
216
216
Sreduced = (S > 0 ) && (lengths[S] == - 1 ) && N != - 1
217
- # @show N, M, Sreduced
218
217
if Sreduced
219
218
M = N
220
219
t = q
@@ -262,7 +261,6 @@ function parent_op_name(
262
261
if n == tiledouterreduction
263
262
parent = Symbol (parent, modsuffix)
264
263
else
265
- # parent = variable_name(opp, suffix)
266
264
if parents_u₂syms[n]
267
265
parent = Symbol (parent, suffix_)
268
266
end
@@ -273,16 +271,9 @@ function parent_op_name(
273
271
else
274
272
getu₁forreduct (ls, opp, u₁)
275
273
end
276
- # u = parents_u₁syms[n] ? u₁ : 1
277
274
parent = Symbol (parent, ' _' , u)
278
275
end
279
- # if (tiledouterreduction == -1) && LoopVectorization.names(ls)[ls.unrollspecification[].u₁loopnum] ∈ reduceddependencies(opp)
280
- # u = u₁
281
- # else
282
-
283
- # end
284
276
if opisvectorized && isload (opp) && (! isvectorized (opp))
285
- # @show parents_u₁syms, parents_u₂syms, parent
286
277
parent = Symbol (parent, " ##broadcasted##" )
287
278
end
288
279
parent
@@ -361,31 +352,24 @@ function lower_compute!(
361
352
else
362
353
newpname = Symbol (newparentname, ' _' , u₁)
363
354
push! (q. args, Expr (:(= ), newpname, Symbol (parentname, ' _' , u₁)))
364
- # @show newparentop op instruction(newparentop)
365
355
reduce_expr! (q, newparentname, instruction (newparentop), u₁, - 1 , true )
366
356
push! (q. args, Expr (:(= ), Symbol (newparentname, ' _' , 1 ), Symbol (newparentname, " ##onevec##" )))
367
357
end
368
358
end
369
359
end
370
360
# if suffix === nothing# &&
371
361
# end
372
- # if instr.instr === :div_fast
373
- # @show op, suffix, parents_u₂syms parents(op)
374
- # @show isu₂unrolled.(parents(op))
375
- # end
376
362
# cache unroll and tiling check of parents
377
363
# not broadcasted, because we use frequent checks of individual bools
378
364
# making BitArrays inefficient.
379
365
# parentsyms = [opp.variable for opp ∈ parents(op)]
380
366
Uiter = opunrolled ? u₁ - 1 : 0
381
- # @show mvar, opunrolled, u₁, u₁loopsym, u₂loopsym
382
367
isreduct = isreduction (op)
383
368
if Base. libllvm_version < v " 11.0.0" && (suffix ≠ - 1 ) && isreduct# && (iszero(suffix) || (ls.unrollspecification[].u₂ - 1 == suffix))
384
369
# if (length(reduceddependencies(op)) > 0) | (length(reducedchildren(op)) > 0)# && (iszero(suffix) || (ls.unrollspecification[].u₂ - 1 == suffix))
385
370
# instrfid = findfirst(isequal(instr.instr), (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub))
386
371
instrfid = findfirst (Base. Fix2 (=== ,instr. instr), (:vfmadd_fast , :vfnmadd_fast , :vfmsub_fast , :vfnmsub_fast ))
387
372
# instrfid = findfirst(isequal(instr.instr), (:vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast))
388
- # @show isreduct, instrfid, instr.instr sub_fmas(ls, op, ua)
389
373
# want to instcombine when parent load's deps are superset
390
374
# also make sure opp is unrolled
391
375
if ! (instrfid === nothing ) && (opunrolled && u₁ > 1 ) && sub_fmas (ls, op, ua)
@@ -414,6 +398,7 @@ function lower_compute!(
414
398
# for u ∈ 0:Uiter
415
399
isouterreduct = false
416
400
instrcall = callexpr (instr)
401
+ dopartialmap = false
417
402
varsym = if tiledouterreduction > 0 # then suffix ≠ -1
418
403
# modsuffix = ((u + suffix*(Uiter + 1)) & 7)
419
404
isouterreduct = true
@@ -426,28 +411,18 @@ function lower_compute!(
426
411
if isreduct # (isanouterreduction(ls, op))
427
412
# isouterreduct = true
428
413
isouterreduct = isanouterreduction (ls, op)
429
- # @show op, isouterreduct, u₁, ls.unrollspecification[].u₂ != -1
430
- if isouterreduct
431
- Symbol (mvar, ' _' , getu₁full (ls, u₁))
432
- else
433
- Symbol (mvar, ' _' , getu₁forreduct (ls, op, u₁))
434
- end
414
+ u₁reduct = isouterreduct ? getu₁full (ls, u₁) : getu₁forreduct (ls, op, u₁)
415
+ dopartialmap = u₁reduct > u₁
416
+ Symbol (mvar, ' _' , u₁reduct)
435
417
else
436
418
Symbol (mvar, ' _' , u₁)
437
419
end
438
420
else
439
421
Symbol (mvar, ' _' , 1 )
440
422
end
423
+ # @show getu₁forreduct(ls, op, u₁)
441
424
selfopname = varsym
442
- # @show op, tiledouterreduction, isouterreduct
443
- # if name(op) === Symbol("##op#5631")
444
- # @show name(op), parents(op), name.(parents(op))
445
- # parent_name = parent_op_name(parents_op, 1, modsuffix, suffix_, parents_u₁syms, parents_u₂syms, u₁, opisvectorized, tiledouterreduction)
446
- # @show parent_name
447
- # end
448
- # @show selfopname, varsym, mvar, mangledvar(op)
449
425
selfdep = 0
450
- # showexpr = false
451
426
for n ∈ 1 : nparents
452
427
opp = parents_op[n]
453
428
if isloopvalue (opp)
@@ -461,19 +436,19 @@ function lower_compute!(
461
436
selfopname = parent_op_name (ls, parents_op, n, modsuffix, suffix_, parents_u₁syms, parents_u₂syms, u₁, opisvectorized, tiledouterreduction)
462
437
push! (instrcall. args, selfopname)
463
438
else
464
- # @show name(parents_op[n]), name(op), mangledvar(parents_op[n]), mangledvar(op)
439
+ # @show varsym
465
440
push! (instrcall. args, varsym)
466
441
end
467
442
elseif ((! isu₂unrolled (op)) & isu₂unrolled (opp)) && (isouterreduction (ls, opp) != - 1 )
468
443
# this checks if the parent is u₂ unrolled but this operation is not, in which case we need to reduce it.
469
444
push! (instrcall. args, reduce_expr_u₂ (mangledvar (opp), instruction (opp), ureduct (ls)))
470
445
else
471
446
parent = parent_op_name (ls, parents_op, n, modsuffix, suffix_, parents_u₁syms, parents_u₂syms, u₁, opisvectorized, tiledouterreduction)
472
- # @show parent, u₁, selfopname
473
447
push! (instrcall. args, parent)
474
448
end
475
449
end
476
450
selfdepreduce = ifelse (((! u₁unrolledsym) & isu₁unrolled (op)) & (u₁ > 1 ), selfdep, 0 )
451
+ # push!(q.args, (isreduct, u₁, (!u₁unrolledsym), isu₁unrolled(op), dopartialmap, varsym))
477
452
if maskreduct
478
453
ifelsefunc = if ls. unrollspecification[]. u₁ == 1
479
454
:ifelse # don't need to be fancy
@@ -510,9 +485,9 @@ function lower_compute!(
510
485
# @show op, isouterreduct, maskreduct, instr
511
486
make_partial_map! (instrcall, selfopname, u₁, selfdepreduce)
512
487
end
513
- elseif selfdep != 0 &&
488
+ elseif selfdep != 0 && (dopartialmap ||
514
489
(isouterreduct && (opunrolled) && (u₁ < ls. unrollspecification[]. u₁)) ||
515
- (isreduct & (u₁ > 1 ) & (! u₁unrolledsym) & isu₁unrolled (op))
490
+ (isreduct & (u₁ > 1 ) & (! u₁unrolledsym) & isu₁unrolled (op)))
516
491
# first possibility (`isouterreduct && opunrolled && (u₁ < ls.unrollspecification[].u₁)`):
517
492
# checks if we're in the "reduct" part of an outer reduction
518
493
#
@@ -524,7 +499,9 @@ function lower_compute!(
524
499
# elseif
525
500
end
526
501
if instr. instr === :identity && isone (length (parents_op))
527
- push! (q. args, Expr (:(= ), varsym, instrcall. args[2 ]))
502
+ if instrcall. args[2 ] != = varsym
503
+ push! (q. args, Expr (:(= ), varsym, instrcall. args[2 ]))
504
+ end
528
505
elseif identifier (op) ∉ ls. outer_reductions && should_broadcast_op (op)
529
506
push! (q. args, Expr (:(= ), varsym, Expr (:call , lv (:vbroadcast ), VECTORWIDTHSYMBOL, instrcall)))
530
507
else
0 commit comments