Skip to content
Open
50 changes: 50 additions & 0 deletions src/hotspot/share/opto/countbitsnode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "opto/opcodes.hpp"
#include "opto/phaseX.hpp"
#include "opto/type.hpp"
#include "utilities/population_count.hpp"

//------------------------------Value------------------------------------------
const Type* CountLeadingZerosINode::Value(PhaseGVN* phase) const {
Expand Down Expand Up @@ -116,3 +117,52 @@ const Type* CountTrailingZerosLNode::Value(PhaseGVN* phase) const {
}
return TypeInt::INT;
}

/*
Lemma 1: Given the known bits information of a value range, conservative _lo and _hi bounds
of that value range can be computed using the following formulas:-
- _hi = ~ZEROS
- _lo = ONES
Proof:-
- KnownBits.ZEROS and KnownBits.ONES are inferred out of the common prefix of the value range
delimiting bounds.

- Thus, ~KnownBits.ZEROS not only includes set bits in the common prefix but also optimistically assumes
that all other bits not included in the common prefix are also set.

- Consider the following illustration, which performs a round-trip translation
of a value range via known bits information, e.g.
A) Initial value range bounds to infer knownbits.
_lo = 0b11000100
_hi = 0b11000110
_common_prefix = 0b11000100
_common_prefix_mask = 0b11111100
_known_bits.ones = _lo & _common_prefix_mask = 0b11000100
_known_bits.zeros = ~_lo & _common_prefix_mask = 0b00111000

B) Now, transform the computed knownbits back to the value range.
_new_lo = _known_bits.ones = 0b11000100
_new_hi = ~_known_bits.zeros = 0b11000111

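- Every value V in the range has all KnownBits.ONES bits set and all KnownBits.ZEROS bits clear,
i.e. the set bits of V are a superset of ONES and a subset of ~ZEROS.
- We now know that ~KnownBits.ZEROS >= UB >= LB >= KnownBits.ONES (compared as unsigned values),
and that popcount(ONES) <= popcount(V) <= popcount(~ZEROS) for every such V.
- Therefore, popcount(ONES) and popcount(~ZEROS) can safely be assumed as the lower and upper
bounds of the result value range, respectively.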
*/
//------------------------------Value------------------------------------------
// Computes the value range of an int population count from the known bits of
// its input: every input value has at least the bits in _ones set and at most
// the bits in ~_zeros set, so the result lies in
// [popcount(_ones), popcount(~_zeros)].
const Type* PopCountINode::Value(PhaseGVN* phase) const {
  const Type* t = phase->type(in(1));
  if (t == Type::TOP) {
    return Type::TOP;
  }
  // The input of PopCountI is an int; is_int() asserts this instead of
  // silently dereferencing a null result from isa_int().
  const TypeInt* tint = t->is_int();
  KnownBits<juint> bits = tint->_bits;
  // Propagate the widen of the input rather than forcing Type::WidenMax:
  // widen drives optimistic data-flow passes such as CCP, where types start
  // at TOP and grow until a fixed point is reached, so preserving the input's
  // widen avoids converging to the widest range too eagerly.
  return TypeInt::make(population_count(bits._ones), population_count(~bits._zeros), tint->_widen);
}

//------------------------------Value------------------------------------------
// Computes the value range of a long population count from the known bits of
// its input: every input value has at least the bits in _ones set and at most
// the bits in ~_zeros set, so the (int) result lies in
// [popcount(_ones), popcount(~_zeros)].
const Type* PopCountLNode::Value(PhaseGVN* phase) const {
  const Type* t = phase->type(in(1));
  if (t == Type::TOP) {
    return Type::TOP;
  }
  // The input of PopCountL is a long; is_long() asserts this instead of
  // silently dereferencing a null result from isa_long().
  const TypeLong* tlong = t->is_long();
  KnownBits<julong> bits = tlong->_bits;
  // Propagate the widen of the input rather than forcing Type::WidenMax, so
  // that optimistic passes such as CCP do not converge to the widest range
  // earlier than the input type itself would.
  return TypeInt::make(population_count(bits._ones), population_count(~bits._zeros), tlong->_widen);
}
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/countbitsnode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ class PopCountINode : public CountBitsNode {
public:
// Forwards the single input node to the CountBitsNode base class.
PopCountINode(Node* in1) : CountBitsNode(in1) {}
virtual int Opcode() const;
// Type of the int population count result; defined in countbitsnode.cpp.
virtual const Type* Value(PhaseGVN* phase) const;
};

//---------- PopCountLNode -----------------------------------------------------
Expand All @@ -88,6 +89,7 @@ class PopCountLNode : public CountBitsNode {
public:
// Forwards the single input node to the CountBitsNode base class.
PopCountLNode(Node* in1) : CountBitsNode(in1) {}
virtual int Opcode() const;
// Type of the long population count result; defined in countbitsnode.cpp.
virtual const Type* Value(PhaseGVN* phase) const;
};


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

/**
* @test
* @bug 8365205
* @summary C2: Optimize popcount value computation using knownbits
* @library /test/lib /
* @run driver compiler.intrinsics.TestPopCountValueTransforms
*/
package compiler.intrinsics;

import compiler.lib.ir_framework.*;
import compiler.lib.generators.*;
import compiler.lib.verify.*;
import static compiler.lib.generators.Generators.*;
import jdk.test.lib.Utils;

/**
 * IR-framework test for the value-range computation of Integer.bitCount / Long.bitCount.
 * The "Elision...1" tests clamp the input so that the range checks on the bit count are
 * expected to fold, leaving no PopCount node (IR count " 0 "); the "Elision...2" tests use
 * a check that is tighter than the computed range, so a PopCount node is expected to
 * remain (IR count " >0 ").
 */
public class TestPopCountValueTransforms {
// Input arrays for the four tests; allocated and filled in the constructor.
int [] inI1;
int [] inI2;
long [] inL1;
long [] inL2;

// Clamps num to [0xF000F000, 0xF000F0FF] and returns 1 if the bit count lies in [8, 16], else 0.
// The IR rule expects no PopCountL node in the compiled method.
@Test
@IR(counts = {IRNode.POPCOUNT_L, " 0 "})
public long testPopCountElisionLong1(long num) {
num = Math.clamp(num, 0xF000F000L, 0xF000F0FFL);
// PopCount ValueRange = {lo:8, hi:16}
if (Long.bitCount(num) < 8 || Long.bitCount(num) > 16) {
return 0;
}
return 1;
}

// ANDs the results for all inL1 elements; every call must have returned 1.
@Run(test = {"testPopCountElisionLong1"}, mode = RunMode.STANDALONE)
public void runPopCountElisionLong1() {
long res = 1;
for (int i = 0; i < inL1.length; i++) {
res &= testPopCountElisionLong1(inL1[i]);
}
Verify.checkEQ(res, 1L);
}

// Clamps num to [0x3, 0xFFFF] and returns 0 if the bit count lies in [0, 11], else 1.
// The upper check (11) is below the commented range bound (16), and the IR rule expects
// at least one PopCountL node to remain.
@Test
@IR(counts = {IRNode.POPCOUNT_L, " >0 "})
public long testPopCountElisionLong2(long num) {
num = Math.clamp(num, 0x3L, 0xFFFFL);
// PopCount ValueRange = {lo:0, hi:16}
if (Long.bitCount(num) >= 0 && Long.bitCount(num) <= 11) {
return 0;
}
return 1;
}

// ORs the results for all inL2 elements; every call must have returned 0.
@Run(test = {"testPopCountElisionLong2"}, mode = RunMode.STANDALONE)
public void runPopCountElisionLong2() {
long res = 0;
for (int i = 0; i < inL2.length; i++) {
res |= testPopCountElisionLong2(inL2[i]);
}
Verify.checkEQ(res, 0L);
}

// Clamps num to [0xFE00F000, 0xFE00F00F] and returns 1 if the bit count lies in [11, 15], else 0.
// The IR rule expects no PopCountI node in the compiled method.
@Test
@IR(counts = {IRNode.POPCOUNT_I, " 0 "})
public int testPopCountElisionInt1(int num) {
// PopCount ValueRange = {lo:11, hi:15}
num = Math.clamp(num, 0xFE00F000, 0xFE00F00F);
if (Integer.bitCount(num) < 11 || Integer.bitCount(num) > 15) {
return 0;
}
return 1;
}

// ANDs the results for all inI1 elements; every call must have returned 1.
@Run(test = {"testPopCountElisionInt1"}, mode = RunMode.STANDALONE)
public void runPopCountElisionInt1() {
int res = 1;
for (int i = 0; i < inI1.length; i++) {
res &= testPopCountElisionInt1(inI1[i]);
}
Verify.checkEQ(res, 1);
}

// Clamps num to [0x3, 0xFF] and returns 0 if the bit count lies in [0, 5], else 1.
// The upper check (5) is below the commented range bound (8), and the IR rule expects
// at least one PopCountI node to remain.
@Test
@IR(counts = {IRNode.POPCOUNT_I, " >0 "})
public int testPopCountElisionInt2(int num) {
// PopCount ValueRange = {lo:0, hi:8}
num = Math.clamp(num, 0x3, 0xFF);
if (Integer.bitCount(num) >= 0 && Integer.bitCount(num) <= 5) {
return 0;
}
return 1;
}

// ORs the results for all inI2 elements; every call must have returned 0.
@Run(test = {"testPopCountElisionInt2"}, mode = RunMode.STANDALONE)
public void runPopCountElisionInt2() {
int res = 0;
for (int i = 0; i < inI2.length; i++) {
res |= testPopCountElisionInt2(inI2[i]);
}
Verify.checkEQ(res, 0);
}

// Number of elements in each input array.
static final int SIZE = 4096;

// Allocates the input arrays. inL1/inI1 are filled from G.longs()/G.ints()
// (presumably unrestricted values -- confirm against the Generators library);
// inL2/inI2 are drawn from [0x3, 0xFFC] and [0x3, 0x1F], whose bit counts do not
// exceed the 11 and 5 thresholds checked by the corresponding tests.
public TestPopCountValueTransforms() {
inL1 = new long[SIZE];
G.fill(G.longs(), inL1);

inL2 = new long[SIZE];
Generator<Long> genL = G.uniformLongs(0x3L, 0xFFCL);
for (int i = 0; i < SIZE; i++) {
inL2[i] = genL.next();
}

inI1 = new int[SIZE];
G.fill(G.ints(), inI1);

inI2 = new int[SIZE];
Generator<Integer> genI = G.uniformInts(0x3, 0x1F);
for (int i = 0; i < SIZE; i++) {
inI2[i] = genI.next();
}
}

// Entry point: runs the tests through the IR framework with tiered compilation disabled
// and a scaled-down compile threshold.
public static void main(String[] args) {
TestFramework.runWithFlags("-XX:-TieredCompilation", "-XX:CompileThresholdScaling=0.2");
}
}
5 changes: 5 additions & 0 deletions test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -1610,6 +1610,11 @@ public class IRNode {
beforeMatchingNameRegex(PHI, "Phi");
}

// IR node placeholder for the int population count node; registered below under the
// node name "PopCountI" (presumably the C2 PopCountINode -- confirm against opto classes).
public static final String POPCOUNT_I = PREFIX + "POPCOUNT_I" + POSTFIX;
static {
beforeMatchingNameRegex(POPCOUNT_I, "PopCountI");
}

public static final String POPCOUNT_L = PREFIX + "POPCOUNT_L" + POSTFIX;
static {
beforeMatchingNameRegex(POPCOUNT_L, "PopCountL");
Expand Down
80 changes: 80 additions & 0 deletions test/micro/org/openjdk/bench/java/lang/PopCountValueTransform.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.lang;

import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;

@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Thread)
public class PopCountValueTransform {
public int lower_bound = 0;
public int upper_bound = 10000;

@Benchmark
public int StockKernelInt() {
int res = 0;
for (int i = lower_bound; i < upper_bound; i++) {
int constrained_i = i & 0xFFFF;
res += constrained_i;
}
return res;
}

@Benchmark
public int LogicFoldingKerenlInt() {
int res = 0;
for (int i = lower_bound; i < upper_bound; i++) {
int constrained_i = i & 0xFFFF;
if (Integer.bitCount(constrained_i) > 16) {
throw new AssertionError("Uncommon trap");
}
res += constrained_i;
}
return res;
}

@Benchmark
public long StockKernelLong() {
long res = 0;
for (int i = lower_bound; i < upper_bound; i++) {
long constrained_i = i & 0xFFFFFFL;
res += constrained_i;
}
return res;
}

@Benchmark
public long LogicFoldingKerenLong() {
long res = 0;
for (int i = lower_bound; i < upper_bound; i++) {
long constrained_i = i & 0xFFFFFFL;
if (Long.bitCount(constrained_i) > 24) {
throw new AssertionError("Uncommon trap");
}
res += constrained_i;
}
return res;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume the stock kernels are there to show performance if there is no op, the folding kernels you hope have the same performance. It would be nice to have one where the bitCount does not fold away, just to keep that comparison :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see your point, on a second thought, since any benchmarks compare the performance of kernels with and without optimization it's better to do away with the stock variants and only retain folding kernels.

}