Skip to content

Commit 56e75fc

Browse files
Merge pull request #162 from LLNL/release/2.2.2
Release v2.2.2
2 parents c912f58 + c2b23e9 commit 56e75fc

16 files changed

+904
-305
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
cmake_policy(SET CMP0057 NEW)
88
cmake_policy(SET CMP0048 NEW)
99

10-
project(Chai LANGUAGES CXX VERSION 2.2.1)
10+
project(Chai LANGUAGES CXX VERSION 2.2.2)
1111

1212
set(ENABLE_CUDA Off CACHE BOOL "Enable CUDA")
1313
set(ENABLE_HIP Off CACHE BOOL "Enable HIP")

docs/sphinx/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363
# The short X.Y version.
6464
version = u'2.2'
6565
# The full version, including alpha/beta/rc tags.
66-
release = u'2.2.1'
66+
release = u'2.2.2'
6767

6868
# The language for content autogenerated by Sphinx. Refer to documentation
6969
# for a list of supported languages.

docs/sphinx/conf.py.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ author = u''
6262
# The short X.Y version.
6363
version = u'2.2'
6464
# The full version, including alpha/beta/rc tags.
65-
release = u'2.2.1'
65+
release = u'2.2.2'
6666

6767
# The language for content autogenerated by Sphinx. Refer to documentation
6868
# for a list of supported languages.

docs/sphinx/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ Any questions? Contact chai-dev@llnl.gov
6363

6464
getting_started
6565
tutorial
66+
user_guide
6667

6768
.. toctree::
6869
:maxdepth: 2

docs/sphinx/user_guide.rst

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
.. Copyright (c) 2016, Lawrence Livermore National Security, LLC. All
2+
rights reserved.
3+
4+
Produced at the Lawrence Livermore National Laboratory
5+
6+
This file is part of CHAI.
7+
8+
LLNL-CODE-705877
9+
10+
For details, see https:://github.com/LLNL/CHAI
11+
Please also see the NOTICE and LICENSE files.
12+
13+
Redistribution and use in source and binary forms, with or without
14+
modification, are permitted provided that the following conditions
15+
are met:
16+
17+
- Redistributions of source code must retain the above copyright
18+
notice, this list of conditions and the following disclaimer.
19+
20+
- Redistributions in binary form must reproduce the above copyright
21+
notice, this list of conditions and the following disclaimer in the
22+
documentation and/or other materials provided with the
23+
distribution.
24+
25+
- Neither the name of the LLNS/LLNL nor the names of its contributors
26+
may be used to endorse or promote products derived from this
27+
software without specific prior written permission.
28+
29+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
30+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
32+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
33+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34+
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35+
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
36+
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
37+
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38+
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
39+
WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
40+
POSSIBILITY OF SUCH DAMAGE.
41+
42+
.. _user_guide:
43+
44+
**********
45+
User Guide
46+
**********
47+
48+
-----------------------------------
49+
A Portable Pattern for Polymorphism
50+
-----------------------------------
51+
52+
CHAI provides a data structure to help handle cases where it is desirable to call virtual functions on the device. If you only call virtual functions on the host, this pattern is unnecessary. But for those who do want to use virtual functions on the device without a painstaking amount of refactoring, we begin with a short, albeit admittedly contrived example.
53+
54+
.. code-block:: cpp
55+
56+
class MyBaseClass {
57+
public:
58+
MyBaseClass() {}
59+
virtual ~MyBaseClass() {}
60+
virtual int getValue() const = 0;
61+
};
62+
63+
class MyDerivedClass : public MyBaseClass {
64+
public:
65+
MyDerivedClass(int value) : MyBaseClass(), m_value(value) {}
66+
~MyDerivedClass() {}
67+
int getValue() const { return m_value; }
68+
69+
private:
70+
int m_value;
71+
};
72+
73+
int main(int argc, char** argv) {
74+
MyBaseClass* myBaseClass = new MyDerivedClass(0);
75+
myBaseClass->getValue();
76+
delete myBaseClass;
77+
return 0;
78+
}
79+
80+
It is perfectly fine to call `myBaseClass->getValue()` in host code, since `myBaseClass` was created on the host. However, what if you want to call this virtual function on the device?
81+
82+
.. code-block:: cpp
83+
84+
__global__ void callVirtualFunction(MyBaseClass* myBaseClass) {
85+
myBaseClass->getValue();
86+
}
87+
88+
int main(int argc, char** argv) {
89+
MyBaseClass* myBaseClass = new MyDerivedClass(0);
90+
callVirtualFunction<<<1, 1>>>(myBaseClass);
91+
delete myBaseClass;
92+
return 0;
93+
}
94+
95+
At best, calling this code will result in a crash. At worst, it will access garbage and happily continue while giving incorrect results. It is illegal to access host pointers on the device and produces undefined behavior. So what is our next attempt? Why not pass the argument by value rather than by a pointer?
96+
97+
.. code-block:: cpp
98+
99+
__global__ void callVirtualFunction(MyBaseClass myBaseClass) {
100+
myBaseClass.getValue();
101+
}
102+
103+
int main(int argc, char** argv) {
104+
MyBaseClass* myBaseClass = new MyDerivedClass(0);
105+
callVirtualFunction<<<1, 1>>>(*myBaseClass); // This will not compile
106+
delete myBaseClass;
107+
return 0;
108+
}
109+
110+
At first glance, this may seem like it would work, but this is not supported by nvidia: "It is not allowed to pass as an argument to a `__global__` function an object of a class with virtual functions" (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#virtual-functions). Also: "It is not allowed to pass as an argument to a `__global__` function an object of a class derived from virtual base classes" (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#virtual-base-classes). You could refactor to use the curiously recurring template pattern, but that would likely require a large development effort and also limits the programming patterns you can use. Also, there is a limitation on the size of the arguments passed to a global kernel, so if you have a very large class this is simply impossible. So we make another attempt.
111+
112+
.. code-block:: cpp
113+
114+
__global__ void callVirtualFunction(MyBaseClass* myBaseClass) {
115+
myBaseClass->getValue();
116+
}
117+
118+
int main(int argc, char** argv) {
119+
MyBaseClass* myBaseClass = new MyDerivedClass(0);
120+
MyBaseClass* d_myBaseClass;
121+
cudaMalloc(&d_myBaseClass, sizeof(MyBaseClass));
122+
cudaMemcpy(d_myBaseClass, myBaseClass, sizeof(MyBaseClass), cudaMemcpyHostToDevice);
123+
124+
callVirtualFunction<<<1, 1>>>(d_myBaseClass);
125+
126+
cudaFree(d_myBaseClass);
127+
delete myBaseClass;
128+
129+
return 0;
130+
}
131+
132+
We are getting nearer, but there is still a flaw. The bits of `myBaseClass` contain the virtual function table that allows virtual function lookups on the host, but that virtual function table is not valid for lookups on the device since it contains pointers to host functions. It will not work any better to cast to `MyDerivedClass` and copy the bits. The only option is to call the constructor on the device and keep that device pointer around.
133+
134+
.. code-block:: cpp
135+
136+
__global__ void make_on_device(MyBaseClass** myBaseClass, int argument) {
137+
*myBaseClass = new MyDerivedClass(argument);
138+
}
139+
140+
__global__ void destroy_on_device(MyBaseClass* myBaseClass) {
141+
delete myBaseClass;
142+
}
143+
144+
__global__ void callVirtualFunction(MyBaseClass* myBaseClass) {
145+
myBaseClass->getValue();
146+
}
147+
148+
int main(int argc, char** argv) {
149+
MyBaseClass** d_temp;
150+
cudaMalloc(&d_temp, sizeof(MyBaseClass*));
151+
make_on_device<<<1, 1>>>(d_temp, 0);
152+
153+
MyBaseClass** temp = (MyBaseClass**) malloc(sizeof(MyBaseClass*));
154+
cudaMemcpy(temp, d_temp, sizeof(MyBaseClass*), cudaMemcpyDeviceToHost);
155+
MyBaseClass d_myBaseClass = *temp;
156+
157+
callVirtualFunction<<<1, 1>>>(d_myBaseClass);
158+
159+
free(temp);
160+
destroy_on_device<<<1, 1>>>(d_myBaseClass);
161+
cudaFree(d_temp);
162+
163+
return 0;
164+
}
165+
166+
OK, this is finally correct, but super tedious. So we took care of all the boilerplate and underlying details for you. The final result is at least recognizable when compared to the original code. The added benefit is that you can use a `chai::managed_ptr` on the host AND the device.
167+
168+
.. code-block:: cpp
169+
170+
__global__ void callVirtualFunction(chai::managed_ptr<MyBaseClass> myBaseClass) {
171+
myBaseClass->getValue();
172+
}
173+
174+
int main(int argc, char** argv) {
175+
chai::managed_ptr<MyBaseClass> myBaseClass = chai::make_managed<MyDerivedClass>(0);
176+
myBaseClass->getValue(); // Accessible on the host
177+
callVirtualFunction<<<1, 1>>>(myBaseClass); // Accessible on the device
178+
myBaseClass.free();
179+
return 0;
180+
}
181+
182+
OK, so we didn't do all the work for you, but we definitely gave you a leg up. What's left for you to do? You just need to make sure the functions accessed on the device have the `__device__` specifier (including constructors and destructors). We use the `CHAI_HOST_DEVICE` macro in this example, which actually annotates the functions as `__host__ __device__` so we can call the virtual method on both the host and the device. You also need to make sure the destructors of all base classes are virtual so the object gets cleaned up properly on the device.
183+
184+
.. code-block:: cpp
185+
186+
class MyBaseClass {
187+
public:
188+
CARE_HOST_DEVICE MyBaseClass() {}
189+
CARE_HOST_DEVICE virtual ~MyBaseClass() {}
190+
CARE_HOST_DEVICE virtual int getValue() const = 0;
191+
};
192+
193+
class MyDerivedClass : public MyBaseClass {
194+
public:
195+
CARE_HOST_DEVICE MyDerivedClass(int value) : MyBaseClass(), m_value(value) {}
196+
CARE_HOST_DEVICE ~MyDerivedClass() {}
197+
CARE_HOST_DEVICE int getValue() const { return m_value; }
198+
199+
private:
200+
int m_value;
201+
};
202+
203+
Now you may rightfully ask, what happens when this class contains raw pointers? There is a convenient solution for this case and we demonstrate with a more interesting example.
204+
205+
.. code-block:: cpp
206+
207+
class MyBaseClass {
208+
public:
209+
CARE_HOST_DEVICE MyBaseClass() {}
210+
CARE_HOST_DEVICE virtual ~MyBaseClass() {}
211+
CARE_HOST_DEVICE virtual int getScalarValue() const = 0;
212+
CARE_HOST_DEVICE virtual int getArrayValue(int index) const = 0;
213+
};
214+
215+
class MyDerivedClass : public MyBaseClass {
216+
public:
217+
CARE_HOST_DEVICE MyDerivedClass(int scalarValue, int* arrayValue)
218+
: MyBaseClass(), m_scalarValue(scalarValue), m_arrayValue(arrayValue) {}
219+
CARE_HOST_DEVICE ~MyDerivedClass() {}
220+
CARE_HOST_DEVICE int getScalarValue() const { return m_scalarValue; }
221+
CARE_HOST_DEVICE int getArrayValue() const { return m_arrayValue; }
222+
223+
private:
224+
int m_scalarValue;
225+
int* m_arrayValue;
226+
};
227+
228+
__global__ void callVirtualFunction(chai::managed_ptr<MyBaseClass> myBaseClass) {
229+
int i = blockIdx.x*blockDim.x + threadIdx.x;
230+
myBaseClass->getScalarValue();
231+
myBaseClass->getArrayValue(i);
232+
}
233+
234+
int main(int argc, char** argv) {
235+
chai::ManagedArray<int> arrayValue(10);
236+
chai::managed_ptr<MyBaseClass> myBaseClass
237+
= chai::make_managed<MyDerivedClass>(0, chai::unpack(arrayValue));
238+
callVirtualFunction<<<1, 10>>>(myBaseClass);
239+
myBaseClass.free();
240+
arrayValue.free();
241+
return 0;
242+
}
243+
244+
The respective host and device pointers contained in the `chai::ManagedArray` can be extracted and passed to the host and device instance of `MyDerivedClass` using `chai::unpack`. Of course, if you never dereference `m_arrayValue` on the device, you could simply pass a raw pointer to `chai::make_managed`. If the class contains a `chai::ManagedArray`, a `chai::ManagedArray` can simply be passed to the constructor. The same rules apply for passing a `chai::managed_ptr`, calling `chai::unpack` on a `chai::managed_ptr`, or passing a raw pointer and not accessing it on the device.
245+
246+
More complicated rules apply for keeping the data in sync between the host and device instances of an object, but it is possible to do so to a limited extent. It is also possible to control the lifetimes of objects passed to `chai::make_managed`.
247+
248+
.. code-block:: cpp
249+
int main(int argc, char** argv) {
250+
chai::ManagedArray<int> arrayValue(10);
251+
252+
chai::managed_ptr<MyBaseClass> myBaseClass
253+
= chai::make_managed<MyDerivedClass>(0, chai::unpack(arrayValue));
254+
myBaseClass.set_callback([=] (chai::Action action, chai::ExecutionSpace space, void*) mutable {
255+
if (action == chai::ACTION_MOVE) {
256+
(void) chai::ManagedArray<int> temp(arrayValue); // Copy constructor triggers movement
257+
}
258+
else if (action == chai::ACTION_FREE && space == chai::NONE) {
259+
temp.free();
260+
}
261+
262+
return false;
263+
});
264+
265+
callVirtualFunction<<<1, 10>>>(myBaseClass);
266+
myBaseClass.free();
267+
// arrayValue.free(); // Not needed anymore
268+
return 0;
269+
}

scripts/make_release_tarball.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
##############################################################################
88

99
TAR_CMD=gtar
10-
VERSION=2.2.1
10+
VERSION=2.2.2
1111

1212
git archive --prefix=chai-${VERSION}/ -o chai-${VERSION}.tar HEAD 2> /dev/null
1313

1414
echo "Running git archive submodules..."
1515

16-
p=`pwd` && (echo .; git submodule foreach) | while read entering path; do
16+
p=`pwd` && (echo .; git submodule foreach --recursive) | while read entering path; do
1717
temp="${path%\'}";
1818
temp="${temp#\'}";
1919
path=$temp;

src/chai/ArrayManager.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,8 +445,11 @@ PointerRecord* ArrayManager::deepCopyRecord(PointerRecord const* record)
445445
copy->m_user_callback = [] (const PointerRecord*, Action, ExecutionSpace) {};
446446

447447
const ExecutionSpace last_space = record->m_last_space;
448-
449448
copy->m_last_space = last_space;
449+
for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
450+
copy->m_allocators[space] = record->m_allocators[space];
451+
}
452+
450453
allocate(copy, last_space);
451454

452455
for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {

src/chai/ArrayManager.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ inline void synchronize() {
7575
#endif
7676
}
7777

78+
#if defined(CHAI_GPUCC)
79+
7880
// wrapper for hip/cuda free
7981
CHAI_HOST inline void gpuFree(void* buffer) {
8082
#if defined (CHAI_ENABLE_HIP)
@@ -111,6 +113,8 @@ CHAI_HOST inline void gpuMemcpy(void* dst, const void* src, size_t count, gpuMe
111113
#endif
112114
}
113115

116+
#endif //#if defined(CHAI_GPUCC)
117+
114118
/*!
115119
* \brief Singleton that manages caching and movement of ManagedArray objects.
116120
*

src/chai/ManagedArray.hpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,8 @@ class ManagedArray : public CHAICopyable
173173
*/
174174
CHAI_HOST void registerTouch(ExecutionSpace space);
175175

176-
CHAI_HOST void move(ExecutionSpace space=NONE) const;
176+
CHAI_HOST void move(ExecutionSpace space=NONE,
177+
bool registerTouch=!std::is_const<T>::value) const;
177178

178179
CHAI_HOST_DEVICE ManagedArray<T> slice(size_t begin, size_t elems=(size_t)-1) const;
179180

@@ -207,6 +208,15 @@ class ManagedArray : public CHAICopyable
207208
*/
208209
CHAI_HOST_DEVICE T* data() const;
209210

211+
/*!
212+
* \brief Move data to the current execution space (actually determined
213+
* by where the code is executing) and return a raw pointer. Do
214+
* not mark data as touched since a pointer to const is returned.
215+
*
216+
* \return Raw pointer to data in the current execution space
217+
*/
218+
CHAI_HOST_DEVICE const T* cdata() const;
219+
210220
/*!
211221
* \brief Return the raw pointer to the data in the given execution
212222
* space. Optionally move the data to that execution space.

0 commit comments

Comments
 (0)