Skip to content

Commit 055dd51

Browse files
author
Your Name
committed
tc comprehension integration Ref. SINGA-482
1 parent 806dbe7 commit 055dd51

File tree

7 files changed

+573
-3
lines changed

7 files changed

+573
-3
lines changed

cmake/Dependencies.cmake

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,47 @@ IF(USE_MKLDNN)
149149
INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
150150
LIST(APPEND SINGA_LINKER_LIBS ${MKLDNN_LIBRARIES})
151151
ENDIF()
152+
153+
154+
### Tensor comprehensions
# NOTE(review): the defaults below are the machine-specific paths from the
# original integration (a TC build under /root). Override TC_ROOT and
# TC_CONDA_PREFIX on the cmake command line to match your environment.
SET(TC_ROOT "/root/TensorComprehensions" CACHE PATH
    "Root of the TensorComprehensions source tree")
SET(TC_CONDA_PREFIX "/root/conda/envs/tc_build" CACHE PATH
    "Conda environment used to build TensorComprehensions")

# the path should be consistent with the include path in src
INCLUDE_DIRECTORIES(${TC_ROOT})
INCLUDE_DIRECTORIES(${TC_ROOT}/tc/version)
INCLUDE_DIRECTORIES(${TC_ROOT}/build)

# polyhedral model required
INCLUDE_DIRECTORIES(${TC_ROOT}/isl_interface/include)

# dlpack
INCLUDE_DIRECTORIES(${TC_ROOT}/third-party/dlpack/include)
# Halide
INCLUDE_DIRECTORIES(${TC_CONDA_PREFIX}/include/Halide)

# llvm
INCLUDE_DIRECTORIES(${TC_CONDA_PREFIX}/include)

# torch ATen header TO DELETE
INCLUDE_DIRECTORIES(${TC_CONDA_PREFIX}/lib/python3.6/site-packages/torch/lib/include)

# find Halide lib
set(HALIDE_PREFIX "${TC_CONDA_PREFIX}")
find_library(HALIDE_LIBRARIES REQUIRED NAMES Halide PATHS ${HALIDE_PREFIX} PATH_SUFFIXES lib lib64 NO_DEFAULT_PATH)
message(STATUS "Found Halide.so file: ${HALIDE_LIBRARIES}")

# find tc lib
link_directories(${TC_ROOT}/build/tc/aten)
link_directories(${TC_ROOT}/build/tc/lang)
link_directories(${TC_ROOT}/build/tc/core)
link_directories(${TC_ROOT}/build/tc/autotuner)
link_directories(${TC_ROOT}/build/tc/proto)

# torch(aten) lib to delete
link_directories(${TC_CONDA_PREFIX}/lib/python3.6/site-packages/torch/lib)

LIST(APPEND SINGA_LINKER_LIBS ${HALIDE_LIBRARIES} tc_aten tc_lang tc_core_cpu tc_cuda tc_core_cuda_no_sdk tc_core tc_autotuner tc_proto ATen)
### Tensor comprehensions end

include/singa/core/tensor.h

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@
2323
#include <tuple>
2424
#include <memory>
2525

26+
#include <dlpack/dlpack.h>
27+
#include <tc/core/tensor.h>
28+
#include <tc/utils/compiler_options.h>
29+
#include <tc/core/compiler.h>
30+
#include <tc/core/utils/time.h>
31+
#include <tc/core/cuda/cuda_backend.h>
32+
#include <tc/core/cuda/cuda_tc_executor.h>
33+
2634
#include "singa/core/common.h"
2735
#include "singa/core/device.h"
2836
#include "singa/proto/core.pb.h"
@@ -147,6 +155,7 @@ class Tensor {
147155

148156
/// Return average L2 norm
149157
float L2() const;
158+
150159
// --------------------------------------------------------------------------
151160
// ---Following methods changes the internal data
152161
// --------------------------------------------------------------------------
@@ -603,6 +612,90 @@ Tensor ConcatRows(const vector<Tensor> &in);
603612
Tensor ConcatenateColumns(const vector<Tensor> &in);
604613
/// Alias name for function ConcatenateColumns
605614
Tensor ConcatColumns(const vector<Tensor> &in);
615+
616+
617+
618+
619+
/// tc integration start

/// Export `src` as a DLPack managed tensor; the result shares memory with
/// `src` and is released by calling its `deleter`.
DLManagedTensor *toDLPack(const Tensor &src);
// Tensor fromDLPack(const DLManagedTensor* src);

/// Build mutable DLPack views of `tensors` for the TC API (defined below).
inline std::vector<tc::DLTensorUPtr>
makeDLTensors(const std::vector<Tensor> &tensors);

/// Compile `entryPoint` of the TC program `tc` for `Backend`, taking input
/// shapes from `inputs`; returns the compiled executor.
template <typename Backend>
std::unique_ptr<typename Backend::ExecutorType>
compileTC(const std::string &tc, const std::string &entryPoint,
          const std::vector<Tensor> &inputs,
          const typename Backend::MappingOptionsType &options,
          const tc::CompilerOptions &compilerOptions = tc::CompilerOptions());

/// Run TC shape inference for `entryPoint` given `inputs`; returns
/// metadata-only descriptors of the outputs.
std::vector<tc::DLTensorUPtr>
inferOutputTensorInfo(const std::string &tc, const std::string &entryPoint,
                      const std::vector<Tensor> &inputs);

/// Allocate singa output tensors matching the inferred output shapes;
/// device and data type are taken from inputs[0].
std::vector<Tensor> prepareOutputs(const std::string &tc,
                                   const std::string &entryPoint,
                                   const std::vector<Tensor> &inputs);

/// Execute a compiled TC executor; `outputs` are written in place through
/// their DLPack views.
template <typename Executor>
void runTC(const Executor &executor, const std::vector<Tensor> &inputs,
           std::vector<Tensor> &outputs);

// tensor comprehension operations
Tensor SoftMaxTC(const Tensor &in);  // row-wise softmax of a (N, D) input
Tensor ReluTC(const Tensor &in);     // elementwise fmax(x, 0)
Tensor MatMulTC(const Tensor &in1, const Tensor &in2);  // (M,N) x (N,K)
Tensor FCTC(const Tensor &x, const Tensor &W, const Tensor &b);  // x*W^T + b
650+
651+
// makeDLConstTensors implementation
652+
inline std::vector<tc::DLConstTensorUPtr>
653+
makeDLConstTensors(const std::vector<Tensor> &tensors) {
654+
std::vector<tc::DLConstTensorUPtr> dlTensors;
655+
for (auto tensor : tensors) {
656+
auto dlMTensor = toDLPack(tensor);
657+
dlTensors.push_back(tc::makeDLConstTensor(&(dlMTensor->dl_tensor)));
658+
dlMTensor->deleter(dlMTensor);
659+
}
660+
return dlTensors;
661+
}
662+
663+
// makeDLTensors implementation
664+
inline std::vector<tc::DLTensorUPtr>
665+
makeDLTensors(const std::vector<Tensor> &tensors) {
666+
std::vector<tc::DLTensorUPtr> dlTensors;
667+
for (auto tensor : tensors) {
668+
auto dlMTensor = toDLPack(tensor);
669+
dlTensors.push_back(tc::makeDLTensor(&(dlMTensor->dl_tensor)));
670+
dlMTensor->deleter(dlMTensor);
671+
}
672+
return dlTensors;
673+
}
674+
675+
// compile implementation
676+
template <typename Backend>
677+
std::unique_ptr<typename Backend::ExecutorType>
678+
compileTC(const std::string &tc, const std::string &entryPoint,
679+
const std::vector<Tensor> &inputs,
680+
const typename Backend::MappingOptionsType &options,
681+
const tc::CompilerOptions &compilerOptions) {
682+
auto inputDLTensors = makeDLConstTensors(inputs);
683+
return tc::compile<Backend>(tc, entryPoint, extractRawPtrs(inputDLTensors),
684+
options, compilerOptions);
685+
}
686+
687+
// run implementation
688+
template <typename Executor>
689+
void runTC(const Executor &executor, const std::vector<Tensor> &inputs,
690+
std::vector<Tensor> &outputs) {
691+
auto inputDLTensors = makeDLConstTensors(inputs);
692+
auto outputDLTensors = makeDLTensors(outputs);
693+
return executor.run(extractRawPtrs(inputDLTensors),
694+
extractRawPtrs(outputDLTensors));
695+
}
696+
697+
/// tc integration end
698+
606699
} // namespace singa
607700

608701
#endif // SINGA_CORE_TENSOR_H_

src/api/core_tensor.i

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,4 +345,26 @@ namespace singa{
345345

346346
Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t);
347347
Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t);
348+
349+
/* ============ Tensor Comprehensions ============ */
350+
/* /root/incubator-singa/build/src/api/singa_wrap.cxx:14938:166: error: use of deleted function */
351+
/* due to the compile error above, this wrapping approach was abandoned
352+
std::vector<Tensor> prepareOutputs(
353+
const std::string& tc,
354+
const std::string& entryPoint,
355+
const std::vector<Tensor>& inputs);
356+
357+
template <typename Executor>
358+
void runTC( const Executor& executor, const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs);
359+
%template(runTCCuda) runTC<tc::CudaTcExecutor>;
360+
361+
template <typename Backend>
362+
std::unique_ptr<typename Backend::ExecutorType> compileTC(
363+
const std::string& tc,
364+
const std::string& entryPoint,
365+
const std::vector<Tensor>& inputs,
366+
const typename Backend::MappingOptionsType& options,
367+
const tc::CompilerOptions& compilerOptions = tc::CompilerOptions());
368+
%template(compileTCCuda) compileTC<tc::CudaBackend>;
369+
*/
348370
}

src/core/tensor/tensor.cc

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,21 @@
2121
#include "./tensor_math_cpp.h"
2222
#include "./tensor_math_cuda.h"
2323
#include "./tensor_math_opencl.h"
24+
2425
#include <utility>
2526
#include <algorithm>
2627

28+
#include <tc/core/check.h>
29+
#include <tc/core/compiler.h>
30+
#include <tc/core/tc_executor.h>
31+
#include <tc/core/tensor.h>
2732

2833
#define Noaxis 9999
2934

35+
// A `lang` namespace already exists in singa; alias TC's `lang` namespace
// to avoid the name collision.
37+
namespace tclang = lang;
38+
3039
namespace singa {
3140

3241
Tensor::~Tensor() {
@@ -1334,4 +1343,183 @@ Tensor Reshape(const Tensor &in, const Shape &s) {
13341343
return out.Reshape(s);
13351344
}
13361345

1346+
1347+
/// tc integration start
1348+
struct SingaDLManagedTensor {
1349+
Tensor handle;
1350+
DLManagedTensor tensor;
1351+
};
1352+
1353+
void deleter(DLManagedTensor *arg) {
1354+
delete static_cast<SingaDLManagedTensor *>(arg->manager_ctx);
1355+
}
1356+
1357+
static DLDataType getDLDataType(const Tensor &t) {
1358+
DLDataType dtype;
1359+
dtype.lanes = 1;
1360+
// TODO: get the number of bytes of the datatype
1361+
// dtype.bits = t.data_type() * 8;
1362+
dtype.bits = 4 * 8;
1363+
switch (t.data_type()) {
1364+
case kFloat32:
1365+
dtype.code = DLDataTypeCode::kDLFloat;
1366+
break;
1367+
default:
1368+
throw std::logic_error("only kFloat32 is supported for dlpack conversion");
1369+
break;
1370+
}
1371+
return dtype;
1372+
}
1373+
1374+
static DLContext getDLContext(const Tensor &tensor, const int64_t &device_id) {
1375+
DLContext ctx;
1376+
ctx.device_id = device_id;
1377+
ctx.device_type = DLDeviceType::kDLGPU;
1378+
// TODO: fix this
1379+
// if (tensor.is_cuda()) {
1380+
// ctx.device_type = DLDeviceType::kDLGPU;
1381+
//} else {
1382+
// ctx.device_type = DLDeviceType::kDLCPU;
1383+
//}
1384+
return ctx;
1385+
}
1386+
1387+
// This function returns a shared_ptr to memory managed DLpack tensor
1388+
// constructed out of ATen tensor
1389+
DLManagedTensor *toDLPack(const Tensor &src) {
1390+
SingaDLManagedTensor *singaDLManagedTensor(new SingaDLManagedTensor);
1391+
singaDLManagedTensor->handle = src;
1392+
singaDLManagedTensor->tensor.manager_ctx = singaDLManagedTensor;
1393+
singaDLManagedTensor->tensor.deleter = &deleter;
1394+
singaDLManagedTensor->tensor.dl_tensor.data = src.block()->mutable_data();
1395+
int64_t device_id = 0;
1396+
// TODO: fix this
1397+
// if (src.is_cuda()) {
1398+
// device_id = src.get_device();
1399+
//}
1400+
singaDLManagedTensor->tensor.dl_tensor.ctx = getDLContext(src, device_id);
1401+
singaDLManagedTensor->tensor.dl_tensor.ndim = src.nDim();
1402+
singaDLManagedTensor->tensor.dl_tensor.dtype = getDLDataType(src);
1403+
1404+
auto shapeVec =
1405+
new std::vector<int64_t>(src.shape().begin(), src.shape().end());
1406+
singaDLManagedTensor->tensor.dl_tensor.shape = shapeVec->data();
1407+
1408+
auto strideVec =
1409+
new std::vector<int64_t>(src.stride().begin(), src.stride().end());
1410+
singaDLManagedTensor->tensor.dl_tensor.strides = strideVec->data();
1411+
1412+
singaDLManagedTensor->tensor.dl_tensor.byte_offset = 0;
1413+
return &(singaDLManagedTensor->tensor);
1414+
}
1415+
1416+
// prepare output
// Parse the TC program `tc`, look up `entryPoint` among the parsed
// definitions, and run TC's shape inference over `inputs`; returns
// metadata-only DLTensor descriptors (no data) for each output.
std::vector<tc::DLTensorUPtr>
inferOutputTensorInfo(const std::string &tc, const std::string &entryPoint,
                      const std::vector<Tensor> &inputs) {
  auto parsedTcs = tc::detail::parse(tc);
  if (parsedTcs.count(entryPoint) != 1u) {
    // Parsing succeeded but the requested entry point is not among the
    // definitions: report the error against the first parsed TC so the
    // message carries source context.
    TC_CHECK_GE(parsedTcs.size(), 1u)
        << "No TC was parsed, should have thrown earlier";
    throw tclang::ErrorReport(parsedTcs.begin()->second)
        << "\nattempting to access undefined entryPoint: " << entryPoint;
  }
  auto inputDLTensors = makeDLConstTensors(inputs);
  return makeDLTensorVector(tc::detail::inferOutputTensorInfo(
      parsedTcs.at(entryPoint), extractRawPtrs(inputDLTensors)));
}
1431+
1432+
std::vector<Tensor> prepareOutputs(const std::string &tc,
1433+
const std::string &entryPoint,
1434+
const std::vector<Tensor> &inputs) {
1435+
std::vector<Tensor> outputs;
1436+
auto outTensorInfo = inferOutputTensorInfo(tc, entryPoint, inputs);
1437+
if (outTensorInfo.size() == 0) {
1438+
return outputs;
1439+
}
1440+
TC_CHECK_GE(inputs.size(), 1u)
1441+
<< "NYI: Need >= 1 input tensors to determine "
1442+
<< "backend and prepare ATen outputs. Add an overload with just an ATen "
1443+
<< "backend";
1444+
1445+
auto dev = inputs[0].device();
1446+
auto dtype = inputs[0].data_type();
1447+
for (size_t i = 0; i < outTensorInfo.size(); ++i) {
1448+
tc::TensorInfo info(outTensorInfo[i]);
1449+
Shape shape(info.shape.begin(), info.shape.end());
1450+
1451+
Tensor tmp(shape, dev, dtype);
1452+
outputs.push_back(tmp);
1453+
}
1454+
return outputs;
1455+
}
1456+
1457+
// examples of TC operations
1458+
Tensor SoftMaxTC(const Tensor &in) {
1459+
std::string tc =
1460+
1461+
R"TC(
1462+
def softmax(float(N, D) I) -> (O, expsum, maxVal) {
1463+
maxVal(n) max=! I(n, d)
1464+
expsum(n) +=! exp(I(n, d) - maxVal(n))
1465+
O(n, d) = exp(I(n, d) - maxVal(n)) / expsum(n)
1466+
}
1467+
)TC";
1468+
auto naiveOptions =
1469+
tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions();
1470+
auto pExecutor =
1471+
singa::compileTC<tc::CudaBackend>(tc, "softmax", {in}, {naiveOptions});
1472+
auto outputs = singa::prepareOutputs(tc, "softmax", {in});
1473+
singa::runTC(*pExecutor, {in}, outputs);
1474+
return outputs[0];
1475+
}
1476+
1477+
Tensor ReluTC(const Tensor &in) {
1478+
std::string tc = R"TC(
1479+
def relu(float(B,M) I) -> (O1){
1480+
O1(b, m) = fmax(I(b, m), 0)
1481+
}
1482+
)TC";
1483+
auto naiveOptions =
1484+
tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions();
1485+
auto pExecutor =
1486+
singa::compileTC<tc::CudaBackend>(tc, "relu", {in}, {naiveOptions});
1487+
auto outputs = singa::prepareOutputs(tc, "relu", {in});
1488+
singa::runTC(*pExecutor, {in}, outputs);
1489+
return outputs[0];
1490+
}
1491+
1492+
Tensor MatMulTC(const Tensor &in1, const Tensor &in2) {
1493+
std::string tc = R"TC(
1494+
def matmul(float(M,N) A, float(N,K) B) -> (output) {
1495+
output(i, j) +=! A(i, kk) * B(kk, j)
1496+
}
1497+
)TC";
1498+
auto naiveOptions =
1499+
tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions();
1500+
auto pExecutor = singa::compileTC<tc::CudaBackend>(tc, "matmul", {in1, in2},
1501+
{naiveOptions});
1502+
auto outputs = singa::prepareOutputs(tc, "matmul", {in1, in2});
1503+
singa::runTC(*pExecutor, {in1, in2}, outputs);
1504+
return outputs[0];
1505+
}
1506+
1507+
Tensor FCTC(const Tensor &x, const Tensor &W, const Tensor &b) {
1508+
std::string tc = R"TC(
1509+
def fc(float(B,M) I, float(N,M) W1, float(N) B1) -> (O1) {
1510+
O1(b, n) +=! I(b, m) * W1(n, m)
1511+
O1(b, n) = O1(b, n) + B1(n)
1512+
}
1513+
)TC";
1514+
auto naiveOptions =
1515+
tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions();
1516+
auto pExecutor =
1517+
singa::compileTC<tc::CudaBackend>(tc, "fc", {x, W, b}, {naiveOptions});
1518+
auto outputs = singa::prepareOutputs(tc, "fc", {x, W, b});
1519+
singa::runTC(*pExecutor, {x, W, b}, outputs);
1520+
return outputs[0];
1521+
}
1522+
/// tc integration end
1523+
1524+
13371525
} // namespace singa

0 commit comments

Comments
 (0)