|
21 | 21 | #include "./tensor_math_cpp.h" |
22 | 22 | #include "./tensor_math_cuda.h" |
23 | 23 | #include "./tensor_math_opencl.h" |
| 24 | + |
24 | 25 | #include <utility> |
25 | 26 | #include <algorithm> |
26 | 27 |
|
| 28 | +#include <tc/core/check.h> |
| 29 | +#include <tc/core/compiler.h> |
| 30 | +#include <tc/core/tc_executor.h> |
| 31 | +#include <tc/core/tensor.h> |
27 | 32 |
|
28 | 33 | #define Noaxis 9999 |
29 | 34 |
|
| 35 | +// singa already uses the name `lang`;
| 36 | +// alias TC's lang namespace to avoid the clash
| 37 | +namespace tclang = lang; |
| 38 | + |
30 | 39 | namespace singa { |
31 | 40 |
|
32 | 41 | Tensor::~Tensor() { |
@@ -1334,4 +1343,183 @@ Tensor Reshape(const Tensor &in, const Shape &s) { |
1334 | 1343 | return out.Reshape(s); |
1335 | 1344 | } |
1336 | 1345 |
|
| 1346 | + |
| 1347 | +/// tc integration start |
| 1348 | +struct SingaDLManagedTensor {
| 1349 | +  Tensor handle;  // keeps the underlying block alive
| | +  // own the shape/stride buffers so they are freed together with the
| | +  // wrapper by deleter() instead of leaking
| | +  std::vector<int64_t> shape;
| | +  std::vector<int64_t> strides;
| 1350 | +  DLManagedTensor tensor;
| 1351 | +};
| 1352 | + |
| 1353 | +void deleter(DLManagedTensor *arg) { |
| 1354 | + delete static_cast<SingaDLManagedTensor *>(arg->manager_ctx); |
| 1355 | +} |
| 1356 | + |
| 1357 | +static DLDataType getDLDataType(const Tensor &t) { |
| 1358 | + DLDataType dtype; |
| 1359 | + dtype.lanes = 1; |
| 1360 | +  // TODO: derive the width from t.data_type(); 32 bits is hard-coded
| 1361 | +  // since only the 4-byte kFloat32 case is handled below
| 1362 | +  dtype.bits = 4 * 8;
| 1363 | + switch (t.data_type()) { |
| 1364 | + case kFloat32: |
| 1365 | + dtype.code = DLDataTypeCode::kDLFloat; |
| 1366 | + break; |
| 1367 | + default: |
| 1368 | + throw std::logic_error("only kFloat32 is supported for dlpack conversion"); |
| 1369 | + break; |
| 1370 | + } |
| 1371 | + return dtype; |
| 1372 | +} |
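| | +// e.g. a kFloat32 tensor maps to DLDataType{kDLFloat, 32 bits, 1 lane}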
| 1373 | + |
| 1374 | +static DLContext getDLContext(const Tensor &tensor, const int64_t &device_id) { |
| 1375 | + DLContext ctx; |
| 1376 | + ctx.device_id = device_id; |
| 1377 | + ctx.device_type = DLDeviceType::kDLGPU; |
| 1378 | +  // TODO: set kDLCPU when the tensor lives on a host device;
| 1379 | +  // kDLGPU is hard-coded for now since only CUDA tensors are supported
| 1384 | + return ctx; |
| 1385 | +} |
| 1386 | + |
| 1387 | +// Returns a pointer to a self-managed DLPack tensor constructed
| 1388 | +// from a singa Tensor; the embedded deleter releases the wrapper
| 1389 | +DLManagedTensor *toDLPack(const Tensor &src) { |
| 1390 | + SingaDLManagedTensor *singaDLManagedTensor(new SingaDLManagedTensor); |
| 1391 | + singaDLManagedTensor->handle = src; |
| 1392 | + singaDLManagedTensor->tensor.manager_ctx = singaDLManagedTensor; |
| 1393 | + singaDLManagedTensor->tensor.deleter = &deleter; |
| 1394 | + singaDLManagedTensor->tensor.dl_tensor.data = src.block()->mutable_data(); |
| 1395 | + int64_t device_id = 0; |
| 1396 | +  // TODO: query the real id from src.device(); GPU 0 is assumed for now
| 1400 | + singaDLManagedTensor->tensor.dl_tensor.ctx = getDLContext(src, device_id); |
| 1401 | + singaDLManagedTensor->tensor.dl_tensor.ndim = src.nDim(); |
| 1402 | + singaDLManagedTensor->tensor.dl_tensor.dtype = getDLDataType(src); |
| 1403 | + |
| 1404 | +  // store shape and strides inside the wrapper so the buffers are
| 1405 | +  // released by deleter() instead of leaking
| 1406 | +  singaDLManagedTensor->shape.assign(src.shape().begin(), src.shape().end());
| 1407 | +  singaDLManagedTensor->tensor.dl_tensor.shape = singaDLManagedTensor->shape.data();
| 1408 | +
| 1409 | +  singaDLManagedTensor->strides.assign(src.stride().begin(), src.stride().end());
| 1410 | +  singaDLManagedTensor->tensor.dl_tensor.strides = singaDLManagedTensor->strides.data();
| 1411 | + |
| 1412 | + singaDLManagedTensor->tensor.dl_tensor.byte_offset = 0; |
| 1413 | + return &(singaDLManagedTensor->tensor); |
| 1414 | +} |
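| | +
| | +// Usage sketch (illustrative; `t` is a hypothetical float32 tensor on a
| | +// CUDA device):
| | +//   DLManagedTensor *dl = toDLPack(t);
| | +//   int64_t rows = dl->dl_tensor.shape[0];  // same as t.shape(0)
| | +//   dl->deleter(dl);  // frees the wrapper and its shape/stride buffers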
| 1415 | + |
| 1416 | +// infer the output tensor metadata of a TC entry point for given inputs
| 1417 | +std::vector<tc::DLTensorUPtr> |
| 1418 | +inferOutputTensorInfo(const std::string &tc, const std::string &entryPoint, |
| 1419 | + const std::vector<Tensor> &inputs) { |
| 1420 | + auto parsedTcs = tc::detail::parse(tc); |
| 1421 | + if (parsedTcs.count(entryPoint) != 1u) { |
| 1422 | + TC_CHECK_GE(parsedTcs.size(), 1u) |
| 1423 | + << "No TC was parsed, should have thrown earlier"; |
| 1424 | + throw tclang::ErrorReport(parsedTcs.begin()->second) |
| 1425 | + << "\nattempting to access undefined entryPoint: " << entryPoint; |
| 1426 | + } |
| 1427 | + auto inputDLTensors = makeDLConstTensors(inputs); |
| 1428 | + return makeDLTensorVector(tc::detail::inferOutputTensorInfo( |
| 1429 | + parsedTcs.at(entryPoint), extractRawPtrs(inputDLTensors))); |
| 1430 | +} |
| 1431 | + |
| 1432 | +std::vector<Tensor> prepareOutputs(const std::string &tc, |
| 1433 | + const std::string &entryPoint, |
| 1434 | + const std::vector<Tensor> &inputs) { |
| 1435 | + std::vector<Tensor> outputs; |
| 1436 | + auto outTensorInfo = inferOutputTensorInfo(tc, entryPoint, inputs); |
| 1437 | + if (outTensorInfo.size() == 0) { |
| 1438 | + return outputs; |
| 1439 | + } |
| 1440 | +  TC_CHECK_GE(inputs.size(), 1u)
| 1441 | +      << "NYI: need >= 1 input tensor to determine the device and "
| 1442 | +      << "prepare singa outputs; add an overload that takes an "
| 1443 | +      << "explicit device instead";
| 1444 | + |
| 1445 | + auto dev = inputs[0].device(); |
| 1446 | + auto dtype = inputs[0].data_type(); |
| 1447 | + for (size_t i = 0; i < outTensorInfo.size(); ++i) { |
| 1448 | + tc::TensorInfo info(outTensorInfo[i]); |
| 1449 | + Shape shape(info.shape.begin(), info.shape.end()); |
| 1450 | + |
| 1451 | + Tensor tmp(shape, dev, dtype); |
| 1452 | + outputs.push_back(tmp); |
| 1453 | + } |
| 1454 | + return outputs; |
| 1455 | +} |
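| | +
| | +// Note: the operations below all follow the same flow:
| | +// compileTC -> prepareOutputs -> runTC. prepareOutputs allocates the
| | +// outputs on the device of inputs[0] with the shapes inferred above.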
| 1456 | + |
| 1457 | +// example operations implemented with TC
| 1458 | +Tensor SoftMaxTC(const Tensor &in) { |
| 1459 | +  std::string tc = R"TC(
| 1462 | +def softmax(float(N, D) I) -> (O, expsum, maxVal) { |
| 1463 | + maxVal(n) max=! I(n, d) |
| 1464 | + expsum(n) +=! exp(I(n, d) - maxVal(n)) |
| 1465 | + O(n, d) = exp(I(n, d) - maxVal(n)) / expsum(n) |
| 1466 | +} |
| 1467 | +)TC"; |
| 1468 | + auto naiveOptions = |
| 1469 | + tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions(); |
| 1470 | + auto pExecutor = |
| 1471 | + singa::compileTC<tc::CudaBackend>(tc, "softmax", {in}, {naiveOptions}); |
| 1472 | + auto outputs = singa::prepareOutputs(tc, "softmax", {in}); |
| 1473 | + singa::runTC(*pExecutor, {in}, outputs); |
| 1474 | + return outputs[0]; |
| 1475 | +} |
| 1476 | + |
| 1477 | +Tensor ReluTC(const Tensor &in) { |
| 1478 | + std::string tc = R"TC( |
| 1479 | +def relu(float(B,M) I) -> (O1){ |
| 1480 | + O1(b, m) = fmax(I(b, m), 0) |
| 1481 | +} |
| 1482 | + )TC"; |
| 1483 | + auto naiveOptions = |
| 1484 | + tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions(); |
| 1485 | + auto pExecutor = |
| 1486 | + singa::compileTC<tc::CudaBackend>(tc, "relu", {in}, {naiveOptions}); |
| 1487 | + auto outputs = singa::prepareOutputs(tc, "relu", {in}); |
| 1488 | + singa::runTC(*pExecutor, {in}, outputs); |
| 1489 | + return outputs[0]; |
| 1490 | +} |
| 1491 | + |
| 1492 | +Tensor MatMulTC(const Tensor &in1, const Tensor &in2) { |
| 1493 | + std::string tc = R"TC( |
| 1494 | +def matmul(float(M,N) A, float(N,K) B) -> (output) { |
| 1495 | + output(i, j) +=! A(i, kk) * B(kk, j) |
| 1496 | +} |
| 1497 | + )TC"; |
| 1498 | + auto naiveOptions = |
| 1499 | + tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions(); |
| 1500 | + auto pExecutor = singa::compileTC<tc::CudaBackend>(tc, "matmul", {in1, in2}, |
| 1501 | + {naiveOptions}); |
| 1502 | + auto outputs = singa::prepareOutputs(tc, "matmul", {in1, in2}); |
| 1503 | + singa::runTC(*pExecutor, {in1, in2}, outputs); |
| 1504 | + return outputs[0]; |
| 1505 | +} |
| 1506 | + |
| 1507 | +Tensor FCTC(const Tensor &x, const Tensor &W, const Tensor &b) { |
| 1508 | + std::string tc = R"TC( |
| 1509 | +def fc(float(B,M) I, float(N,M) W1, float(N) B1) -> (O1) { |
| 1510 | + O1(b, n) +=! I(b, m) * W1(n, m) |
| 1511 | + O1(b, n) = O1(b, n) + B1(n) |
| 1512 | +} |
| 1513 | + )TC"; |
| 1514 | + auto naiveOptions = |
| 1515 | + tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions(); |
| 1516 | + auto pExecutor = |
| 1517 | + singa::compileTC<tc::CudaBackend>(tc, "fc", {x, W, b}, {naiveOptions}); |
| 1518 | + auto outputs = singa::prepareOutputs(tc, "fc", {x, W, b}); |
| 1519 | + singa::runTC(*pExecutor, {x, W, b}, outputs); |
| 1520 | + return outputs[0]; |
| 1521 | +} |
| 1522 | +/// tc integration end |
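| | +
| | +// Usage sketch for the TC-backed operations above (illustrative; assumes
| | +// a CUDA build where singa::CudaGPU is available):
| | +//   auto dev = std::make_shared<singa::CudaGPU>();
| | +//   Tensor x(Shape{4, 5}, dev), w(Shape{5, 3}, dev);
| | +//   Gaussian(0.0f, 1.0f, &x);
| | +//   Gaussian(0.0f, 1.0f, &w);
| | +//   Tensor y = ReluTC(x);       // element-wise max(x, 0)
| | +//   Tensor z = MatMulTC(x, w);  // 4x3 matrix product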
| 1523 | + |
| 1524 | + |
1337 | 1525 | } // namespace singa |