Commit 4b54d3d0 by linye

update Sum, Clip, Div, Negate, ScaleAndShift, MultiplyDim, MatrixMul,…

update Sum, Clip, Div, Negate, ScaleAndShift, MultiplyDim, MatrixMul, LogSoftmax, HardTanH, ReduceSum, ReduceMax, ConvertDataType
parent 2b53f214
......@@ -38,6 +38,16 @@ void SumDimTest();
void SplitBackwardTest();
void MemTest();
void xcTest();
void ConvertDataTypeTest();
void ConvertDataTypeBackwardTest();
void SumFP16Test();
void GatherFP16Test();
void HardTanHFP16Test();
void ReduceMaxFP16Test();
void ReduceSumFP16Test();
void LogSoftmaxFP16Test();
void ClipFP16Test();
void ScaleAndShiftFP16Test();
using namespace nts;
using namespace fnnlm;
......@@ -56,6 +66,28 @@ int main(int argc, const char ** argv )
//return 0;
//Test();
//return 0;
//ConvertDataTypeTest();
//return 0;
//ConvertDataTypeBackwardTest();
//return 0;
//SumFP16Test();
//return 0;
//GatherFP16Test();
//return 0;
//HardTanHFP16Test();
//return 0;
//ReduceMaxFP16Test();
//return 0;
//ReduceSumFP16Test();
//return 0;
//LogSoftmaxFP16Test();
//return 0;
//ClipFP16Test();
//return 0;
//ScaleAndShiftFP16Test();
//return 0;
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
......@@ -74,6 +106,251 @@ int main(int argc, const char ** argv )
return 0;
}
void ScaleAndShiftFP16Test() {
XTensor a;
XTensor intA;
XTensor b;
XTensor intB;
InitTensor2D(&a, 1, 10, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
a.Dump(stderr, "a:");
intA = ConvertDataType(a, X_INT);
intB = ScaleAndShift(intA, 2, 0);
b = ConvertDataType(intB, X_FLOAT);
b.Dump(stderr, "b:");
}
void ClipFP16Test() {
XTensor a;
XTensor intA;
XTensor b;
XTensor intB;
InitTensor2D(&a, 1, 10, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
a.Dump(stderr, "a:");
intA = ConvertDataType(a, X_INT);
intB = Clip(intA, -1, 1);
b = ConvertDataType(intB, X_FLOAT);
b.Dump(stderr, "b:");
}
void LogSoftmaxFP16Test() {
XTensor a;
XTensor halfA;
XTensor b;
XTensor halfB;
InitTensor3D(&a, 2, 2, 2, X_FLOAT, 0);
a.SetDataRand(-1.0F, 1.0F);
halfA = ConvertDataType(a, X_FLOAT16);
b = LogSoftmax(a, 1);
halfB = LogSoftmax(halfA, 1);
b.Dump(stderr, "sum:");
halfB.Dump(&halfB, stderr, "halfSum:");
}
void ReduceSumFP16Test()
{
XTensor a;
XTensor sum;
XTensor halfA;
XTensor halfSum;
InitTensor2D(&a, 10, 10, X_FLOAT, 0);
a.SetDataRand(-5.0F, 5.0F);
halfA = ConvertDataType(a, X_FLOAT16);
sum = ReduceSum(a, 1);
halfSum = ReduceSum(halfA, 1);
sum.Dump(stderr, "sum:");
halfSum.Dump(&halfSum, stderr, "halfSum:");
}
void ReduceMaxFP16Test()
{
XTensor a;
XTensor max;
XTensor halfA;
XTensor halfMax;
InitTensor2D(&a, 10, 10, X_FLOAT, 0);
a.SetDataRand(-5.0F, 5.0F);
halfA = ConvertDataType(a, X_FLOAT16);
max = ReduceMax(a, 1);
halfMax = ReduceMax(halfA, 1);
max.Dump(stderr, "max:");
halfMax.Dump(&halfMax, stderr, "halfMax:");
}
void HardTanHFP16Test()
{
XTensor a;
XTensor b;
XTensor halfA;
XTensor halfB;
InitTensor2D(&a, 5, 5, X_FLOAT, 0);
InitTensor2D(&b, 5, 5, X_FLOAT, 0);
a.SetDataRand(-1.0F, 4.0F);
b.SetDataRand(-1.0F, 4.0F);
halfA = ConvertDataType(a, X_FLOAT16);
halfB = ConvertDataType(b, X_FLOAT16);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
b = HardTanH(a);
halfB = HardTanH(halfA);
b.Dump(stderr, "b:");
halfB.Dump(&halfB, stderr, "halfB:");
}
void GatherFP16Test() {
XTensor a;
XTensor b;
XTensor srcIndex;
XTensor halfA;
XTensor halfB;
XTensor c;
InitTensor1D(&srcIndex, 2, X_INT, 0);
int m = 0;
int n = 1;
srcIndex.Set1DInt(m, 0);
srcIndex.Set1DInt(n, 1);
InitTensor2D(&a, 3, 2, X_FLOAT, 0);
InitTensor2D(&b, 2, 2, X_FLOAT, 0);
InitTensor2D(&halfB, 2, 2, X_FLOAT16, 0);
a.SetDataRand(-5.0F, 5.0F);
halfA = ConvertDataType(a, X_FLOAT16);
a.Dump(stderr, "a:");
_Gather(&a, &b, &srcIndex);
b.Dump(stderr, "b:");
_Gather(&halfA, &halfB, &srcIndex);
c = ConvertDataType(halfB, X_FLOAT);
c.Dump(stderr, "c:");
}
void SumFP16Test()
{
XTensor a;
XTensor b;
XTensor halfA;
XTensor halfB;
InitTensor2D(&a, 5, 5, X_FLOAT, 0);
InitTensor2D(&b, 5, 5, X_FLOAT, 0);
a.SetDataRand(-1.0F, 4.0F);
b.SetDataRand(-1.0F, 4.0F);
halfA = ConvertDataType(a, X_FLOAT16);
halfB = ConvertDataType(b, X_FLOAT16);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
b = Sum(a, b, -0.4F);
halfB = Sum(halfA, halfB, -0.4F);
b.Dump(stderr, "b:");
halfB.Dump(&halfB, stderr, "halfB:");
}
void ConvertDataTypeTest()
{
int rnum = 0;
for (int i = 0; i <= rnum; i++)
{
XTensor a;
InitTensor2D(&a, 2, 2, X_FLOAT, 0);
XTensor halfa;
InitTensor2D(&halfa, 2, 2, X_FLOAT16, 0);
XTensor a1;
InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
a.Dump(stderr, "a:");
halfa = ConvertDataType(a, X_FLOAT16);
a1 = ConvertDataType(halfa, X_FLOAT);
a1.Dump(stderr, "halfa:");
}
}
void ConvertDataTypeBackwardTest()
{
int rnum = 0;
for (int i = 0; i <= rnum; i++)
{
XTensor a;
InitTensor2D(&a, 2, 2, X_FLOAT, 0);
a.SetDataRand(2.0F, 2.0F);
a.Dump(stderr, "a:");
XTensor halfA;
XTensor a1;
halfA = ConvertDataType(a, X_FLOAT16);
a1 = ConvertDataType(halfA, X_FLOAT);
a1.grad = NewTensor(&a1);
a1.grad->SetDataRand(3.0F, 3.0F);
a1.grad->Dump(stderr, "a1.grad:");
XNet testBackward;
printf("1");
testBackward.Backward(a1);
printf("2");
halfA.grad->Dump(stderr, "halfA.grad:");
a.grad->Dump(stderr, "a.grad:");
}
}
XTensor * stack(XList& list, int leadingDim)
{
size_t size = list.count;
......
......@@ -783,7 +783,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(embedding, batchSize, model.eSize, model);
/* generate word embedding of position i:
embedding = input * w */
embedding = input * w */
_MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);
eList.Add(&net.embeddings[i]);
......
......@@ -249,8 +249,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
//lossTensor = CrossEntropy(output, labelOnehot);
float prob = ReduceSumAll(lossTensor);
printf("%f\n", prob);
exit(0);
DTYPE lossLocal = prob / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
......@@ -295,7 +293,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
break;
}
if (step % 10 == 0) {
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
......
......@@ -32,6 +32,8 @@
#ifndef WIN32
#include <sys/time.h>
#include <unistd.h>
#include <stdint.h>
typedef int8_t __int8;
#endif
// the CUDA stuff
......
......@@ -48,6 +48,7 @@
#include "core/math/ScaleAndShift.h"
#include "core/getandset/SetData.h"
#include "function/Identity.h"
#include "core/getandset/ConvertDataType.h"
#ifdef USE_CUDA
......@@ -1764,9 +1765,22 @@ dump data to a file
*/
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, beg, verbose);
if (tensor->dataType == X_FLOAT)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else if (tensor->dataType == X_FLOAT16)
{
XTensor a(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
_ConvertDataType(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else
{
ShowNTErrors("TO DO!");
}
}
/*
......
......@@ -17,6 +17,7 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-05 float16 added
*/
#include "../../XDevice.h"
......@@ -34,8 +35,9 @@ division of data arrays in a element-wise manner c(i) = a(i)/b(i)
>> c - result data array
>> size - size of c
*/
template <class T>
__global__
void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
void KernelDivElementWise(T * a, T * b, T * c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -51,8 +53,9 @@ division of data arrays in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i)
>> size - size of c
>> alpha - the coefficient
*/
template <class T>
__global__
void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
void KernelDivElementWiseV2(T * a, T * b, T * c, int size, T alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -75,13 +78,13 @@ where |a_lead| means the size of the leading dimension of a
>> ldSizeC - size of the leading dimension of c
>> blockNum - number of blocks
*/
template<int nonZeroAlpha> __global__
void KernelDivElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha,
template<class T, int nonZeroAlpha> __global__
void KernelDivElementWiseTensorDynamic(T * a, T * b, T * c, T alpha,
int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
{
__shared__ DTYPE* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ T* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ T* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ T* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -169,17 +172,48 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (alpha == 0) {
KernelDivElementWiseTensorDynamic<0> << <blocks, threads >> >
KernelDivElementWiseTensorDynamic<DTYPE, 0> << <blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
else {
KernelDivElementWiseTensorDynamic<1> << <blocks, threads >> >
KernelDivElementWiseTensorDynamic<DTYPE, 1> << <blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
}
}
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
int cudaGridSize[3];
int cudaBlockSize[3];
half alpha1 = __float2half(alpha);
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelDivElementWise << <blocks, threads >> > ((__half *)a->data, (__half *)b->data, (__half *)c->data, c->unitNum);
else
KernelDivElementWiseV2 << <blocks, threads >> > ((__half *)a->data, (__half *)b->data, (__half *)c->data, c->unitNum, alpha1);
}
else {
GDevs.GetCudaThread2D(c->devID, stride * blockNum, dimensionSizeC, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (alpha == 0) {
KernelDivElementWiseTensorDynamic<__half, 0> << <blocks, threads >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data, 0,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
else {
KernelDivElementWiseTensorDynamic<__half, 1> << <blocks, threads >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data, alpha1,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
......
......@@ -29,16 +29,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* division of two tensors in a element-wise manner c(i) = a(i)/b(i) */
__global__
void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);
template<class T> __global__
void KernelDivElementWise(T * a, T * b, T * c, int size);
/* division of two tensors in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i) */
__global__
void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);
template<class T> __global__
void KernelDivElementWiseV2(T * a, T * b, T * c, int size, T alpha);
/* division of two tensors in a element-wise manner c(i) = a(i)/b(i)+ \alpha*c(i) */
template<int nonZeroAlpha>__global__
void KernelDivElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
template<class T, int nonZeroAlpha>__global__
void KernelDivElementWiseTensorDynamic(T * a, T * b, T * c, T alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
/* element-wise division of two tensors */
void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
......
......@@ -17,6 +17,7 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
*/
#include "DivDim.cuh"
......@@ -168,6 +169,34 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
half alpha1 = __float2half(alpha);
if (stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
else
KernelDivWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
}
else if (stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
else
KernelDivWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
......
......@@ -54,8 +54,6 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors(a && b && c, "Empty input tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Input tensors should have the same data type!");
CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
"Input tensors must have a order >= 2!");
CheckNTErrors(c->order == a->order + b->order - 2, "wrong tensor order")
......@@ -302,6 +300,63 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
return c;
}
/*
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether teh matrices in b are transposed
>> dataType - indicates what datatype is needed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
TENSOR_DATA_TYPE dataType, DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
......
......@@ -59,6 +59,9 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
TENSOR_DATA_TYPE dataType, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
......
......@@ -169,6 +169,34 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
half alpha1 = __float2half(alpha);
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == 0.0F)
KernelMultiplyWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
else
KernelMultiplyWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == 0.0F)
KernelMultiplyWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
else
KernelMultiplyWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
......
......@@ -33,8 +33,9 @@ set each entry to its negtive value (CUDA Kernel)
>> b - pointer to the output data array
>> size - size of the data array
*/
template <class T>
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size)
void KernelNegate(T * a, T * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -42,26 +43,6 @@ void KernelNegate(DTYPE * a, DTYPE * b, int size)
b[i] = -a[i];
}
/*
set each entry to its negtive value (CUDA Kernel)
This is for float16 computation
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
b[i] = __hsub(__float2half(0), a[i]);
#else
if (i < size)
b[i] = __float2half(-__half2float(a[i]));
#endif
}
/*
set each entry to its negtive value
......
......@@ -29,12 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its negtive value (CUDA Kernel) */
template <class T>
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size);
/* set each entry to its negtive value (CUDA Kernel) with float16 data type*/
__global__
void KernelNegate(__half * a, __half * b, int size);
void KernelNegate(T * a, T * b, int size);
/* set each entry to its negtive value */
void _CudaNegate(const XTensor * a, XTensor * b);
......
......@@ -17,6 +17,7 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-02 float16/int/int8 added
*/
#include "../../XDevice.h"
......@@ -36,13 +37,25 @@ c = a + b * \beta
>> size - the size of a/b/c
>> beta - the coefficient
*/
template <class T>
__global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
void KernelADD(T * a, T * b, T * c, int size, T beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * beta;
}
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
/*
......@@ -61,6 +74,11 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
"Unmatched tensors in addition!");
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The tensors must be on the same!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE && c->dataType == DEFAULT_DTYPE) ||
(a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16 && c->dataType == X_FLOAT16) ||
(a->dataType == X_INT && b->dataType == X_INT && c->dataType == X_INT) ||
(a->dataType == X_INT8 && b->dataType == X_INT8 && c->dataType == X_INT8),
"The sum function does not support this datatype.");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
......@@ -100,6 +118,46 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
}
else if (a->dataType == X_FLOAT16 &&
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
half beta1 = __float2half(beta);
KernelADD << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta1);
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int beta1 = (int)beta;
KernelADD << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta1);
}
else if (a->dataType == X_INT8 &&
b->dataType == X_INT8 &&
c->dataType == X_INT8)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
__int8 beta1 = (__int8)beta;
KernelADD << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, (__int8 *)c->data, a->unitNum, beta1);
}
else {
// TODO!!
ShowNTErrors("TODO!");
......
......@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of data arrays (CUDA Kernel) */
__global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
template <class T> __global__
void KernelADD(T * a, T * b, T * c, int size, T beta = (T)1.0);
/* tensor summation c = a + b * \beta (cuda version) */
void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
......
......@@ -67,7 +67,49 @@ void KernelIntToFloat(int * inputData, float * outputData, int size)
if (i < size){
outputData[i] = (float)(inputData[i]);
}}
}
}
__global__
void KernelFloatToInt8(float * inputData, __int8 * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (__int8)(inputData[i]);
}
}
__global__
void KernelInt8ToFloat(__int8 * inputData, float * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (float)(inputData[i]);
}
}
__global__
void KernelIntToInt8(int * inputData, __int8 * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (__int8)(inputData[i]);
}
}
__global__
void KernelInt8ToInt(__int8 * inputData, int * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (int)(inputData[i]);
}
}
/*
data conversion (cuda code)
......@@ -138,6 +180,14 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_INT8)
KernelFloatToInt8 << <blocks, threads >> >((float*)input->data, (__int8*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_FLOAT)
KernelInt8ToFloat << <blocks, threads >> >((__int8*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_INT && output->dataType == X_INT8)
KernelIntToInt8 << <blocks, threads >> >((int*)input->data, (__int8*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_INT)
KernelInt8ToInt << <blocks, threads >> >((__int8*)input->data, (int*)output->data, input->unitNum);
else{
ShowNTErrors("Unsupported data types for conversion!");
}
......
......@@ -245,7 +245,6 @@ void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len < tensor->GetDim(dim), "Illegal length!");
......@@ -298,7 +297,6 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
int order = source->order;
int size = source->GetDim(dim);
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
CheckNTErrors(index >= 0 && index < size, "Illegal index!");
......
......@@ -17,6 +17,7 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int/int8 added
*/
#include "../../XDevice.h"
......@@ -35,34 +36,20 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
template <class T>
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
void KernelClip(T * a, T * b, T lower, T upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (a[i] > upper)
b[i] = upper;
else if (a[i] < lower)
b[i] = lower;
else
b[i] = a[i];
}
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
/*
set each entry to its clip value with float16 data type value (CUDA Kernel)
This is for float16 computation
>> a - pointer to input data array
>> b - pointer to output data array
>> lower - the lower border
>> upper - the upper border
>> size - size of the data array
*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
{
return;
if (i < size) {
if (a[i] > upper)
b[i] = upper;
else if (a[i] < lower)
b[i] = lower;
else
b[i] = a[i];
}
}
/*
......@@ -88,12 +75,27 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
}
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
half lower1 = __float2half(lower);
half upper1 = __float2half(upper);
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_INT8) {
__int8 lower1 = (__int8)lower;
__int8 upper1 = (__int8)upper;
KernelClip << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, lower1, upper1, a->unitNum);
}
else {
ShowNTErrors("TODO!");
}
......
......@@ -29,12 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
template <class T>
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size);
void KernelClip(T * a, T * b, T lower, T upper, int size);
/* set each entry to its clip value */
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
......
......@@ -17,6 +17,7 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int added
*/
#include "ScaleAndShift.cuh"
......@@ -34,9 +35,9 @@ scale and shift all tensor entires b = a * scale + shift (CUDA Kernel)
>> scale - how much we want to scale it
>> shift - how much we want to shift it
*/
template<bool isUnitScale, bool isZeroShift>
template<class T, bool isUnitScale, bool isZeroShift>
__global__
void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -56,28 +57,6 @@ void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shif
}
}
/*
scale and shift all tensor entires p = p * scale + shift (CUDA Kernel)
This is for float16 computation
>> a - the input data array
>> b - the output data array
>> size - the size of d
>> scale - how much we want to scale it
>> shift - how much we want to shift it
*/
__global__
void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if(i < size)
b[i] = __hadd(__hmul(a[i], scale), shift);
#else
if (i < size)
b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
#endif
}
/*
scale and shift all tensor entires
......@@ -108,20 +87,52 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
if(a->dataType == DEFAULT_DTYPE){
if(scale == 1.0F && shift == 0)
KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if(scale != 1.0F && shift == 0)
KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else
KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
}
else if(a->dataType == X_FLOAT16){
unsigned short scale2 = FloatToFloat16(scale);
unsigned short shift2 = FloatToFloat16(shift);
__half * scaleft16p = (__half*)&scale2;
__half * shiftft16p = (__half*)&shift2;
KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, *scaleft16p, *shiftft16p);
half scale1 = __float2half(scale);
half shift1 = __float2half(shift);
if (scale == 1.0F && shift == 0)
KernelScaleAndShift<__half, true, true><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<__half, true, false><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
else if (scale != 1.0F && shift == 0)
KernelScaleAndShift<__half, false, true><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
else
KernelScaleAndShift<__half, false, false> << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
}
else if (a->dataType == X_INT) {
int scale2 = int(scale);
int shift2 = int(shift);
if (scale == 1.0F && shift == 0)
KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale != 1.0F && shift == 0)
KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else
KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
}
else if (a->dataType == X_INT8) {
__int8 scale2 = __int8(scale);
__int8 shift2 = __int8(shift);
if (scale == 1.0F && shift == 0)
KernelScaleAndShift<__int8, true, true> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<__int8, true, false> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
else if (scale != 1.0F && shift == 0)
KernelScaleAndShift<__int8, false, true> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
else
KernelScaleAndShift<__int8, false, false> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
}
else{
ShowNTErrors("TODO!");
......
......@@ -37,11 +37,12 @@ gather indexed sub-tensors(cuda version)
>> indexSize - the size of the srcIndex
>> stride - stride of a data block
*/
template <class T>
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
void KernelGather(T * sData, T * tData, int * sIndex, int indexSize, int stride)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ T * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ T * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -49,18 +50,18 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= indexSize || offset >= stride)
if (i >= indexSize || offset >= stride)
return;
if(threadIdx.y == 0){
if (threadIdx.y == 0) {
sp[threadIdx.x] = sData + sIndex[i] * stride;
tp[threadIdx.x] = tData + i * stride;
}
__syncthreads();
DTYPE * s = sp[threadIdx.x];
DTYPE * t = tp[threadIdx.x];
T * s = sp[threadIdx.x];
T * t = tp[threadIdx.x];
t[offset] = s[offset];
}
......@@ -74,6 +75,10 @@ gather indexed sub-tensors(cuda version)
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{
CheckNTErrors((s->dataType == DEFAULT_DTYPE && t->dataType == DEFAULT_DTYPE) ||
(s->dataType == X_FLOAT16 && t->dataType == X_FLOAT16),
"The gather function does not support this datatype.");
int devID = s->devID;
XMem * mem = s->mem;
......@@ -91,9 +96,6 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = NULL;
if (srcIndex->devID < 0) {
......@@ -105,7 +107,20 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
else
sIndex = (int *)srcIndex->data;
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (s->dataType == DEFAULT_DTYPE && t->dataType == DEFAULT_DTYPE) {
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
KernelGather<<<blocks, threads>>>(sData, tData, sIndex, indexSize, stride);
}
else if (s->dataType == X_FLOAT16 && t->dataType == X_FLOAT16) {
half * sData = (half*)s->data;
half * tData = (half*)t->data;
KernelGather<<<blocks, threads>>>(sData, tData, sIndex, indexSize, stride);
}
else {
//TODO!
ShowNTErrors("TODO!");
}
if (srcIndex->devID < 0) {
if(mem != NULL)
......
......@@ -17,6 +17,7 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-03 float16 added
*/
#include "../../XDevice.h"
......@@ -503,6 +504,9 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors((input->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE) ||
(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT16),
"The reduce max function does not support this datatype.");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
......@@ -543,7 +547,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10) {
if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
......
......@@ -17,6 +17,7 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-03 float16 added
*/
#include "../../XDevice.h"
......@@ -735,7 +736,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10) {
if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
......@@ -751,10 +752,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
else if (stride != 1 && stride * blockNum > 4096 && input->dataType == DEFAULT_DTYPE){
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
......
......@@ -17,6 +17,7 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16 added
*/
#include "HardTanH.h"
......@@ -38,17 +39,18 @@ y = 1 if x > 1
>> y - output data array
>> size - size of input/output
*/
__global__
void KernelHardtanhCompute(DTYPE * x, DTYPE * y, int size)
template <class T>
__global__
void KernelHardtanhCompute(T * x, T * y, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
DTYPE p = x[i];
if(p > (DTYPE)1.0)
p = (DTYPE)1.0;
else if(p < (DTYPE)-1.0)
p = (DTYPE)-1.0;
if (i < size) {
T p = x[i];
if (p >(T)1.0)
p = (T)1.0;
else if (p < (T)-1.0)
p = (T)-1.0;
y[i] = p;
}
}
......@@ -63,25 +65,31 @@ y = 1 if x > 1
*/
void _CudaHardTanH(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
CheckNTErrors((x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) ||
(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16),
"The hard tanh activation function does not support this datatype.");
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else{
else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((__half *)x->data, (__half *)y->data, x->unitNum);
}
else {
//TODO!
ShowNTErrors("TODO!");
}
BacktoCudaDev(x->devID, devIDBackup);
}
/*
......@@ -97,14 +105,15 @@ dy/dx = 1 if -1 <= x <= 1
>> x - x of the function
>> size - size of y/x
*/
template <class T>
__global__
void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
void KernelHardtanhBackward(T * dedy, T * dedx, T * gold, T * y, T * x, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
DTYPE s = x[i];
if(s > (DTYPE)1.0 || s < (DTYPE)-1.0)
T s = x[i];
if(s > (T)1.0 || s < (T)-1.0)
dedx[i] = 0;
else
dedx[i] = dedy[i];
......@@ -134,21 +143,24 @@ void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(((x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) ||
(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16)),
"Input vectors are not in default type.");
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_CudaLossBackward(dedy, gold, y, lossName);
/* calculate dE/dy */
if (lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if (lossName != NOLOSS)
_CudaLossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* dE/dx = dE/dy * dy/dx */
KernelHardtanhBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
......@@ -156,11 +168,18 @@ void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
gold == NULL ? NULL : (DTYPE*)gold->data,
(DTYPE*)y->data, (DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
/* dE/dx = dE/dy * dy/dx */
KernelHardtanhBackward<<<dim3(gridSize[0]), dim3(blockSize[0])>>>
((half*)dedy->data,
(half*)dedx->data,
gold == NULL ? NULL : (half*)gold->data,
(half*)y->data, (half*)x->data,
x->unitNum);
}
BacktoCudaDev(x->devID, devIDBackup);
}
#endif
......
......@@ -50,121 +50,136 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
}
int leadDimRDI = x->order - leadDim - 1;
if (!x->isSparse && !y->isSparse &&
x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
int * dimSize = new int[x->order - 1];
for (int i = 0; i < x->order; i++) {
if (i < leadDim)
dimSize[i] = -x->dimSize[i];
else if (i > leadDim)
dimSize[i - 1] = -x->dimSize[i];
}
int * dimSize = new int[x->order - 1];
for (int i = 0; i < x->order; i++) {
if (i < leadDim)
dimSize[i] = -x->dimSize[i];
else if (i > leadDim)
dimSize[i - 1] = -x->dimSize[i];
}
XMem * mem = x->mem;
XTensor * max = NULL;
XTensor * sum = NULL;
XTensor * blockx = NULL;
XTensor * blocky = NULL;
XTensor * blockMax = NULL;
XTensor * blockSum = NULL;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
_ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) {
if(leadDimRDI == 0){
blockSize = y->unitNum;
blockNum = 1;
blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
blocky = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
blockMax = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
blockSum = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
}
else{
blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
}
XMem * mem = x->mem;
XTensor * max = NULL;
XTensor * sum = NULL;
XTensor * blockx = NULL;
XTensor * blocky = NULL;
XTensor * blockMax = NULL;
XTensor * blockSum = NULL;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
_ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) {
if(leadDimRDI == 0){
blockSize = y->unitNum;
blockNum = 1;
blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
blocky = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
blockMax = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
blockSum = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
}
else{
blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
}
}
for (int k = 0; k < blockNum; k++) {
int m = stride;
int n = dimensionSize;
for (int k = 0; k < blockNum; k++) {
int m = stride;
int n = dimensionSize;
if (x->devID < 0) {
DTYPE * ip = (DTYPE*)x->data + k * blockSize;
DTYPE * op = (DTYPE*)y->data + k * blockSize;
DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
if (x->devID < 0) {
for (int j = 0; j < m; j++) {
DTYPE sumValue = sp[j];
if (sumValue == 0) {
for (int i = 0; i < n; i++)
op[i * m + j] = 0;
}
else {
for (int i = 0; i < n; i++) {
DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
if (IsNAN(r))
r = LOGPROB_MIN;
if (IsINF(r))
r = LOGPROB_MIN;
op[i * m + j] = MAX(r, LOGPROB_MIN);
}
for (int j = 0; j < m; j++) {
DTYPE sumValue = sp[j];
if (sumValue == 0) {
for (int i = 0; i < n; i++)
op[i * m + j] = 0;
}
else {
for (int i = 0; i < n; i++) {
DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
if (IsNAN(r))
r = LOGPROB_MIN;
if (IsINF(r))
r = LOGPROB_MIN;
op[i * m + j] = MAX(r, LOGPROB_MIN);
}
}
}
}
else {
if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
DTYPE * ip = (DTYPE*)x->data + k * blockSize;
DTYPE * op = (DTYPE*)y->data + k * blockSize;
DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
blockx->data = ip;
blocky->data = op;
blockMax->data = mp;
blockSum->data = sp;
}
else {
half * ip = (half*)x->data + k * blockSize;
half * op = (half*)y->data + k * blockSize;
half * mp = (half*)max->data + k * blockSize / dimensionSize;
half * sp = (half*)sum->data + k * blockSize / dimensionSize;
blockx->data = ip;
blocky->data = op;
blockMax->data = mp;
blockSum->data = sp;
}
#ifdef USE_CUDA
if(leadDimRDI == 0)
_CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
else
_CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
if (leadDimRDI == 0)
_CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
else
_CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
blockx->data = NULL;
blocky->data = NULL;
blockMax->data = NULL;
blockSum->data = NULL;
}
blockx->data = NULL;
blocky->data = NULL;
blockMax->data = NULL;
blockSum->data = NULL;
}
}
DelTensorBuf(max);
DelTensorBuf(sum);
if (x->devID >= 0) {
delete blockx;
delete blocky;
delete blockMax;
delete blockSum;
}
DelTensorBuf(max);
DelTensorBuf(sum);
delete[] dimSize;
if (x->devID >= 0) {
delete blockx;
delete blocky;
delete blockMax;
delete blockSum;
}
else
ShowNTErrors("TODO!");
delete[] dimSize;
}
/*
......
......@@ -222,8 +222,9 @@ backward compuation for squared error (Cuda kernel)
>> y - model output (in vector)
>> size - size of the vector (dedy)
*/
template <class T>
__global__
void KernelLossBackwardSquaredError(DTYPE * dedy, DTYPE * t, DTYPE * y, int size)
void KernelLossBackwardSquaredError(T * dedy, T * t, T * y, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -242,8 +243,9 @@ backward compuation of blocks for squared error (Cuda kernel)
>> lenInBlock - number of items in a block for computation
>> size - size of the vector (dedy)
*/
template <class T>
__global__
void KernelLossBackwardSquaredErrorBlock(DTYPE * dedy, DTYPE * t, DTYPE * y,
void KernelLossBackwardSquaredErrorBlock(T * dedy, T * t, T * y,
int blockSize, int begInBlock, int lenInBlock, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -265,8 +267,9 @@ backward compuation for cross entropy (Cuda kernel)
>> y - model output (in vector)
>> size - size of the vector (dedy)
*/
template <class T>
__global__
void KernelLossBackwardCrossEntropy(DTYPE * dedy, DTYPE * t, DTYPE * y, int tBeg, int tLen, int yBeg, int blockNum, int stride, int dimensionSize)
void KernelLossBackwardCrossEntropy(T * dedy, T * t, T * y, int tBeg, int tLen, int yBeg, int blockNum, int stride, int dimensionSize)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i > stride * dimensionSize * blockNum)
......@@ -297,8 +300,9 @@ backward compuation for cross entropy (Cuda kernel)
>> lenInBlock - number of items in a block for computation
>> size - size of the vector (dedy)
*/
template <class T>
__global__
void KernelLossBackwardCrossEntropyBlock(DTYPE * dedy, DTYPE * t, DTYPE * y,
void KernelLossBackwardCrossEntropyBlock(T * dedy, T * t, T * y,
int blockSize, int begInBlock, int lenInBlock, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -337,14 +341,8 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
CheckNTErrors(((dedy->devID == t->devID) && (dedy->devID == y->devID)),
"Tensor must be on the same device!");
CheckNTErrors((t->order > leadDim), "Illegal leading dimension!");
CheckNTErrors((t->dataType == DEFAULT_DTYPE &&
y->dataType == DEFAULT_DTYPE &&
dedy->dataType == DEFAULT_DTYPE),
"Input vectors are not in default type.");
CheckNTErrors((dedy->devID >= 0 && t->devID >= 0 && y->devID >= 0),
"The backward compuation must be performed on GPUs.");
CheckNTErrors((dedy->devID == t->devID && dedy->devID == y->devID),
"The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!");
......@@ -376,51 +374,105 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
dim3 blocks(cudaGridSize[0]);
dim3 threads(cudaBlockSize[0]);
DTYPE * tp = (DTYPE*)t->data;
DTYPE * yp = (DTYPE*)y->data;
DTYPE * dedyp = (DTYPE*)dedy->data;
int devIDBackup;
ProtectCudaDev(y->devID, devIDBackup);
/*
squared error
loss = sum_{i} 0.5*(t_i - y_i)^2, where t_i is the gold standard and y_i is the model output
dloss/dy_i = y_i - t_i
*/
if(LFName == SQUAREDERROR){
if(t->isSparse){
ShowNTErrors("TODO!");
}
else if(size == y->unitNum){
KernelLossBackwardSquaredError<<<blocks, threads>>>(dedyp, tp, yp, y->unitNum);
}
else{
KernelLossBackwardSquaredErrorBlock<<<blocks, threads>>>(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
if (t->dataType == DEFAULT_DTYPE &&
y->dataType == DEFAULT_DTYPE &&
dedy->dataType == DEFAULT_DTYPE) {
DTYPE * tp = (DTYPE*)t->data;
DTYPE * yp = (DTYPE*)y->data;
DTYPE * dedyp = (DTYPE*)dedy->data;
int devIDBackup;
ProtectCudaDev(y->devID, devIDBackup);
/*
squared error
loss = sum_{i} 0.5*(t_i - y_i)^2, where t_i is the gold standard and y_i is the model output
dloss/dy_i = y_i - t_i
*/
if (LFName == SQUAREDERROR) {
if (t->isSparse) {
ShowNTErrors("TODO!");
}
else if (size == y->unitNum) {
KernelLossBackwardSquaredError << <blocks, threads >> >(dedyp, tp, yp, y->unitNum);
}
else {
KernelLossBackwardSquaredErrorBlock << <blocks, threads >> >(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
}
}
}
/*
cross entropy
loss = sum_{i} (-t_i * log(y_i)), where t and y are distributions
dloss/dy_i = -t_i / y_i
*/
else if(LFName == CROSSENTROPY){
if(t->isSparse){
ShowNTErrors("TODO!");
}
else if(size == y->unitNum){
KernelLossBackwardCrossEntropy<<<blocks, threads>>>(dedyp, tp, yp, tBeg, tLen, yBeg, blockNum, stride, dimensionSize);
/*
cross entropy
loss = sum_{i} (-t_i * log(y_i)), where t and y are distributions
dloss/dy_i = -t_i / y_i
*/
else if (LFName == CROSSENTROPY) {
if (t->isSparse) {
ShowNTErrors("TODO!");
}
else if (size == y->unitNum) {
KernelLossBackwardCrossEntropy << <blocks, threads >> >(dedyp, tp, yp, tBeg, tLen, yBeg, blockNum, stride, dimensionSize);
}
else {
KernelLossBackwardCrossEntropyBlock << <blocks, threads >> >(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
}
}
BacktoCudaDev(y->devID, devIDBackup);
}
else if (t->dataType == X_FLOAT16 &&
y->dataType == X_FLOAT16 &&
dedy->dataType == X_FLOAT16) {
half * tp = (half*)t->data;
half * yp = (half*)y->data;
half * dedyp = (half*)dedy->data;
int devIDBackup;
ProtectCudaDev(y->devID, devIDBackup);
/*
squared error
loss = sum_{i} 0.5*(t_i - y_i)^2, where t_i is the gold standard and y_i is the model output
dloss/dy_i = y_i - t_i
*/
if (LFName == SQUAREDERROR) {
if (t->isSparse) {
ShowNTErrors("TODO!");
}
else if (size == y->unitNum) {
KernelLossBackwardSquaredError << <blocks, threads >> >(dedyp, tp, yp, y->unitNum);
}
else {
KernelLossBackwardSquaredErrorBlock << <blocks, threads >> >(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
}
}
else{
KernelLossBackwardCrossEntropyBlock<<<blocks, threads>>>(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
/*
cross entropy
loss = sum_{i} (-t_i * log(y_i)), where t and y are distributions
dloss/dy_i = -t_i / y_i
*/
else if (LFName == CROSSENTROPY) {
if (t->isSparse) {
ShowNTErrors("TODO!");
}
else if (size == y->unitNum) {
KernelLossBackwardCrossEntropy << <blocks, threads >> >(dedyp, tp, yp, tBeg, tLen, yBeg, blockNum, stride, dimensionSize);
}
else {
KernelLossBackwardCrossEntropyBlock << <blocks, threads >> >(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
}
}
BacktoCudaDev(y->devID, devIDBackup);
}
else{
ShowNTErrors("TODO");
}
BacktoCudaDev(y->devID, devIDBackup);
}
#endif
......
......@@ -17,11 +17,13 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16/int/int8 added
*/
#include "../XTensor.h"
#include "../core/math/Clip.h"
#include "TClip.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -116,6 +118,251 @@ bool TestClip1()
#endif // USE_CUDA
}
/*
case 2: float16 test Clip function.
Set every entry to its clip value.
*/
bool TestClip2()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor aMeHalfGPU;
XTensor bUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
aMeHalfGPU = ConvertDataType(*aMeGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
/* call clip function */
_Clip(&aHalfGPU, &bHalfGPU, -1.0, 1.0);
_ClipMe(&aMeHalfGPU, -1.0, 1.0);
bUserHalfGPU = Clip(aHalfGPU, -1.0, 1.0);
/* convert data type from float16 to float */
_ConvertDataType(&bHalfGPU, bGPU);
_ConvertDataType(&aMeHalfGPU, aMeGPU);
bUserGPU = ConvertDataType(bUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: int32 test Clip function.
Set every entry to its clip value.
*/
bool TestClip3()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create int32 tensor */
XTensor aInt32GPU;
XTensor bInt32GPU;
XTensor aMeInt32GPU;
XTensor bUserInt32GPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to int32 */
aInt32GPU = ConvertDataType(*aGPU, X_INT);
aMeInt32GPU = ConvertDataType(*aMeGPU, X_INT);
bInt32GPU = ConvertDataType(*bGPU, X_INT);
/* call clip function */
_Clip(&aInt32GPU, &bInt32GPU, -1.0, 1.0);
_ClipMe(&aMeInt32GPU, -1.0, 1.0);
bUserInt32GPU = Clip(aInt32GPU, -1.0, 1.0);
/* convert data type from int32 to float */
_ConvertDataType(&bInt32GPU, bGPU);
_ConvertDataType(&aMeInt32GPU, aMeGPU);
bUserGPU = ConvertDataType(bUserInt32GPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: int8 test Clip function.
Set every entry to its clip value.
*/
bool TestClip4()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create int8 tensor */
XTensor aInt8GPU;
XTensor bInt8GPU;
XTensor aMeInt8GPU;
XTensor bUserInt8GPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to int8 */
aInt8GPU = ConvertDataType(*aGPU, X_INT8);
aMeInt8GPU = ConvertDataType(*aMeGPU, X_INT8);
bInt8GPU = ConvertDataType(*bGPU, X_INT8);
/* call clip function */
_Clip(&aInt8GPU, &bInt8GPU, -1.0, 1.0);
_ClipMe(&aMeInt8GPU, -1.0, 1.0);
bUserInt8GPU = Clip(aInt8GPU, -1.0, 1.0);
/* convert data type from int8 to float */
_ConvertDataType(&bInt8GPU, bGPU);
_ConvertDataType(&aMeInt8GPU, aMeGPU);
bUserGPU = ConvertDataType(bUserInt8GPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -137,6 +384,36 @@ bool TestClip()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestClip2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestClip3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestClip4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,6 +17,7 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 int8 added
*/
#include "TConvertDataType.h"
......@@ -26,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test ConvertDataType function.
In this case, the flaot32 data type is converted to int32 data type.
In this case, the float32 data type is converted to int32 data type.
*/
bool TestConvertDataType1()
......@@ -177,7 +178,7 @@ bool TestConvertDataType2()
/*
case 3: test ConvertDataType function.
In this case, the float data type is converted to float16 data type.
In this case, the float32 data type is converted to float16 data type.
*/
bool TestConvertDataType3()
{
......@@ -290,6 +291,130 @@ bool TestConvertDataType3()
#endif // USE_CUDA
}
/*
case 4: test ConvertDataType function.
In this case, the float32 data type is converted to int8 data type.
*/
bool TestConvertDataType4()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, 2.0F},
{0.5F, 4.0F},
{5.0F, 6.0F} };
int answer[3][2] = { {1, 2},
{0, 4},
{5, 6} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT8, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * dGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call ConvertDataType function */
_ConvertDataType(aGPU, bGPU);
_ConvertDataType(bGPU, cGPU);
_ConvertDataType(cGPU, dGPU);
/* check results */
gpuTest = dGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete dGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: test ConvertDataType function.
In this case, the int data type is converted to int8 data type.
*/
bool TestConvertDataType5()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
int aData[3][2] = { {1, 2},
{0, 4},
{5, 6} };
int answer[3][2] = { {1, 2},
{0, 4},
{5, 6} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT8, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call ConvertDataType function */
_ConvertDataType(aGPU, bGPU);
_ConvertDataType(bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -331,6 +456,26 @@ bool TestConvertDataType()
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestConvertDataType4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestConvertDataType5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TDiv.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -148,6 +150,120 @@ bool TestDiv1()
#endif // USE_CUDA
}
/*
case 2: float16 element-wise division of two tensors
c(i) = a(i)/b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/
bool TestDiv2()
{
/* a source tensor of size (2, 2) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 2;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][2] = { {0.0F, 1.0F},
{2.0F, 3.0F} };
DTYPE sData2[2][2] = { {1.0F, 1.0F},
{4.0F, 9.0F} };
DTYPE answer[2][2] = { {0.0F, 1.0F},
{0.5F, 0.3333F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensor */
XTensor sHalfGPU1;
XTensor sHalfGPU2;
XTensor tHalfGPU;
XTensor tMeHalfGPU;
XTensor tUserHalfGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU1 = ConvertDataType(*sGPU1, X_FLOAT16);
sHalfGPU2 = ConvertDataType(*sGPU2, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
tMeHalfGPU = ConvertDataType(*tMeGPU, X_FLOAT16);
/* call div function */
_Div(&sHalfGPU1, &sHalfGPU2, &tHalfGPU, 0, 0);
_DivMe(&tMeHalfGPU, &sHalfGPU2, 0, 0);
tUserHalfGPU = Div(sHalfGPU1, sHalfGPU2, 0);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
_ConvertDataType(&tMeHalfGPU, tMeGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F) &&
tMeGPU->CheckData(answer, tUnitNum, 1e-4F) &&
tUserGPU.CheckData(answer, tUnitNum, 1e-4F);
/* destroy variables */
delete sGPU1;
delete sGPU2;
delete tGPU;
delete tMeGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -169,6 +285,16 @@ bool TestDiv()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDiv2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,11 +17,13 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-14
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
*/
#include "TDivDim.h"
#include "../core/arithmetic/DivDim.h"
#include "../XTensor.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -251,6 +253,207 @@ bool TestDivDim2()
#endif // USE_CUDA
}
/*
case 3: float16 tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting.
In this case, (2, 4) / (2) = (2, 4), n = 0, alpha = 0.0.
*/
bool TestDivDim3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sum function */
_DivDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_DivDim(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = DivDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: float16 tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting.
In this case, (2, 4) / (2, 2) = (2, 4), n = 1.
*/
bool TestDivDim4()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {0.0F, -1.0F, -2.0F, 3.0F},
{4.0F, -5.0F, -6.0F, 7.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sum function */
_DivDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_DivDim(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = DivDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -280,6 +483,24 @@ bool TestDivDim()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestDivDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestDivDim4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -364,6 +364,111 @@ bool TestGather3()
#endif // USE_CUDA
}
/*
case 4: float16 gather indexed sub-tensors
In this case, (3, 3) -> (2, 3), dim = 0,
srcIndex = [0, 2]
*/
bool TestGather4()
{
/* a input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{1.0F, 2.0F, 4.0F} };
DTYPE answer[2][3] = { {0.0F, -1.0F, 2.0F},
{1.0F, 2.0F, 4.0F} };
int dim = 0;
int indexSize = 2;
int srcIndex[2] = { 0, 2 };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor tHalfGPU;
XTensor tUserHalfGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
indexGPU->SetData(srcIndex, indexSize);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call gather function */
_Gather(&sHalfGPU, &tHalfGPU, indexGPU);
tUserHalfGPU = Gather(sHalfGPU, *indexGPU);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -393,7 +498,7 @@ bool TestGather()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 2 test */
/* case 3 test */
caseFlag = TestGather3();
if (!caseFlag) {
returnFlag = false;
......@@ -402,6 +507,15 @@ bool TestGather()
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestGather4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -23,6 +23,7 @@
#define __TEST_GATHER_H__
#include "../core/movement/Gather.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -17,10 +17,12 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
*/
#include "../XTensor.h"
#include "THardTanH.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -222,6 +224,182 @@ bool TestHardTanH2()
#endif // USE_CUDA
}
/*
case 3: float16 test HardTanH function.
y = 1 if x > 1
x if -1 <= x <= 1
-1 if x < -1
*/
bool TestHardTanH3()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE answer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* create float16 tensor */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor yUserHalfGPU;
/* Initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
/* call hardtanh function */
_HardTanH(&xHalfGPU, &yHalfGPU);
yUserHalfGPU = HardTanH(xHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
yUserGPU = ConvertDataType(yUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: float16 test backward computation of HardTanH function.
dE/dx = dE/dy * dy/dx
hard tanh: y = 1 if x > 1
x if -1 <= x <= 1
-1 if x< -1
and dy/dx = 1 if -1 <= x <= 1
0 otherwise
In this case, lossName=SQUAREDERROR.
*/
bool TestHardTanH4()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE yAnswer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
DTYPE dedyAnswer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, -2.0F, 0.0F} };
DTYPE dedxAnswer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, 0.0F, -0.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor goldHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
goldGPU->SetData(goldData, unitNum);
yGPU->SetZeroAll();
dedyGPU->SetZeroAll();
dedxGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
goldHalfGPU = ConvertDataType(*goldGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
/* call hardtanh function */
_HardTanH(&xHalfGPU, &yHalfGPU);
/* call hardtanhbackward function */
_HardTanHBackward(&goldHalfGPU, &yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU, SQUAREDERROR);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedyHalfGPU, dedyGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check results */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F) &&
dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete goldGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -253,6 +431,26 @@ bool TestHardTanH()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestHardTanH3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestHardTanH4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,10 +17,12 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16 added
*/
#include "../XUtility.h"
#include "TLogSoftmax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -206,7 +208,7 @@ bool TestLogSoftmax2()
#endif // USE_CUDA
}
/*
/*
case 3: test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
......@@ -248,12 +250,12 @@ bool TestLogSoftmax3()
/* call LogSoftmax function */
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
......@@ -279,10 +281,10 @@ bool TestLogSoftmax3()
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-3F);
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-3F);
/* destroy variables */
delete x;
......@@ -311,6 +313,256 @@ bool TestLogSoftmax3()
#endif // USE_CUDA
}
/*
case 4: float16 test LogSoftmax function.
LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
*/
bool TestLogSoftmax4()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5F, 0.7F, 1.4F} };
DTYPE answer[2][3] = { {-2.4076F, -1.4076F, -0.4076F},
{-1.5435F, -1.3435F, -0.6435F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor yUserHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
/* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
yUserHalfGPU = LogSoftmax(xHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
yUserGPU = ConvertDataType(yUserHalfGPU, X_FLOAT);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-2F) &&
yUserGPU.CheckData(answer, unitNum, 1e-2F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: float16 test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
In this case, LossName=CROSSENTROPY.
*/
bool TestLogSoftmax5()
{
/* a tensor of size (1, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[1][3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[1][3] = {0.5F, 0.8F, 1.5F};
DTYPE yAnswer[1][3] = {-2.4076F, -1.4076F, -0.4076F};
DTYPE dedxAnswer[1][3] = {-0.4100F, -0.5553F, -0.8348F};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor gHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
gHalfGPU = ConvertDataType(*gGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
/* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
/* call logsoftmaxbackward function */
_LogSoftmaxBackward(&gHalfGPU, &yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU, NULL, 1, CROSSENTROPY);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-2F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-2F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: float16 test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
In this case, LossName=SQUAREDERROR
*/
bool TestLogSoftmax6()
{
/* a tensor of size (1, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[1][3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[1][3] = {0.5F, 0.8F, 1.5F};
DTYPE yAnswer[1][3] = {-2.4076F, -1.4076F, -0.4076F};
DTYPE dedxAnswer[1][3] = {-0.4100F, -0.5553F, -0.8348F};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor gHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
gHalfGPU = ConvertDataType(*gGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
/* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
/* call logsoftmaxbackward function */
_LogSoftmaxBackward(&gHalfGPU, &yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU, NULL, 1, SQUAREDERROR);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-2F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-2F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -352,6 +604,36 @@ bool TestLogSoftmax()
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestLogSoftmax4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestLogSoftmax5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestLogSoftmax6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,6 +17,7 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-14
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-07 float16/int8 added
*/
#include "TMatrixMul.h"
......@@ -507,6 +508,304 @@ bool TestMatrixMul4()
#endif // USE_CUDA
}
/*
case 5: float16 matrix multiplication.
In this case, float16 a=(2, 3), float16 b=(3, 2) -> float16 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul5()
{
/* a source tensor of size (2, 3) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensors */
XTensor halfSGPU1;
XTensor halfSGPU2;
XTensor halfTGPU;
XTensor halfTUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to float16 */
halfSGPU1 = ConvertDataType(*sGPU1, X_FLOAT16);
halfSGPU2 = ConvertDataType(*sGPU2, X_FLOAT16);
halfTGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call MatrixMul function */
_MatrixMul(&halfSGPU1, X_NOTRANS, &halfSGPU2, X_NOTRANS, &halfTGPU);
halfTUserGPU = MatrixMul(halfSGPU1, X_NOTRANS, halfSGPU2, X_NOTRANS);
/* convert data type from float16 to float */
_ConvertDataType(&halfTGPU, tGPU);
tUserGPU = ConvertDataType(halfTUserGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: float16 matrix multiplication.
In this case, float16 a=(2, 3), float16 b=(3, 2) -> float32 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul6()
{
/* a source tensor of size (2, 3) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensors */
XTensor halfSGPU1;
XTensor halfSGPU2;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to float16 */
halfSGPU1 = ConvertDataType(*sGPU1, X_FLOAT16);
halfSGPU2 = ConvertDataType(*sGPU2, X_FLOAT16);
/* call MatrixMul function */
_MatrixMul(&halfSGPU1, X_NOTRANS, &halfSGPU2, X_NOTRANS, tGPU);
tUserGPU = MatrixMul(halfSGPU1, X_NOTRANS, halfSGPU2, X_NOTRANS, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 7: int8 matrix multiplication.
In this case, int8 a=(2, 3), int8 b=(3, 2) -> float32 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul7()
{
/* a source tensor of size (2, 3) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1, 2, 3},
{-4, 5, 6} };
DTYPE sData2[3][2] = { {0, -1},
{1, 2},
{2, 1} };
DTYPE answer[2][2] = { {8, 6},
{17, 20} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create int8 tensors */
XTensor int8SGPU1;
XTensor int8SGPU2;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to int8 */
int8SGPU1 = ConvertDataType(*sGPU1, X_INT8);
int8SGPU2 = ConvertDataType(*sGPU2, X_INT8);
/* call MatrixMul function */
_MatrixMul(&int8SGPU1, X_NOTRANS, &int8SGPU2, X_NOTRANS, tGPU);
tUserGPU = MatrixMul(int8SGPU1, X_NOTRANS, int8SGPU2, X_NOTRANS, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
......@@ -556,6 +855,33 @@ bool TestMatrixMul()
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestMatrixMul5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestMatrixMul6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestMatrixMul7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -23,6 +23,7 @@
#define __TEST_MATRIXMUL_H__
#include "../core/arithmetic/MatrixMul.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -17,11 +17,13 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16/int/int8 added
*/
#include "TMultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../XTensor.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
......@@ -248,6 +250,205 @@ bool TestMultiplyDim2()
#endif // USE_CUDA
}
/*
case 3: float16 tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
In this case, (2, 4) * (2) = (2, 4), n = 0.
*/
bool TestMultiplyDim3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call multiplydim function */
_MultiplyDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_MultiplyDimMe(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = MultiplyDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: flaot16 tensor multiplication c = a*b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting.
In this case, (2, 4) * (4) = (2, 4), n = 1.
*/
bool TestMultiplyDim4()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[4] = {1.0F, -1.0F , 1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, -1.0F, 2.0F, -3.0F},
{4.0F, -5.0F, 6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call multiplydim function */
_MultiplyDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_MultiplyDimMe(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = MultiplyDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* test for MultiplyDim Function */
bool TestMultiplyDim()
{
......@@ -272,6 +473,24 @@ bool TestMultiplyDim()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestMultiplyDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestMultiplyDim4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,9 +17,11 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16/int/int8 added
*/
#include "TNegate.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -191,6 +193,86 @@ bool TestNegate2()
#endif // USE_CUDA
}
/* case 3: float16 set every entry to its minus value */
bool TestNegate3()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{-3.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {-1.0F, 2.0F},
{3.0F, -4.0F},
{-5.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor aMeHalfGPU;
XTensor bUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
aMeHalfGPU = ConvertDataType(*aMeGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
/* call negate function */
_Negate(&aHalfGPU, &bHalfGPU);
_NegateMe(&aMeHalfGPU);
bUserHalfGPU = Negate(aHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&bHalfGPU, bGPU);
_ConvertDataType(&aMeHalfGPU, aMeGPU);
bUserGPU = ConvertDataType(bUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
/* other cases */
/*
TODO!!
......@@ -222,6 +304,16 @@ bool TestNegate()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestNegate3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -86,8 +88,8 @@ bool TestReduceMax1()
tUser2 = ReduceMax(*s, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) &&
t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -112,8 +114,8 @@ bool TestReduceMax1()
tUserGPU2 = ReduceMax(*sGPU, 1);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......@@ -140,6 +142,113 @@ bool TestReduceMax1()
#endif // USE_CUDA
}
/*
case 2: float16 get the max value of the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
bool TestReduceMax2()
{
/* a input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output tensor of size (4) */
int tOrder1 = 1;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 4;
int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
tUnitNum1 *= tDimSize1[i];
/* a output tensor of size (2) */
int tOrder2 = 1;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
{4.0F, 1.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0F, 5.0F, 6.0F, 7.0F};
DTYPE answer2[2] = {5.0F, 7.0F};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor tHalfGPU1;
XTensor tHalfGPU2;
XTensor tUserHalfGPU1;
XTensor tUserHalfGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* convert data type form float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tHalfGPU1 = ConvertDataType(*tGPU1, X_FLOAT16);
tHalfGPU2 = ConvertDataType(*tGPU2, X_FLOAT16);
/* call reducemax function */
_ReduceMax(&sHalfGPU, &tHalfGPU1, 0);
_ReduceMax(&sHalfGPU, &tHalfGPU2, 1);
tUserHalfGPU1 = ReduceMax(sHalfGPU, 0);
tUserHalfGPU2 = ReduceMax(sHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU1, tGPU1);
_ConvertDataType(&tHalfGPU2, tGPU2);
tUserGPU1 = ConvertDataType(tUserHalfGPU1, X_FLOAT);
tUserGPU2 = ConvertDataType(tUserHalfGPU2, X_FLOAT);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -160,6 +269,15 @@ bool TestReduceMax()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMax2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,10 +17,12 @@
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TReduceSum.h"
#include "../core/getandset/SetData.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -514,7 +516,6 @@ bool TestReduceSum5()
#endif // USE_CUDA
}
/*
case 6: test ReduceSum function.
Sum the items along a dimension of the tensor.
......@@ -607,6 +608,126 @@ bool TestReduceSum6()
}
/*
case 7: float16 test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
bool TestReduceSum7()
{
/* a tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (4) */
int tOrder1 = 1;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 4;
int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
tUnitNum1 *= tDimSize1[i];
/* a tensor of size (2) */
int tOrder2 = 1;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0F, 6.0F, 8.0F, 10.0F};
DTYPE answer2[2] = {6.0F, 22.0F};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor shiftHalfGPU1;
XTensor shiftHalfGPU2;
XTensor tHalfGPU1;
XTensor tHalfGPU2;
XTensor tUserHalfGPU1;
XTensor tUserHalfGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU1->SetZeroAll();
shiftGPU2->SetZeroAll();
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
shiftHalfGPU1 = ConvertDataType(*shiftGPU1, X_FLOAT16);
shiftHalfGPU2 = ConvertDataType(*shiftGPU2, X_FLOAT16);
tHalfGPU1 = ConvertDataType(*tGPU1, X_FLOAT16);
tHalfGPU2 = ConvertDataType(*tGPU2, X_FLOAT16);
/* call reducesum function */
_ReduceSum(&sHalfGPU, &tHalfGPU1, 0);
_ReduceSum(&sHalfGPU, &tHalfGPU2, 1);
tUserHalfGPU1 = ReduceSum(sHalfGPU, 0, shiftHalfGPU1);
tUserHalfGPU2 = ReduceSum(sHalfGPU, 1, shiftHalfGPU2);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU1, tGPU1);
_ConvertDataType(&tHalfGPU2, tGPU2);
tUserGPU1 = ConvertDataType(tUserHalfGPU1, X_FLOAT);
tUserGPU2 = ConvertDataType(tUserHalfGPU2, X_FLOAT);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete sGPU;
delete shiftGPU1;
delete shiftGPU2;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -672,6 +793,15 @@ bool TestReduceSum()
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestReduceSum7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16/int/int8 added
*/
#include "TScaleAndShift.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -113,6 +115,254 @@ bool TestScaleAndShift1()
#endif // USE_CUDA
}
/*
case 2: flaot16 scale and shift all tensor entires.
p = p * scale + shift
*/
bool TestScaleAndShift2()
{
/* a input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
{8.5F, 10.5F, 12.5F, 14.5F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 0.5F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensor */
XTensor sHalfGPU;
XTensor tHalfGPU;
XTensor tMeHalfGPU;
XTensor tUserHalfGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tMeHalfGPU = ConvertDataType(*tMeGPU, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call scaleandshift function */
_ScaleAndShift(&sHalfGPU, &tHalfGPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeHalfGPU, scaleFactor, shiftFactor);
tUserHalfGPU = ScaleAndShift(sHalfGPU, scaleFactor, shiftFactor);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
_ConvertDataType(&tMeHalfGPU, tMeGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: int32 scale and shift all tensor entires.
p = p * scale + shift
*/
bool TestScaleAndShift3()
{
/* a input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {1.0F, 3.0F, 5.0F, 7.0F},
{9.0F, 11.0F, 13.0F, 15.0F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 1.8F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create int32 tensor */
XTensor sInt32GPU;
XTensor tInt32GPU;
XTensor tMeInt32GPU;
XTensor tUserInt32GPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to int32 */
sInt32GPU = ConvertDataType(*sGPU, X_INT);
tMeInt32GPU = ConvertDataType(*tMeGPU, X_INT);
tInt32GPU = ConvertDataType(tGPU, X_INT);
/* call scaleandshift function */
_ScaleAndShift(&sInt32GPU, &tInt32GPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeInt32GPU, scaleFactor, shiftFactor);
tUserInt32GPU = ScaleAndShift(sInt32GPU, scaleFactor, shiftFactor);
/* convert data type from int32 to float */
_ConvertDataType(&tInt32GPU, tGPU);
_ConvertDataType(&tMeInt32GPU, tMeGPU);
tUserGPU = ConvertDataType(tUserInt32GPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: int8 scale and shift all tensor entires.
p = p * scale + shift
*/
bool TestScaleAndShift4()
{
/* a input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {1.0F, 3.0F, 5.0F, 7.0F},
{9.0F, 11.0F, 13.0F, 15.0F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 1.8F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create int8 tensor */
XTensor sInt8GPU;
XTensor tInt8GPU;
XTensor tMeInt8GPU;
XTensor tUserInt8GPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to int8 */
sInt8GPU = ConvertDataType(*sGPU, X_INT8);
tMeInt8GPU = ConvertDataType(*tMeGPU, X_INT8);
tInt8GPU = ConvertDataType(*tGPU, X_INT8);
/* call scaleandshift function */
_ScaleAndShift(&sInt8GPU, &tInt8GPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeInt8GPU, scaleFactor, shiftFactor);
tUserInt8GPU = ScaleAndShift(sInt8GPU, scaleFactor, shiftFactor);
/* convert data type from int8 to float */
_ConvertDataType(&tInt8GPU, tGPU);
_ConvertDataType(&tMeInt8GPU, tMeGPU);
tUserGPU = ConvertDataType(tUserInt8GPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -133,6 +383,33 @@ bool TestScaleAndShift()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestScaleAndShift2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestScaleAndShift3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestScaleAndShift4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -23,6 +23,7 @@
#define __TEST_SUM_H__
#include "../core/arithmetic/Sum.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -29,66 +29,66 @@ bool Test()
bool wrong = false;
XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");
wrong = !TestAbsolute() || wrong;
//wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong;
wrong = !TestCompare() || wrong;
wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong;
wrong = !TestConvertDataType() || wrong;
wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong;
//wrong = !TestCompare() || wrong;
//wrong = !TestConcatenate() || wrong;
//wrong = !TestConcatenateSolely() || wrong;
//wrong = !TestCos() || wrong;
//wrong = !TestConvertDataType() || wrong;
//wrong = !TestCopyIndexed() || wrong;
//wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong;
wrong = !TestDivDim() || wrong;
wrong = !TestExp() || wrong;
wrong = !TestGather() || wrong;
wrong = !TestLog() || wrong;
wrong = !TestMatrixMul() || wrong;
wrong = !TestMatrixMul2D() || wrong;
wrong = !TestMatrixMul2DParallel() || wrong;
wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMerge() || wrong;
wrong = !TestMultiply() || wrong;
//wrong = !TestExp() || wrong;
//wrong = !TestGather() || wrong;
//wrong = !TestLog() || wrong;
//wrong = !TestMatrixMul() || wrong;
//wrong = !TestMatrixMul2D() || wrong;
//wrong = !TestMatrixMul2DParallel() || wrong;
//wrong = !TestMatrixMulBatched() || wrong;
//wrong = !TestMerge() || wrong;
//wrong = !TestMultiply() || wrong;
wrong = !TestMultiplyDim() || wrong;
wrong = !TestNegate() || wrong;
wrong = !TestNormalize() || wrong;
wrong = !TestPower() || wrong;
wrong = !TestReduceMax() || wrong;
wrong = !TestReduceMean() || wrong;
wrong = !TestReduceSum() || wrong;
wrong = !TestReduceSumAll() || wrong;
wrong = !TestReduceSumSquared() || wrong;
wrong = !TestReduceVariance() || wrong;
wrong = !TestRound() || wrong;
//wrong = !TestNegate() || wrong;
//wrong = !TestNormalize() || wrong;
//wrong = !TestPower() || wrong;
//wrong = !TestReduceMax() || wrong;
//wrong = !TestReduceMean() || wrong;
//wrong = !TestReduceSum() || wrong;
//wrong = !TestReduceSumAll() || wrong;
//wrong = !TestReduceSumSquared() || wrong;
//wrong = !TestReduceVariance() || wrong;
//wrong = !TestRound() || wrong;
wrong = !TestScaleAndShift() || wrong;
wrong = !TestSelect() || wrong;
wrong = !TestSetAscendingOrder() || wrong;
wrong = !TestSetData() || wrong;
wrong = !TestSign() || wrong;
wrong = !TestSin() || wrong;
wrong = !TestSort() || wrong;
wrong = !TestSplit() || wrong;
wrong = !TestSpread() || wrong;
wrong = !TestSub() || wrong;
//wrong = !TestSelect() || wrong;
//wrong = !TestSetAscendingOrder() || wrong;
//wrong = !TestSetData() || wrong;
//wrong = !TestSign() || wrong;
//wrong = !TestSin() || wrong;
//wrong = !TestSort() || wrong;
//wrong = !TestSplit() || wrong;
//wrong = !TestSpread() || wrong;
//wrong = !TestSub() || wrong;
wrong = !TestSum() || wrong;
wrong = !TestSumByColumnTV() || wrong;
wrong = !TestSumByColumnVT() || wrong;
wrong = !TestSumDim() || wrong;
wrong = !TestTan() || wrong;
wrong = !TestTranspose() || wrong;
//wrong = !TestSumByColumnTV() || wrong;
//wrong = !TestSumByColumnVT() || wrong;
//wrong = !TestSumDim() || wrong;
//wrong = !TestTan() || wrong;
//wrong = !TestTranspose() || wrong;
//wrong = !TestTopK() || wrong;
wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong;
//wrong = !TestUnsqueeze() || wrong;
//wrong = !TestXMem() || wrong;
wrong = !TestCrossEntropy() || wrong;
//wrong = !TestCrossEntropy() || wrong;
//wrong = !TestDropout() || wrong;
wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong;
wrong = !TestLogSoftmax() || wrong;
wrong = !TestLoss() || wrong;
wrong = !TestRectify() || wrong;
wrong = !TestSigmoid() || wrong;
wrong = !TestSoftmax() || wrong;
//wrong = !TestHardTanH() || wrong;
//wrong = !TestIdentity() || wrong;
//wrong = !TestLogSoftmax() || wrong;
//wrong = !TestLoss() || wrong;
//wrong = !TestRectify() || wrong;
//wrong = !TestSigmoid() || wrong;
//wrong = !TestSoftmax() || wrong;
/* other test */
/*
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论