Commit 7da1bec1 by 单韦乔

Clip

LogSoftmax

MultiplyDim

Negate

ScaleAndShift
parent 6a3d713a
@@ -38,16 +38,6 @@ void SumDimTest();
 void SplitBackwardTest();
 void MemTest();
 void xcTest();
-void ConvertDataTypeTest();
-void ConvertDataTypeBackwardTest();
-void SumFP16Test();
-void GatherFP16Test();
-void HardTanHFP16Test();
-void ReduceMaxFP16Test();
-void ReduceSumFP16Test();
-void LogSoftmaxFP16Test();
-void ClipFP16Test();
-void ScaleAndShiftFP16Test();
 using namespace nts;
 using namespace fnnlm;
@@ -66,291 +56,24 @@ int main(int argc, const char ** argv )
     //return 0;
     //Test();
     //return 0;
-    //if (argc > 1 && !strcmp(argv[1], "-test"))
-    //ConvertDataTypeTest();
-    //return 0;
-    //ConvertDataTypeBackwardTest();
-    //return 0;
-    //SumFP16Test();
-    //return 0;
-    //GatherFP16Test();
-    //return 0;
-    //HardTanHFP16Test();
-    //return 0;
-    //ReduceMaxFP16Test();
-    //return 0;
-    //ReduceSumFP16Test();
-    //return 0;
-    //LogSoftmaxFP16Test();
-    //return 0;
-    //ClipFP16Test();
-    //return 0;
-    //ScaleAndShiftFP16Test();
-    //return 0;
+    if (argc > 1 && !strcmp(argv[1], "-test"))
     Test();
-    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
-        FNNLMMain(argc - 1, argv + 1);
-    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
-        TransformerMain(argc - 1, argv + 1);
-    else{
-        fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
-        fprintf(stderr, "neural networks in an easy way. \n\n");
-        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
-        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
-    }
+    //else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+    //    FNNLMMain(argc - 1, argv + 1);
+    //else if(argc > 1 && !strcmp(argv[1], "-t2t"))
+    //    TransformerMain(argc - 1, argv + 1);
+    //else{
+    //    fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
+    //    fprintf(stderr, "neural networks in an easy way. \n\n");
+    //    fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
+    //    fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
+    //}
     //_CrtDumpMemoryLeaks();
     return 0;
 }
-void ScaleAndShiftFP16Test() {
-    XTensor a;
-    XTensor intA;
-    XTensor b;
-    XTensor intB;
-    InitTensor2D(&a, 1, 10, X_FLOAT, 0);
-    a.SetDataRand(-10.0F, 10.0F);
-    a.Dump(stderr, "a:");
-    intA = ConvertDataType(a, X_INT);
-    intB = ScaleAndShift(intA, 2, 0);
-    b = ConvertDataType(intB, X_FLOAT);
-    b.Dump(stderr, "b:");
-}
-void ClipFP16Test() {
-    XTensor a;
-    XTensor intA;
-    XTensor b;
-    XTensor intB;
-    InitTensor2D(&a, 1, 10, X_FLOAT, 0);
-    a.SetDataRand(-10.0F, 10.0F);
-    a.Dump(stderr, "a:");
-    intA = ConvertDataType(a, X_INT);
-    intB = Clip(intA, -1, 1);
-    b = ConvertDataType(intB, X_FLOAT);
-    b.Dump(stderr, "b:");
-}
-void LogSoftmaxFP16Test() {
-    XTensor a;
-    XTensor halfA;
-    XTensor b;
-    XTensor halfB;
-    InitTensor3D(&a, 2, 2, 2, X_FLOAT, 0);
-    a.SetDataRand(-1.0F, 1.0F);
-    halfA = ConvertDataType(a, X_FLOAT16);
-    b = LogSoftmax(a, 1);
-    halfB = LogSoftmax(halfA, 1);
-    b.Dump(stderr, "sum:");
-    halfB.Dump(&halfB, stderr, "halfSum:");
-}
-void ReduceSumFP16Test()
-{
-    XTensor a;
-    XTensor sum;
-    XTensor halfA;
-    XTensor halfSum;
-    InitTensor2D(&a, 10, 10, X_FLOAT, 0);
-    a.SetDataRand(-5.0F, 5.0F);
-    halfA = ConvertDataType(a, X_FLOAT16);
-    sum = ReduceSum(a, 1);
-    halfSum = ReduceSum(halfA, 1);
-    sum.Dump(stderr, "sum:");
-    halfSum.Dump(&halfSum, stderr, "halfSum:");
-}
-void ReduceMaxFP16Test()
-{
-    XTensor a;
-    XTensor max;
-    XTensor halfA;
-    XTensor halfMax;
-    InitTensor2D(&a, 10, 10, X_FLOAT, 0);
-    a.SetDataRand(-5.0F, 5.0F);
-    halfA = ConvertDataType(a, X_FLOAT16);
-    max = ReduceMax(a, 1);
-    halfMax = ReduceMax(halfA, 1);
-    max.Dump(stderr, "max:");
-    halfMax.Dump(&halfMax, stderr, "halfMax:");
-}
-void HardTanHFP16Test()
-{
-    XTensor a;
-    XTensor b;
-    XTensor halfA;
-    XTensor halfB;
-    InitTensor2D(&a, 5, 5, X_FLOAT, 0);
-    InitTensor2D(&b, 5, 5, X_FLOAT, 0);
-    a.SetDataRand(-1.0F, 4.0F);
-    b.SetDataRand(-1.0F, 4.0F);
-    halfA = ConvertDataType(a, X_FLOAT16);
-    halfB = ConvertDataType(b, X_FLOAT16);
-    a.Dump(stderr, "a:");
-    b.Dump(stderr, "b:");
-    b = HardTanH(a);
-    halfB = HardTanH(halfA);
-    b.Dump(stderr, "b:");
-    halfB.Dump(&halfB, stderr, "halfB:");
-}
-void GatherFP16Test() {
-    XTensor a;
-    XTensor b;
-    XTensor srcIndex;
-    XTensor halfA;
-    XTensor halfB;
-    XTensor c;
-    InitTensor1D(&srcIndex, 2, X_INT, 0);
-    int m = 0;
-    int n = 1;
-    srcIndex.Set1DInt(m, 0);
-    srcIndex.Set1DInt(n, 1);
-    InitTensor2D(&a, 3, 2, X_FLOAT, 0);
-    InitTensor2D(&b, 2, 2, X_FLOAT, 0);
-    InitTensor2D(&halfB, 2, 2, X_FLOAT16, 0);
-    a.SetDataRand(-5.0F, 5.0F);
-    halfA = ConvertDataType(a, X_FLOAT16);
-    a.Dump(stderr, "a:");
-    _Gather(&a, &b, &srcIndex);
-    b.Dump(stderr, "b:");
-    _Gather(&halfA, &halfB, &srcIndex);
-    c = ConvertDataType(halfB, X_FLOAT);
-    c.Dump(stderr, "c:");
-}
-void SumFP16Test()
-{
-    XTensor a;
-    XTensor b;
-    XTensor halfA;
-    XTensor halfB;
-    InitTensor2D(&a, 5, 5, X_FLOAT, 0);
-    InitTensor2D(&b, 5, 5, X_FLOAT, 0);
-    a.SetDataRand(-1.0F, 4.0F);
-    b.SetDataRand(-1.0F, 4.0F);
-    halfA = ConvertDataType(a, X_FLOAT16);
-    halfB = ConvertDataType(b, X_FLOAT16);
-    a.Dump(stderr, "a:");
-    b.Dump(stderr, "b:");
-    b = Sum(a, b, -0.4F);
-    halfB = Sum(halfA, halfB, -0.4F);
-    b.Dump(stderr, "b:");
-    halfB.Dump(&halfB, stderr, "halfB:");
-}
-void ConvertDataTypeTest()
-{
-    int rnum = 0;
-    for (int i = 0; i <= rnum; i++)
-    {
-        XTensor a;
-        InitTensor2D(&a, 2, 2, X_FLOAT, 0);
-        XTensor halfa;
-        InitTensor2D(&halfa, 2, 2, X_FLOAT16, 0);
-        XTensor a1;
-        InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
-        a.SetDataRand(-10.0F, 10.0F);
-        a.Dump(stderr, "a:");
-        halfa = ConvertDataType(a, X_FLOAT16);
-        a1 = ConvertDataType(halfa, X_FLOAT);
-        a1.Dump(stderr, "halfa:");
-    }
-}
-void ConvertDataTypeBackwardTest()
-{
-    int rnum = 0;
-    for (int i = 0; i <= rnum; i++)
-    {
-        XTensor a;
-        InitTensor2D(&a, 2, 2, X_FLOAT, 0);
-        a.SetDataRand(2.0F, 2.0F);
-        a.Dump(stderr, "a:");
-        XTensor halfA;
-        XTensor a1;
-        halfA = ConvertDataType(a, X_FLOAT16);
-        a1 = ConvertDataType(halfA, X_FLOAT);
-        a1.grad = NewTensor(&a1);
-        a1.grad->SetDataRand(3.0F, 3.0F);
-        a1.grad->Dump(stderr, "a1.grad:");
-        XNet testBackward;
-        printf("1");
-        testBackward.Backward(a1);
-        printf("2");
-        halfA.grad->Dump(stderr, "halfA.grad:");
-        a.grad->Dump(stderr, "a.grad:");
-    }
-}
 XTensor * stack(XList& list, int leadingDim)
 {
     size_t size = list.count;
......
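Every FP16/INT smoke test deleted from the main file above follows the same round-trip shape: fill a float32 tensor, convert it to the low-precision type, run the operation in both types, and dump the two results for manual comparison. A condensed sketch of that pattern, using the same NiuTrans.Tensor calls as the removed code (the operation and shapes are placeholders):

    void FP16RoundTripSmokeTest()
    {
        XTensor a, b, halfA, halfB;
        InitTensor2D(&a, 5, 5, X_FLOAT, 0);       /* float32 input on device 0 */
        a.SetDataRand(-1.0F, 4.0F);
        halfA = ConvertDataType(a, X_FLOAT16);    /* down-convert the input */
        b = HardTanH(a);                          /* reference result in float32 */
        halfB = HardTanH(halfA);                  /* same op in float16 */
        b.Dump(stderr, "b:");                     /* compare the dumps by eye */
        halfB.Dump(&halfB, stderr, "halfB:");     /* static Dump handled FP16 before this commit */
    }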
@@ -249,6 +249,8 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
     //lossTensor = CrossEntropy(output, labelOnehot);
     float prob = ReduceSumAll(lossTensor);
+    printf("%f\n", prob);
+    exit(0);
     DTYPE lossLocal = prob / wc;
     bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
@@ -293,7 +295,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
         break;
     }
-    if (step % 100 == 0) {
+    if (step % 10 == 0) {
         double elapsed = GetClockSec() - startT;
         XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
                 elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
......
@@ -48,7 +48,6 @@
 #include "core/math/ScaleAndShift.h"
 #include "core/getandset/SetData.h"
 #include "function/Identity.h"
-#include "core/getandset/ConvertDataType.h"
 #ifdef USE_CUDA
@@ -1765,22 +1764,9 @@ dump data to a file
 */
 void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
-    if (tensor->dataType == X_FLOAT)
-    {
-        XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
-        _CopyValues(tensor, &a);
-        a.Dump(file, label, n, beg, verbose);
-    }
-    else if (tensor->dataType == X_FLOAT16)
-    {
-        XTensor a(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
-        _ConvertDataType(tensor, &a);
-        a.Dump(file, label, n, beg, verbose);
-    }
-    else
-    {
-        ShowNTErrors("TO DO!");
-    }
+    XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
+    _CopyValues(tensor, &a);
+    a.Dump(file, label, n, beg, verbose);
 }
 /*
......
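With the data-type branches removed, the static Dump above now copies the tensor in its original type and dumps it directly, instead of silently widening X_FLOAT16 to X_FLOAT. A caller that still wants to print an FP16 tensor can do the conversion explicitly; a minimal sketch using the ConvertDataType API seen elsewhere in this commit (halfT is an assumed, already-initialized X_FLOAT16 tensor):

    XTensor floatT = ConvertDataType(halfT, X_FLOAT);   /* FP16 -> FP32 copy */
    floatT.Dump(stderr, "halfT (as float):");           /* dump the FP32 view */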
@@ -17,7 +17,6 @@
 /*
  * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
- * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-05 float16 added
 */
 #include "../../XDevice.h"
@@ -35,9 +34,8 @@ division of data arrays in an element-wise manner c(i) = a(i)/b(i)
 >> c - result data array
 >> size - size of c
 */
-template <class T>
 __global__
-void KernelDivElementWise(T * a, T * b, T * c, int size)
+void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -53,9 +51,8 @@ division of data arrays in an element-wise manner c(i) = a(i)/b(i) + \alpha*c(i)
 >> size - size of c
 >> alpha - the coefficient
 */
-template <class T>
 __global__
-void KernelDivElementWiseV2(T * a, T * b, T * c, int size, T alpha)
+void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -78,13 +75,13 @@ where |a_lead| means the size of the leading dimension of a
 >> ldSizeC - size of the leading dimension of c
 >> blockNum - number of blocks
 */
-template<class T, int nonZeroAlpha> __global__
-void KernelDivElementWiseTensorDynamic(T * a, T * b, T * c, T alpha,
+template<int nonZeroAlpha> __global__
+void KernelDivElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha,
                                        int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
 {
-    __shared__ T* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    __shared__ T* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    __shared__ T* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ DTYPE* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ DTYPE* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ DTYPE* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int j = blockDim.y * blockIdx.y + threadIdx.y;
@@ -172,49 +169,17 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
         dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
         if (alpha == 0) {
-            KernelDivElementWiseTensorDynamic<DTYPE, 0> << <blocks, threads >> >
+            KernelDivElementWiseTensorDynamic<0> << <blocks, threads >> >
                 ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
                  stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
         }
         else {
-            KernelDivElementWiseTensorDynamic<DTYPE, 1> << <blocks, threads >> >
+            KernelDivElementWiseTensorDynamic<1> << <blocks, threads >> >
                 ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
                  stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
         }
     }
 }
-    else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
-        int cudaGridSize[3];
-        int cudaBlockSize[3];
-        unsigned short temp = FloatToFloat16(alpha);
-        half alpha1 = *((half *)&temp);
-        if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
-            GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
-            dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
-            if (alpha == 0)
-                KernelDivElementWise << <blocks, threads >> > ((__half *)a->data, (__half *)b->data, (__half *)c->data, c->unitNum);
-            else
-                KernelDivElementWiseV2 << <blocks, threads >> > ((__half *)a->data, (__half *)b->data, (__half *)c->data, c->unitNum, alpha1);
-        }
-        else {
-            GDevs.GetCudaThread2D(c->devID, stride * blockNum, dimensionSizeC, MAX_INT, cudaGridSize, cudaBlockSize);
-            dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
-            if (alpha == 0) {
-                KernelDivElementWiseTensorDynamic<__half, 0> << <blocks, threads >> >
-                    ((__half *)a->data, (__half *)b->data, (__half *)c->data, 0,
-                     stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
-            }
-            else {
-                KernelDivElementWiseTensorDynamic<__half, 1> << <blocks, threads >> >
-                    ((__half *)a->data, (__half *)b->data, (__half *)c->data, alpha1,
-                     stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
-            }
-        }
-    }
     else {
         // TODO!!
         ShowNTErrors("TODO!");
......
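Only the kernel signatures appear in the hunks above; the viewer elides the bodies. For orientation, a minimal sketch of what KernelDivElementWise computes, reconstructed from the visible thread-index line and the c(i) = a(i)/b(i) doc comment (the actual body may differ):

    __global__
    void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
    {
        /* one thread per element, as in the visible index line */
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < size)
            c[i] = a[i] / b[i];   /* assumed body: element-wise quotient */
    }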
@@ -29,16 +29,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* division of two tensors in an element-wise manner c(i) = a(i)/b(i) */
-template<class T> __global__
-void KernelDivElementWise(T * a, T * b, T * c, int size);
+__global__
+void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);
 /* division of two tensors in an element-wise manner c(i) = a(i)/b(i) + \alpha*c(i) */
-template<class T> __global__
-void KernelDivElementWiseV2(T * a, T * b, T * c, int size, T alpha);
+__global__
+void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);
 /* division of two tensors in an element-wise manner c(i) = a(i)/b(i) + \alpha*c(i) */
-template<class T, int nonZeroAlpha>__global__
-void KernelDivElementWiseTensorDynamic(T * a, T * b, T * c, T alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
+template<int nonZeroAlpha>__global__
+void KernelDivElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
 /* element-wise division of two tensors */
 void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
......
@@ -54,6 +54,8 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                 XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
 {
     CheckNTErrors(a && b && c, "Empty input tensors!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                  "Input tensors should have the same data type!");
     CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
                   "Input tensors must have a order >= 2!");
     CheckNTErrors(c->order == a->order + b->order - 2, "wrong tensor order")
@@ -300,63 +302,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     return c;
 }
-/*
-matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
-make a new tensor to keep the result and return it
->> a - tensor a
->> transposedA - indicates whether the matrices in a are transposed
->> b - tensor b
->> transposedB - indicates whether the matrices in b are transposed
->> dataType - indicates what datatype is needed
->> alpha - a coefficient
->> parallelRunner - parallel processing module
-<< return - the result of matrix multiplication
-*/
-XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
-                  const XTensor &b, MATRIX_TRANS_TYPE transposedB,
-                  TENSOR_DATA_TYPE dataType, DTYPE alpha, XPRunner * parallelRunner)
-{
-    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
-    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
-    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
-    int order = a.order + b.order - 2;
-    int sub = 0;
-    int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
-    dimSize[sub++] = an;
-    dimSize[sub++] = bm;
-    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
-    XTensor c(order, dimSize, dataType, dr, a.devID, a.mem);
-    c.SetTMPFlag();
-    /* call _MatrixMul function */
-    _MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
-    /* tensor connections */
-    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
-    XLink::AddParamToHeadTrans(&c, transposedA);
-    XLink::AddParamToHeadTrans(&c, transposedB);
-    XLink::AddParamToHead(&c, alpha);
-    /* destroy variables */
-    delete[] dimSize;
-    return c;
-}
 void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
                const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
                DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
......
@@ -59,9 +59,6 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
 XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                   DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
-XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
-                  TENSOR_DATA_TYPE dataType, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
 void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
......
@@ -169,7 +169,7 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
             ShowNTErrors("Something is wrong!");
         }
     }
-    if (a->dataType == X_FLOAT16) {
+    else if (a->dataType == X_FLOAT16) {
         unsigned short temp = FloatToFloat16(alpha);
         half alpha1 = *((half *)&temp);
         if (stride > 1) {
......
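The one-character fix above chains the FP16 branch to the preceding data-type dispatch. With a bare if, the FP16 check is evaluated as an independent statement after the default-type branch has already run, and any trailing else binds to the wrong condition; else if keeps the branches mutually exclusive. A generic sketch of the intended shape (the surrounding names are illustrative, not the file's actual code):

    if (a->dataType == DEFAULT_DTYPE) {
        /* FP32 path */
    }
    else if (a->dataType == X_FLOAT16) {
        /* FP16 path: runs only when the FP32 path did not */
    }
    else {
        ShowNTErrors("TODO!");   /* only genuinely unsupported types land here */
    }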
@@ -28,6 +28,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
+template <class T, bool alphaFired>
+__global__
+void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha);
+template <class T, bool alphaFired>
+__global__
+void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha);
 /* tensor summation a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
    i.e., a is multiplied with b by broadcasting (cuda version) */
 void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = 0);
......
@@ -29,12 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* set each entry to its negative value (CUDA Kernel) */
+template <class T>
 __global__
-void KernelNegate(DTYPE * a, DTYPE * b, int size);
+void KernelNegate(T * a, T * b, int size);
-/* set each entry to its negative value (CUDA Kernel) with float16 data type */
-__global__
-void KernelNegate(__half * a, __half * b, int size);
 /* set each entry to its negative value */
 void _CudaNegate(const XTensor * a, XTensor * b);
......
@@ -32,6 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 template <class T> __global__
 void KernelADD(T * a, T * b, T * c, int size, T beta = (T)1.0);
+__global__
+void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta);
 /* tensor summation c = a + b * \beta (cuda version) */
 void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
......
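KernelADDInt is only declared in this hunk; its definition is not part of the visible diff. A plausible one-thread-per-element implementation matching the declared signature (the choice to scale in DTYPE and truncate back to int is an assumption):

    __global__
    void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < size)
            c[i] = a[i] + (int)(b[i] * beta);   /* assumed: scale in DTYPE, truncate */
    }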
@@ -44,6 +44,22 @@ void KernelFloatToInt(float * inputData, int * outputData, int size);
 __global__
 void KernelIntToFloat(int * inputData, float * outputData, int size);
+/* convert data type from X_FLOAT to X_INT8 (CUDA Kernel) */
+__global__
+void KernelFloatToInt8(float * inputData, __int8 * outputData, int size);
+/* convert data type from X_INT8 to X_FLOAT (CUDA Kernel) */
+__global__
+void KernelInt8ToFloat(__int8 * inputData, float * outputData, int size);
+/* convert data type from X_INT to X_INT8 (CUDA Kernel) */
+__global__
+void KernelIntToInt8(int * inputData, __int8 * outputData, int size);
+/* convert data type from X_INT8 to X_INT (CUDA Kernel) */
+__global__
+void KernelInt8ToInt(__int8 * inputData, int * outputData, int size);
 /* convert data type */
 void _CudaConvertDataType(const XTensor * input, XTensor * output);
......
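As above, only the int8 declarations are visible here. A minimal sketch of the float-to-int8 direction, following the pattern the names suggest (the plain truncating cast with no saturation is an assumption; note that __int8 is an MSVC-specific type):

    __global__
    void KernelFloatToInt8(float * inputData, __int8 * outputData, int size)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < size)
            outputData[i] = (__int8)inputData[i];   /* assumed: truncate, no saturation */
    }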
@@ -37,12 +37,11 @@ gather indexed sub-tensors (cuda version)
 >> indexSize - the size of the srcIndex
 >> stride - stride of a data block
 */
-template <class T>
 __global__
-void KernelGather(T * sData, T * tData, int * sIndex, int indexSize, int stride)
+void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
 {
-    __shared__ T * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    __shared__ T * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
     /* block id */
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -50,18 +49,18 @@ void KernelGather(T * sData, T * tData, int * sIndex, int indexSize, int stride)
     /* offset in each block */
     int offset = blockDim.y * blockIdx.y + threadIdx.y;
-    if (i >= indexSize || offset >= stride)
+    if(i >= indexSize || offset >= stride)
         return;
-    if (threadIdx.y == 0) {
+    if(threadIdx.y == 0){
         sp[threadIdx.x] = sData + sIndex[i] * stride;
         tp[threadIdx.x] = tData + i * stride;
     }
     __syncthreads();
-    T * s = sp[threadIdx.x];
-    T * t = tp[threadIdx.x];
+    DTYPE * s = sp[threadIdx.x];
+    DTYPE * t = tp[threadIdx.x];
     t[offset] = s[offset];
 }
@@ -75,10 +74,6 @@ gather indexed sub-tensors (cuda version)
 */
 void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
 {
-    CheckNTErrors((s->dataType == DEFAULT_DTYPE && t->dataType == DEFAULT_DTYPE) ||
-                  (s->dataType == X_FLOAT16 && t->dataType == X_FLOAT16),
-                  "The gather function does not support this datatype.");
     int devID = s->devID;
     XMem * mem = s->mem;
@@ -96,6 +91,9 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
     dim3 blocks(cudaGrids[0], cudaGrids[1]);
     dim3 threads(cudaBlocks[0], cudaBlocks[1]);
+    DTYPE * sData = (DTYPE*)s->data;
+    DTYPE * tData = (DTYPE*)t->data;
     int * sIndex = NULL;
     if (srcIndex->devID < 0) {
@@ -107,20 +105,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
     else
         sIndex = (int *)srcIndex->data;
-    if (s->dataType == DEFAULT_DTYPE && t->dataType == DEFAULT_DTYPE) {
-        DTYPE * sData = (DTYPE*)s->data;
-        DTYPE * tData = (DTYPE*)t->data;
-        KernelGather<<<blocks, threads>>>(sData, tData, sIndex, indexSize, stride);
-    }
-    else if (s->dataType == X_FLOAT16 && t->dataType == X_FLOAT16) {
-        half * sData = (half*)s->data;
-        half * tData = (half*)t->data;
-        KernelGather<<<blocks, threads>>>(sData, tData, sIndex, indexSize, stride);
-    }
-    else {
-        //TODO!
-        ShowNTErrors("TODO!");
-    }
+    KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
     if (srcIndex->devID < 0) {
         if(mem != NULL)
......
@@ -17,7 +17,6 @@
 /*
  * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
- * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-03 float16 added
 */
 #include "../../XDevice.h"
@@ -504,9 +503,6 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
     CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
     CheckNTErrors(input->order > dim && dim >= 0, "Illegal dimension to reduce!");
     CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
-    CheckNTErrors((input->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE) ||
-                  (input->dataType == X_FLOAT16 && output->dataType == X_FLOAT16),
-                  "The reduce max function does not support this datatype.");
     int dimRDI = input->order - dim - 1;
     for(int i = 0; i < input->order; i++){
@@ -547,7 +543,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
     int devIDBackup;
     ProtectCudaDev(input->devID, devIDBackup);
-    if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
+    if (stride == 1 && blockNum >= 10) {
         dim3 grids;
         dim3 blocks;
         continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
......
@@ -17,7 +17,6 @@
 /*
  * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
- * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-03 float16 added
 */
 #include "../../XDevice.h"
@@ -736,7 +735,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     int devIDBackup;
     ProtectCudaDev(input->devID, devIDBackup);
-    if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
+    if (stride == 1 && blockNum >= 10) {
         dim3 grids;
         dim3 blocks;
         continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
@@ -752,7 +751,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
                              strideNum, blockNum, sp, power, isExp);
         }
     }
-    else if (stride != 1 && stride * blockNum > 4096 && input->dataType == DEFAULT_DTYPE){
+    else if (stride != 1 && stride * blockNum > 4096){
+        //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
+        //unsigned int* goutput = (unsigned int *)input->data;
+        //convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
         dim3 grid, block;
         discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
         KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
......
@@ -17,7 +17,6 @@
 /*
  * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
- * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-04 float16 added
 */
 #include "HardTanH.h"
@@ -39,18 +38,17 @@ y = 1 if x > 1
 >> y - output data array
 >> size - size of input/output
 */
-template <class T>
 __global__
-void KernelHardtanhCompute(T * x, T * y, int size)
+void KernelHardtanhCompute(DTYPE * x, DTYPE * y, int size)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i < size) {
-        T p = x[i];
-        if (p > (T)1.0)
-            p = (T)1.0;
-        else if (p < (T)-1.0)
-            p = (T)-1.0;
+    if (i < size){
+        DTYPE p = x[i];
+        if(p > (DTYPE)1.0)
+            p = (DTYPE)1.0;
+        else if(p < (DTYPE)-1.0)
+            p = (DTYPE)-1.0;
         y[i] = p;
     }
 }
@@ -65,31 +63,25 @@ y = 1 if x > 1
 */
 void _CudaHardTanH(const XTensor * x, XTensor * y)
 {
-    CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
-    CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
-    CheckNTErrors((x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) ||
-                  (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16),
-                  "The hard tanh activation function does not support this datatype.");
-    int gridSize[3], blockSize[3];
-    GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
-    int devIDBackup;
-    ProtectCudaDev(x->devID, devIDBackup);
-    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
+    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
+        CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
+        CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
+        int gridSize[3], blockSize[3];
+        GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
+        int devIDBackup;
+        ProtectCudaDev(x->devID, devIDBackup);
         KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
+        BacktoCudaDev(x->devID, devIDBackup);
     }
-    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
-        KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((__half *)x->data, (__half *)y->data, x->unitNum);
-    }
-    else {
-        //TODO!
+    else{
         ShowNTErrors("TODO!");
     }
-    BacktoCudaDev(x->devID, devIDBackup);
 }
 /*
......
@@ -83,7 +83,7 @@ void KernelLogSoftmaxComputeByRow(T * x, T * max, T * sum, T * y, int rowNum, in
             int key = i * colNum + j;
             if (dataType == X_FLOAT) {
-                DTYPE r = log((DTYPE)exp(x[key] - inputMax[threadIdx.x]) / (DTYPE)inputSum[threadIdx.x]);
+                DTYPE r = log((DTYPE)exp((DTYPE)(x[key] - inputMax[threadIdx.x])) / (DTYPE)inputSum[threadIdx.x]);
                 if (isnan(r))
                     r = LOGPROB_MIN;
@@ -137,7 +137,7 @@ void KernelLogSoftmaxComputeByCol(T * x, T * max, T * sum, T * y, int rowNum, in
     if (i < rowNum && j < colNum) {
         int key = i * colNum + j;
         if (dataType == X_FLOAT) {
-            DTYPE r = log((DTYPE)exp(x[key] - inputMax[threadIdx.y]) / (DTYPE)inputSum[threadIdx.y]);
+            DTYPE r = log((DTYPE)exp((DTYPE)(x[key] - inputMax[threadIdx.y])) / (DTYPE)inputSum[threadIdx.y]);
             if (isnan(r))
                 r = LOGPROB_MIN;
@@ -247,10 +247,10 @@ void KernelExpLoss(T * dedy, T * dedx, T * y, int size, LOSS_FUNCTION_NAME lossN
     if (i < size) {
         /* dE/dx_j = exp(y_j) */
         if (lossName == CROSSENTROPY)
-            dedx[i] = exp(y[i]);
+            dedx[i] = exp(((DTYPE)y[i]));
         /* dE/dx_j = exp(y_j) */
         else if (lossName == SQUAREDERROR)
-            dedx[i] = exp(y[i]);
+            dedx[i] = exp(((DTYPE)y[i]));
         else if (lossName == ONEHOTERROR)
             dedx[i] = 0;
         else
@@ -283,13 +283,13 @@ void KernelLogSoftmaxBackwardDEDS(T * dedy, T * dedx, T * gold, T * y, T * x,
         DTYPE r = 0;
         /* dE/ds_j = exp(y_j) */
         if (lossName == CROSSENTROPY)
-            r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
+            r = -(DTYPE)gold[i] + (DTYPE)exp(((DTYPE)y[i]));
         /* dE/ds_j = exp(y_j) */
         else if (lossName == SQUAREDERROR)
-            r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
+            r = -(DTYPE)gold[i] + (DTYPE)exp(((DTYPE)y[i]));
         else if (lossName == ONEHOTERROR) {
             if ((DTYPE)gold[i] == 1.0)
-                r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
+                r = -(DTYPE)gold[i] + (DTYPE)exp(((DTYPE)y[i]));
             else
                 r = 0;
         }
@@ -366,7 +366,7 @@ void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T
         else if (lossName == ONEHOTERROR) {
             int offset = colNum * ni + mi;
             if (value == 1.0F)
-                dedx[offset] += (-value + exp(y[offset]));
+                dedx[offset] += (-value + exp(((DTYPE)y[offset])));
                 //dedx[offset] += -value * 0.005;
         }
     }
......
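All of the edits in this file follow one pattern: the operand handed to exp() is explicitly widened to DTYPE first. These kernels are templated on T, and even though the edited lines sit inside an if (dataType == X_FLOAT) branch, the expression must still compile when the template is instantiated with T = __half, where exp() on half-typed operands may be ambiguous or unavailable in device code. Casting the operand to DTYPE (float) makes overload resolution unambiguous; schematically:

    /* before: subtraction yields a T-typed value; problematic for T = __half */
    DTYPE r = log((DTYPE)exp(x[key] - inputMax[threadIdx.x]) / (DTYPE)inputSum[threadIdx.x]);
    /* after: operand widened to DTYPE, so exp(float) is selected */
    DTYPE r = log((DTYPE)exp((DTYPE)(x[key] - inputMax[threadIdx.x])) / (DTYPE)inputSum[threadIdx.x]);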
@@ -32,9 +32,30 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version) */
 void _CudaLogSoftmax(const XTensor * input, XTensor * output, int leadDim);
+template <class T, TENSOR_DATA_TYPE dataType>
+__global__
+void KernelLogSoftmaxComputeByRow(T * x, T * max, T * sum, T * y, int rowNum, int colNum);
+template <class T, TENSOR_DATA_TYPE dataType>
+__global__
+void KernelLogSoftmaxComputeByCol(T * x, T * max, T * sum, T * y, int rowNum, int colNum);
 /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version) */
 void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max);
+template <class T>
+__global__
+void KernelExpLoss(T * dedy, T * dedx, T * y, int size, LOSS_FUNCTION_NAME lossName);
+template <class T, TENSOR_DATA_TYPE dataType>
+__global__
+void KernelLogSoftmaxBackwardDEDS(T * dedy, T * dedx, T * gold, T * y, T * x, int size, LOSS_FUNCTION_NAME lossName);
+template <class T>
+__global__
+void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T * y, T * x,
+                                             int rowNum, int colNum, int gNonZeroNum, LOSS_FUNCTION_NAME lossName);
 /* de/dx (Cuda version) */
 void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                              XTensor * dedy, XTensor * dedx,
......
@@ -22,6 +22,7 @@
 #include "../XTensor.h"
 #include "../core/math/Clip.h"
 #include "TClip.h"
+#include "../core/getandset/ConvertDataType.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -116,6 +117,249 @@ bool TestClip1()
 #endif // USE_CUDA
 }
+/*
+case 2: float16 test Clip function.
+Set every entry to its clip value.
+*/
+bool TestClip2()
+{
+    /* a tensor of size (3, 2) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 3;
+    aDimSize[1] = 2;
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+    DTYPE aData[3][2] = { {1.0F, -2.0F},
+                          {0.0F, 4.0F},
+                          {5.0F, -6.0F} };
+    DTYPE answer[3][2] = { {1.0F, -1.0F},
+                           {0.0F, 1.0F},
+                           {1.0F, -1.0F} };
+    /* CPU test */
+    bool cpuTest = true;
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+    /* create float16 tensor */
+    XTensor aHalfGPU;
+    XTensor bHalfGPU;
+    XTensor aMeHalfGPU;
+    XTensor bUserHalfGPU;
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);
+    aMeGPU->SetData(aData, aUnitNum);
+    /* convert data type from float to float16 */
+    aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
+    aMeHalfGPU = ConvertDataType(*aMeGPU, X_FLOAT16);
+    bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
+    /* call clip function */
+    _Clip(&aHalfGPU, &bHalfGPU, -1.0, 1.0);
+    _ClipMe(&aMeHalfGPU, -1.0, 1.0);
+    bUserHalfGPU = Clip(aHalfGPU, -1.0, 1.0);
+    /* convert data type from float16 to float */
+    _ConvertDataType(&bHalfGPU, bGPU);
+    _ConvertDataType(&aMeHalfGPU, aMeGPU);
+    bUserGPU = ConvertDataType(bUserHalfGPU, X_FLOAT);
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+    /* destroy variables */
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+    delete[] aDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete[] aDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/*
+case 3: int32 test Clip function.
+Set every entry to its clip value.
+*/
+bool TestClip3()
+{
+    /* a tensor of size (3, 2) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 3;
+    aDimSize[1] = 2;
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+    DTYPE aData[3][2] = { {1.0F, -2.0F},
+                          {0.0F, 4.0F},
+                          {5.0F, -6.0F} };
+    DTYPE answer[3][2] = { {1.0F, -1.0F},
+                           {0.0F, 1.0F},
+                           {1.0F, -1.0F} };
+    /* CPU test */
+    bool cpuTest = true;
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+    /* create int32 tensor */
+    XTensor aInt32GPU;
+    XTensor bInt32GPU;
+    XTensor aMeInt32GPU;
+    XTensor bUserInt32GPU;
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);
+    aMeGPU->SetData(aData, aUnitNum);
+    /* convert data type from float to int32 */
+    aInt32GPU = ConvertDataType(*aGPU, X_INT);
+    aMeInt32GPU = ConvertDataType(*aMeGPU, X_INT);
+    bInt32GPU = ConvertDataType(*bGPU, X_INT);
+    /* call clip function */
+    _Clip(&aInt32GPU, &bInt32GPU, -1.0, 1.0);
+    _ClipMe(&aMeInt32GPU, -1.0, 1.0);
+    bUserInt32GPU = Clip(aInt32GPU, -1.0, 1.0);
+    /* convert data type from int32 to float */
+    _ConvertDataType(&bInt32GPU, bGPU);
+    _ConvertDataType(&aMeInt32GPU, aMeGPU);
+    bUserGPU = ConvertDataType(bUserInt32GPU, X_FLOAT);
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+    /* destroy variables */
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+    delete[] aDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete[] aDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/*
+case 4: int8 test Clip function.
+Set every entry to its clip value.
+*/
+bool TestClip4()
+{
+    /* a tensor of size (3, 2) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 3;
+    aDimSize[1] = 2;
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+    DTYPE aData[3][2] = { {1.0F, -2.0F},
+                          {0.0F, 4.0F},
+                          {5.0F, -6.0F} };
+    DTYPE answer[3][2] = { {1.0F, -1.0F},
+                           {0.0F, 1.0F},
+                           {1.0F, -1.0F} };
+    /* CPU test */
+    bool cpuTest = true;
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+    /* create int8 tensor */
+    XTensor aInt8GPU;
+    XTensor bInt8GPU;
+    XTensor aMeInt8GPU;
+    XTensor bUserInt8GPU;
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);
+    aMeGPU->SetData(aData, aUnitNum);
+    /* convert data type from float to int8 */
+    aInt8GPU = ConvertDataType(*aGPU, X_INT8);
+    aMeInt8GPU = ConvertDataType(*aMeGPU, X_INT8);
+    bInt8GPU = ConvertDataType(*bGPU, X_INT8);
+    /* call clip function */
+    _Clip(&aInt8GPU, &bInt8GPU, -1.0, 1.0);
+    _ClipMe(&aMeInt8GPU, -1.0, 1.0);
+    bUserInt8GPU = Clip(aInt8GPU, -1.0, 1.0);
+    /* convert data type from int8 to float */
+    _ConvertDataType(&bInt8GPU, bGPU);
+    _ConvertDataType(&aMeInt8GPU, aMeGPU);
+    bUserGPU = ConvertDataType(bUserInt8GPU, X_FLOAT);
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+    /* destroy variables */
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+    delete[] aDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete[] aDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
 /* other cases */
 /*
 TODO!!
@@ -137,6 +381,36 @@ bool TestClip()
     else
         XPRINT(0, stdout, ">> case 1 passed!\n");
+    /* case 2 test */
+    caseFlag = TestClip2();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    /* case 3 test */
+    caseFlag = TestClip3();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 3 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 3 passed!\n");
+    /* case 4 test */
+    caseFlag = TestClip4();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 4 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 4 passed!\n");
     /* other cases test */
     /*
     TODO!!
......
@@ -17,7 +17,6 @@
 /*
  * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
- * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 int8 added
 */
 #include "TConvertDataType.h"
@@ -27,7 +26,7 @@
 /*
 case 1: test ConvertDataType function.
-In this case, the float32 data type is converted to int32 data type.
+In this case, the flaot32 data type is converted to int32 data type.
 */
 bool TestConvertDataType1()
@@ -178,7 +177,7 @@ bool TestConvertDataType2()
 /*
 case 3: test ConvertDataType function.
-In this case, the float32 data type is converted to float16 data type.
+In this case, the float data type is converted to float16 data type.
 */
 bool TestConvertDataType3()
 {
@@ -291,130 +290,6 @@ bool TestConvertDataType3()
 #endif // USE_CUDA
 }
-/*
-case 4: test ConvertDataType function.
-In this case, the float32 data type is converted to int8 data type.
-*/
-bool TestConvertDataType4()
-{
-    /* a tensor of size (3, 2) */
-    int aOrder = 2;
-    int * aDimSize = new int[aOrder];
-    aDimSize[0] = 3;
-    aDimSize[1] = 2;
-    int aUnitNum = 1;
-    for (int i = 0; i < aOrder; i++)
-        aUnitNum *= aDimSize[i];
-    DTYPE aData[3][2] = { {1.0F, 2.0F},
-                          {0.5F, 4.0F},
-                          {5.0F, 6.0F} };
-    int answer[3][2] = { {1, 2},
-                         {0, 4},
-                         {5, 6} };
-    /* CPU test */
-    bool cpuTest = true;
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-    /* create tensor */
-    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT8, 1.0F, 0);
-    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * dGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
-    /* Initialize variables */
-    aGPU->SetData(aData, aUnitNum);
-    /* call ConvertDataType function */
-    _ConvertDataType(aGPU, bGPU);
-    _ConvertDataType(bGPU, cGPU);
-    _ConvertDataType(cGPU, dGPU);
-    /* check results */
-    gpuTest = dGPU->CheckData(answer, aUnitNum);
-    /* destroy variables */
-    delete aGPU;
-    delete bGPU;
-    delete cGPU;
-    delete dGPU;
-    delete[] aDimSize;
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete[] aDimSize;
-    return cpuTest;
-#endif // USE_CUDA
-}
-/*
-case 5: test ConvertDataType function.
-In this case, the int data type is converted to int8 data type.
-*/
-bool TestConvertDataType5()
-{
-    /* a tensor of size (3, 2) */
-    int aOrder = 2;
-    int * aDimSize = new int[aOrder];
-    aDimSize[0] = 3;
-    aDimSize[1] = 2;
-    int aUnitNum = 1;
-    for (int i = 0; i < aOrder; i++)
-        aUnitNum *= aDimSize[i];
-    int aData[3][2] = { {1, 2},
-                        {0, 4},
-                        {5, 6} };
-    int answer[3][2] = { {1, 2},
-                        {0, 4},
-                        {5, 6} };
-    /* CPU test */
-    bool cpuTest = true;
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-    /* create tensor */
-    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
-    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT8, 1.0F, 0);
-    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
-    /* Initialize variables */
-    aGPU->SetData(aData, aUnitNum);
-    /* call ConvertDataType function */
-    _ConvertDataType(aGPU, bGPU);
-    _ConvertDataType(bGPU, cGPU);
-    /* check results */
-    gpuTest = cGPU->CheckData(answer, aUnitNum);
-    /* destroy variables */
-    delete aGPU;
-    delete bGPU;
-    delete cGPU;
-    delete[] aDimSize;
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete[] aDimSize;
-    return cpuTest;
-#endif // USE_CUDA
-}
 /* other cases */
 /*
 TODO!!
@@ -456,26 +331,6 @@ bool TestConvertDataType()
     else
         XPRINT(0, stdout, ">> case 3 passed!\n");
-    /* case 4 test */
-    caseFlag = TestConvertDataType4();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 4 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 4 passed!\n");
-    /* case 5 test */
-    caseFlag = TestConvertDataType5();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 5 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 5 passed!\n");
     /* other cases test */
     /*
     TODO!!
......
@@ -17,11 +17,9 @@
 /*
  * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
- * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
 */
 #include "TDiv.h"
-#include "../core/getandset/ConvertDataType.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -150,120 +148,6 @@ bool TestDiv1()
 #endif // USE_CUDA
 }
-/*
-case 2: float16 element-wise division of two tensors
-c(i) = a(i)/b(i) + \alpha * c(i)
-In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
-*/
-bool TestDiv2()
-{
-    /* a source tensor of size (2, 2) */
-    int sOrder1 = 2;
-    int * sDimSize1 = new int[sOrder1];
-    sDimSize1[0] = 2;
-    sDimSize1[1] = 2;
-    int sUnitNum1 = 1;
-    for (int i = 0; i < sOrder1; i++)
-        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size (2, 2) */
-    int sOrder2 = 2;
-    int * sDimSize2 = new int[sOrder2];
-    sDimSize2[0] = 2;
-    sDimSize2[1] = 2;
-    int sUnitNum2 = 1;
-    for (int i = 0; i < sOrder2; i++)
-        sUnitNum2 *= sDimSize2[i];
-    /* a target tensor of size (2, 2) */
-    int tOrder = 2;
-    int * tDimSize = new int[tOrder];
-    tDimSize[0] = 2;
-    tDimSize[1] = 2;
-    int tUnitNum = 1;
-    for (int i = 0; i < tOrder; i++)
-        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][2] = { {0.0F, 1.0F},
-                           {2.0F, 3.0F} };
-    DTYPE sData2[2][2] = { {1.0F, 1.0F},
-                           {4.0F, 9.0F} };
-    DTYPE answer[2][2] = { {0.0F, 1.0F},
-                           {0.5F, 0.3333F} };
-    /* CPU test */
-    bool cpuTest = true;
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-    /* create tensor */
-    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-    XTensor tUserGPU;
-    /* create float16 tensor */
-    XTensor sHalfGPU1;
-    XTensor sHalfGPU2;
-    XTensor tHalfGPU;
-    XTensor tMeHalfGPU;
-    XTensor tUserHalfGPU;
-    /* Initialize variables */
-    sGPU1->SetData(sData1, sUnitNum1);
-    tMeGPU->SetData(sData1, sUnitNum1);
-    sGPU2->SetData(sData2, sUnitNum2);
-    tGPU->SetZeroAll();
-    /* convert data type from float to float16 */
-    sHalfGPU1 = ConvertDataType(*sGPU1, X_FLOAT16);
-    sHalfGPU2 = ConvertDataType(*sGPU2, X_FLOAT16);
-    tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
-    tMeHalfGPU = ConvertDataType(*tMeGPU, X_FLOAT16);
-    /* call div function */
-    _Div(&sHalfGPU1, &sHalfGPU2, &tHalfGPU, 0, 0);
-    _DivMe(&tMeHalfGPU, &sHalfGPU2, 0, 0);
-    tUserHalfGPU = Div(sHalfGPU1, sHalfGPU2, 0);
-    /* convert data type from float16 to float */
-    _ConvertDataType(&tHalfGPU, tGPU);
-    _ConvertDataType(&tMeHalfGPU, tMeGPU);
-    tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
-    /* check results */
-    gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F) &&
-              tMeGPU->CheckData(answer, tUnitNum, 1e-4F) &&
-              tUserGPU.CheckData(answer, tUnitNum, 1e-4F);
-    /* destroy variables */
-    delete sGPU1;
-    delete sGPU2;
-    delete tGPU;
-    delete tMeGPU;
-    delete[] sDimSize1;
-    delete[] sDimSize2;
-    delete[] tDimSize;
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete[] sDimSize1;
-    delete[] sDimSize2;
-    delete[] tDimSize;
-    return cpuTest;
-#endif // USE_CUDA
-}
 /* other cases */
 /*
 TODO!!
@@ -285,16 +169,6 @@ bool TestDiv()
     else
         XPRINT(0, stdout, ">> case 1 passed!\n");
-    /* case 2 test */
-    caseFlag = TestDiv2();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 2 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 2 passed!\n");
     /* other cases test */
     /*
     TODO!!
......
...@@ -364,111 +364,6 @@ bool TestGather3() ...@@ -364,111 +364,6 @@ bool TestGather3()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 4: float16 gather indexed sub-tensors
In this case, (3, 3) -> (2, 3), dim = 0,
srcIndex = [0, 2]
*/
bool TestGather4()
{
/* a input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{1.0F, 2.0F, 4.0F} };
DTYPE answer[2][3] = { {0.0F, -1.0F, 2.0F},
{1.0F, 2.0F, 4.0F} };
int dim = 0;
int indexSize = 2;
int srcIndex[2] = { 0, 2 };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor tHalfGPU;
XTensor tUserHalfGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
indexGPU->SetData(srcIndex, indexSize);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call gather function */
_Gather(&sHalfGPU, &tHalfGPU, indexGPU);
tUserHalfGPU = Gather(sHalfGPU, *indexGPU);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
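The expected output above follows directly from the gather rule along dim 0; a minimal plain-array sketch using the test's own data (not the library call):
/* reference sketch of gather along dim 0: t[i][j] = s[srcIndex[i]][j] */
void GatherDim0Ref(const float s[3][3], const int srcIndex[2], float t[2][3])
{
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 3; j++)
            t[i][j] = s[srcIndex[i]][j];   /* picks rows 0 and 2 of s */
}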
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -498,7 +393,7 @@ bool TestGather() ...@@ -498,7 +393,7 @@ bool TestGather()
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */ /* case 2 test */
caseFlag = TestGather3(); caseFlag = TestGather3();
if (!caseFlag) { if (!caseFlag) {
returnFlag = false; returnFlag = false;
...@@ -507,15 +402,6 @@ bool TestGather() ...@@ -507,15 +402,6 @@ bool TestGather()
else else
XPRINT(0, stdout, ">> case 3 passed!\n"); XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestGather4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#define __TEST_GATHER_H__ #define __TEST_GATHER_H__
#include "../core/movement/Gather.h" #include "../core/movement/Gather.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -17,12 +17,10 @@ ...@@ -17,12 +17,10 @@
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/ */
#include "../XTensor.h" #include "../XTensor.h"
#include "THardTanH.h" #include "THardTanH.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -224,80 +222,6 @@ bool TestHardTanH2() ...@@ -224,80 +222,6 @@ bool TestHardTanH2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 3: float16 test HardTanH function.
y = 1 if x > 1
x if -1 <= x <= 1
-1 if x < -1
*/
bool TestHardTanH3()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE answer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor yUserHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
/* call hardtanh function */
_HardTanH(&xHalfGPU, &yHalfGPU);
yUserHalfGPU = HardTanH(xHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
yUserGPU = ConvertDataType(yUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
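The answers above come straight from the piecewise definition quoted in the case comment; a one-function reference sketch:
/* reference sketch of HardTanH: clamp x into [-1, 1] */
float HardTanHRef(float x)
{
    if (x > 1.0F)  return 1.0F;
    if (x < -1.0F) return -1.0F;
    return x;                      /* e.g. 3.5F -> 1.0F, -4.5F -> -1.0F */
}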
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -329,16 +253,6 @@ bool TestHardTanH() ...@@ -329,16 +253,6 @@ bool TestHardTanH()
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestHardTanH3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/ */
#include "../XUtility.h" #include "../XUtility.h"
...@@ -313,82 +312,260 @@ bool TestLogSoftmax3() ...@@ -313,82 +312,260 @@ bool TestLogSoftmax3()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* other cases */
/*
TODO!!
*/
/* /*
case 4: float16 test LogSoftmax function. case 4: float16 test LogSoftmax function.
LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i}) LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
*/ */
bool TestLogSoftmax4() bool TestLogSoftmax4()
{ {
/* a tensor of size (2, 3) */ /* a tensor of size (2, 3) */
int order = 2; int order = 2;
int * dimSize = new int[order]; int * dimSize = new int[order];
dimSize[0] = 2; dimSize[0] = 2;
dimSize[1] = 3; dimSize[1] = 3;
int unitNum = 1; int unitNum = 1;
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F}, DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5F, 0.7F, 1.4F} }; {0.5F, 0.7F, 1.4F} };
DTYPE answer[2][3] = { {-2.4076F, -1.4076F, -0.4076F}, DTYPE answer[2][3] = { {-2.4076F, -1.4076F, -0.4076F},
{-1.5435F, -1.3435F, -0.6435F} }; {-1.5435F, -1.3435F, -0.6435F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensors */ /* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0); XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0); XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU; XTensor yUserGPU;
/* create float16 tensors */ /* create float16 tensors */
XTensor xHalfGPU; XTensor xHalfGPU;
XTensor yHalfGPU; XTensor yHalfGPU;
XTensor yUserHalfGPU; XTensor yUserHalfGPU;
/* initialize variables */ /* initialize variables */
xGPU->SetData(xData, unitNum); xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll(); yGPU->SetZeroAll();
/* convert data type from float to float16 */ /* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16); xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16); yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
/* call logsoftmax function */ /* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1); _LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
yUserHalfGPU = LogSoftmax(xHalfGPU, 1); yUserHalfGPU = LogSoftmax(xHalfGPU, 1);
/* convert data type from float16 to float */ /* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU); _ConvertDataType(&yHalfGPU, yGPU);
yUserGPU = ConvertDataType(yHalfGPU, X_FLOAT); yUserGPU = ConvertDataType(yUserHalfGPU, X_FLOAT);
/* check result */ /* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-1F) && yUserGPU.CheckData(answer, unitNum, 1e-1F); gpuTest = yGPU->CheckData(answer, unitNum, 1e-2F) &&
yUserGPU.CheckData(answer, unitNum, 1e-2F);
/* destroy variables */ /* destroy variables */
delete xGPU; delete xGPU;
delete yGPU; delete yGPU;
delete[] dimSize; delete[] dimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete[] dimSize; delete[] dimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA
}
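The reference answers can be reproduced by hand: for the row {0, 1, 2}, log(e^0 + e^1 + e^2) is about 2.4076, and y_i = x_i - 2.4076. A minimal sketch in standard C math (not the library call):
#include <cmath>
#include <cstdio>
/* reference sketch: y_i = x_i - log(sum_k e^{x_k}) for one row */
int main()
{
    float x[3] = {0.0F, 1.0F, 2.0F};
    float sum = 0.0F;
    for (int i = 0; i < 3; i++)
        sum += expf(x[i]);
    for (int i = 0; i < 3; i++)
        printf("%.4f ", x[i] - logf(sum));   /* -2.4076 -1.4076 -0.4076 */
    printf("\n");
    return 0;
}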
/*
case 5: float16 test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
In this case, LossName=CROSSENTROPY.
*/
bool TestLogSoftmax5()
{
/* a tensor of size (1, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[1][3] = { 0.0F, 1.0F, 2.0F };
DTYPE gData[1][3] = { 0.5F, 0.8F, 1.5F };
DTYPE yAnswer[1][3] = { -2.4076F, -1.4076F, -0.4076F };
DTYPE dedxAnswer[1][3] = { -0.4100F, -0.5553F, -0.8348F };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor gHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
gHalfGPU = ConvertDataType(*gGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
/* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
/* call logsoftmaxbackward function */
_LogSoftmaxBackward(&gHalfGPU, &yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU, NULL, 1, CROSSENTROPY);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-2F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-2F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
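The dedx answer above is consistent with the usual cross-entropy-through-log-softmax rule dE/dx_i = softmax(x)_i - g_i; this is a reading of the expected values, not of the backward kernel itself. A sketch that reproduces them:
#include <cmath>
#include <cstdio>
/* reference sketch: dE/dx_i = e^{y_i} - g_i = softmax(x)_i - g_i */
int main()
{
    float x[3] = {0.0F, 1.0F, 2.0F};
    float g[3] = {0.5F, 0.8F, 1.5F};
    float sum = 0.0F;
    for (int i = 0; i < 3; i++)
        sum += expf(x[i]);
    for (int i = 0; i < 3; i++)
        printf("%.4f ", expf(x[i]) / sum - g[i]); /* -0.4100 -0.5553 -0.8348 */
    printf("\n");
    return 0;
}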
/*
case 6: float16 test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
In this case, LossName=SQUAREDERROR.
*/
bool TestLogSoftmax6()
{
/* a tensor of size (1, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[1][3] = { 0.0F, 1.0F, 2.0F };
DTYPE gData[1][3] = { 0.5F, 0.8F, 1.5F };
DTYPE yAnswer[1][3] = { -2.4076F, -1.4076F, -0.4076F };
DTYPE dedxAnswer[1][3] = { -0.4100F, -0.5553F, -0.8348F };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor gHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
gHalfGPU = ConvertDataType(*gGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
/* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
/* call logsoftmaxbackward function */
_LogSoftmaxBackward(&gHalfGPU, &yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU, NULL, 1, SQUAREDERROR);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-2F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-2F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
/* other cases */
/*
TODO!!
*/
/* test for LogSoftmax Function */ /* test for LogSoftmax Function */
bool TestLogSoftmax() bool TestLogSoftmax()
...@@ -426,15 +603,35 @@ bool TestLogSoftmax() ...@@ -426,15 +603,35 @@ bool TestLogSoftmax()
else else
XPRINT(0, stdout, ">> case 3 passed!\n"); XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */ /* case 4 test */
caseFlag = TestLogSoftmax4(); caseFlag = TestLogSoftmax4();
if (!caseFlag) { if (!caseFlag) {
returnFlag = false; returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n"); XPRINT(0, stdout, ">> case 4 failed!\n");
} }
else else
XPRINT(0, stdout, ">> case 4 passed!\n"); XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestLogSoftmax5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestLogSoftmax6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */ /* other cases test */
/* /*
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#define __TEST_MATRIXMUL_H__ #define __TEST_MATRIXMUL_H__
#include "../core/arithmetic/MatrixMul.h" #include "../core/arithmetic/MatrixMul.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "TMultiplyDim.h" #include "TMultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h" #include "../core/arithmetic/MultiplyDim.h"
#include "../XTensor.h" #include "../XTensor.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
...@@ -248,6 +249,203 @@ bool TestMultiplyDim2() ...@@ -248,6 +249,203 @@ bool TestMultiplyDim2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 3: float16 tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting.
In this case, (2, 4) * (2) = (2, 4), n = 0.
*/
bool TestMultiplyDim3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = { 1.0F, -1.0F };
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensors */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call multiplydim function */
_MultiplyDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_MultiplyDimMe(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = MultiplyDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
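The broadcasting in this case is just a per-row scale; a minimal reference sketch for n = 0 (alpha = 0, so the previous contents of c are ignored):
/* reference sketch of MultiplyDim with n = 0: c[i][j] = a[i][j] * b[i] */
void MultiplyDim0Ref(const float a[2][4], const float b[2], float c[2][4])
{
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 4; j++)
            c[i][j] = a[i][j] * b[i];   /* row 0 kept, row 1 negated here */
}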
/*
case 4: float16 tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting.
In this case, (2, 4) * (4) = (2, 4), n = 1.
*/
bool TestMultiplyDim4()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[4] = { 1.0F, -1.0F , 1.0F, -1.0F };
DTYPE answer[2][4] = { {0.0F, -1.0F, 2.0F, -3.0F},
{4.0F, -5.0F, 6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensors */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call multiplydim function */
_MultiplyDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_MultiplyDimMe(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = MultiplyDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* test for MultiplyDim Function */ /* test for MultiplyDim Function */
bool TestMultiplyDim() bool TestMultiplyDim()
{ {
...@@ -272,6 +470,24 @@ bool TestMultiplyDim() ...@@ -272,6 +470,24 @@ bool TestMultiplyDim()
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestMultiplyDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestMultiplyDim4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include "TNegate.h" #include "TNegate.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -191,6 +192,84 @@ bool TestNegate2() ...@@ -191,6 +192,84 @@ bool TestNegate2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: float16 set every entry to its negated value */
bool TestNegate3()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{-3.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {-1.0F, 2.0F},
{3.0F, -4.0F},
{-5.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create float16 tensors */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor aMeHalfGPU;
XTensor bUserHalfGPU;
/* initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
aMeHalfGPU = ConvertDataType(*aMeGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
/* call negate function */
_Negate(&aHalfGPU, &bHalfGPU);
_NegateMe(&aMeHalfGPU);
bUserHalfGPU = Negate(aHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&bHalfGPU, bGPU);
_ConvertDataType(&aMeHalfGPU, aMeGPU);
bUserGPU = ConvertDataType(bUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -222,6 +301,16 @@ bool TestNegate() ...@@ -222,6 +301,16 @@ bool TestNegate()
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestNegate3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -17,11 +17,9 @@ ...@@ -17,11 +17,9 @@
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/ */
#include "TReduceMax.h" #include "TReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -88,8 +86,8 @@ bool TestReduceMax1() ...@@ -88,8 +86,8 @@ bool TestReduceMax1()
tUser2 = ReduceMax(*s, 1); tUser2 = ReduceMax(*s, 1);
/* check results */ /* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) && cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2); && t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -114,8 +112,8 @@ bool TestReduceMax1() ...@@ -114,8 +112,8 @@ bool TestReduceMax1()
tUserGPU2 = ReduceMax(*sGPU, 1); tUserGPU2 = ReduceMax(*sGPU, 1);
/* check results */ /* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) && gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2); && tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */ /* destroy variables */
delete s; delete s;
...@@ -142,113 +140,6 @@ bool TestReduceMax1() ...@@ -142,113 +140,6 @@ bool TestReduceMax1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: float16 test ReduceMax function: get the max value of the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
bool TestReduceMax2()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (4) */
int tOrder1 = 1;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 4;
int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
tUnitNum1 *= tDimSize1[i];
/* an output tensor of size (2) */
int tOrder2 = 1;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
{4.0F, 1.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0F, 5.0F, 6.0F, 7.0F};
DTYPE answer2[2] = {5.0F, 7.0F};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor tHalfGPU1;
XTensor tHalfGPU2;
XTensor tUserHalfGPU1;
XTensor tUserHalfGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tHalfGPU1 = ConvertDataType(*tGPU1, X_FLOAT16);
tHalfGPU2 = ConvertDataType(*tGPU2, X_FLOAT16);
/* call reducemax function */
_ReduceMax(&sHalfGPU, &tHalfGPU1, 0);
_ReduceMax(&sHalfGPU, &tHalfGPU2, 1);
tUserHalfGPU1 = ReduceMax(sHalfGPU, 0);
tUserHalfGPU2 = ReduceMax(sHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU1, tGPU1);
_ConvertDataType(&tHalfGPU2, tGPU2);
tUserGPU1 = ConvertDataType(tUserHalfGPU1, X_FLOAT);
tUserGPU2 = ConvertDataType(tUserHalfGPU2, X_FLOAT);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA
}
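Both expected vectors follow from a plain max over the reduced axis; a reference sketch for the (2, 4) input:
/* reference sketch: max over dim 0 gives t0[4], max over dim 1 gives t1[2] */
void ReduceMaxRef(const float s[2][4], float t0[4], float t1[2])
{
    for (int j = 0; j < 4; j++) {
        t0[j] = s[0][j];
        if (s[1][j] > t0[j])
            t0[j] = s[1][j];           /* {4, 5, 6, 7} for the test data */
    }
    for (int i = 0; i < 2; i++) {
        t1[i] = s[i][0];
        for (int j = 1; j < 4; j++)
            if (s[i][j] > t1[i])
                t1[i] = s[i][j];       /* {5, 7} for the test data */
    }
}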
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -269,15 +160,6 @@ bool TestReduceMax() ...@@ -269,15 +160,6 @@ bool TestReduceMax()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMax2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -17,12 +17,10 @@ ...@@ -17,12 +17,10 @@
/* /*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/ */
#include "TReduceSum.h" #include "TReduceSum.h"
#include "../core/getandset/SetData.h" #include "../core/getandset/SetData.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -516,6 +514,7 @@ bool TestReduceSum5() ...@@ -516,6 +514,7 @@ bool TestReduceSum5()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* /*
case 6: test ReduceSum function. case 6: test ReduceSum function.
Sum the items along a dimension of the tensor. Sum the items along a dimension of the tensor.
...@@ -608,126 +607,6 @@ bool TestReduceSum6() ...@@ -608,126 +607,6 @@ bool TestReduceSum6()
} }
/*
case 7: float16 test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
bool TestReduceSum7()
{
/* a tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (4) */
int tOrder1 = 1;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 4;
int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
tUnitNum1 *= tDimSize1[i];
/* a tensor of size (2) */
int tOrder2 = 1;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0F, 6.0F, 8.0F, 10.0F};
DTYPE answer2[2] = {6.0F, 22.0F};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor shiftHalfGPU1;
XTensor shiftHalfGPU2;
XTensor tHalfGPU1;
XTensor tHalfGPU2;
XTensor tUserHalfGPU1;
XTensor tUserHalfGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU1->SetZeroAll();
shiftGPU2->SetZeroAll();
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
shiftHalfGPU1 = ConvertDataType(*shiftGPU1, X_FLOAT16);
shiftHalfGPU2 = ConvertDataType(*shiftGPU2, X_FLOAT16);
tHalfGPU1 = ConvertDataType(*tGPU1, X_FLOAT16);
tHalfGPU2 = ConvertDataType(*tGPU2, X_FLOAT16);
/* call reducesum function */
_ReduceSum(&sHalfGPU, &tHalfGPU1, 0);
_ReduceSum(&sHalfGPU, &tHalfGPU2, 1);
tUserHalfGPU1 = ReduceSum(sHalfGPU, 0, shiftHalfGPU1);
tUserHalfGPU2 = ReduceSum(sHalfGPU, 1, shiftHalfGPU2);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU1, tGPU1);
_ConvertDataType(&tHalfGPU2, tGPU2);
tUserGPU1 = ConvertDataType(tUserHalfGPU1, X_FLOAT);
tUserGPU2 = ConvertDataType(tUserHalfGPU2, X_FLOAT);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete sGPU;
delete shiftGPU1;
delete shiftGPU2;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA
}
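Reading the user-level call, the shift tensor appears to be subtracted before summation (it is all zeros here, so it does not affect the answers); a reference sketch for dim = 0 under that assumption:
/* reference sketch, assuming ReduceSum(s, dim, shift) sums (s - shift):
   dim = 0 over a (2, 4) input */
void ReduceSumDim0Ref(const float s[2][4], const float shift[4], float t[4])
{
    for (int j = 0; j < 4; j++) {
        t[j] = 0.0F;
        for (int i = 0; i < 2; i++)
            t[j] += s[i][j] - shift[j]; /* {4, 6, 8, 10} for the test data */
    }
}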
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -793,15 +672,6 @@ bool TestReduceSum() ...@@ -793,15 +672,6 @@ bool TestReduceSum()
else else
XPRINT(0, stdout, ">> case 6 passed!\n"); XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestReduceSum7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include "TScaleAndShift.h" #include "TScaleAndShift.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -113,6 +114,252 @@ bool TestScaleAndShift1() ...@@ -113,6 +114,252 @@ bool TestScaleAndShift1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: float16 scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift2()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
{8.5F, 10.5F, 12.5F, 14.5F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 0.5F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor tHalfGPU;
XTensor tMeHalfGPU;
XTensor tUserHalfGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tMeHalfGPU = ConvertDataType(*tMeGPU, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call scaleandshift function */
_ScaleAndShift(&sHalfGPU, &tHalfGPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeHalfGPU, scaleFactor, shiftFactor);
tUserHalfGPU = ScaleAndShift(sHalfGPU, scaleFactor, shiftFactor);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
_ConvertDataType(&tMeHalfGPU, tMeGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: int32 scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift3()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {1.0F, 3.0F, 5.0F, 7.0F},
{9.0F, 11.0F, 13.0F, 15.0F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 1.8F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create int32 tensors */
XTensor sInt32GPU;
XTensor tInt32GPU;
XTensor tMeInt32GPU;
XTensor tUserInt32GPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to int32 */
sInt32GPU = ConvertDataType(*sGPU, X_INT);
tMeInt32GPU = ConvertDataType(*tMeGPU, X_INT);
tInt32GPU = ConvertDataType(*tGPU, X_INT);
/* call scaleandshift function */
_ScaleAndShift(&sInt32GPU, &tInt32GPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeInt32GPU, scaleFactor, shiftFactor);
tUserInt32GPU = ScaleAndShift(sInt32GPU, scaleFactor, shiftFactor);
/* convert data type from int32 to float */
_ConvertDataType(&tInt32GPU, tGPU);
_ConvertDataType(&tMeInt32GPU, tMeGPU);
tUserGPU = ConvertDataType(tUserInt32GPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
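Note how the expected answers encode the integer behaviour: with shiftFactor = 1.8F, the int32 path yields 0 * 2 + 1.8 -> 1, i.e. the result is truncated back to an integer. A sketch of that reading, inferred from the answers rather than from the kernel source:
/* sketch of the truncation the int32 answers imply (assumed, not verified
   against the kernel): compute in float, then truncate to int */
int ScaleAndShiftInt32Ref(int x, float scale, float shift)
{
    return (int)(x * scale + shift);   /* 0 -> 1, 1 -> 3, ..., 7 -> 15 */
}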
/*
case 4: int8 scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift4()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {1.0F, 3.0F, 5.0F, 7.0F},
{9.0F, 11.0F, 13.0F, 15.0F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 1.8F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create int8 tensors */
XTensor sInt8GPU;
XTensor tInt8GPU;
XTensor tMeInt8GPU;
XTensor tUserInt8GPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to int8 */
sInt8GPU = ConvertDataType(*sGPU, X_INT8);
tMeInt8GPU = ConvertDataType(*tMeGPU, X_INT8);
tInt8GPU = ConvertDataType(*tGPU, X_INT8);
/* call scaleandshift function */
_ScaleAndShift(&sInt8GPU, &tInt8GPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeInt8GPU, scaleFactor, shiftFactor);
tUserInt8GPU = ScaleAndShift(sInt8GPU, scaleFactor, shiftFactor);
/* convert data type from int8 to float */
_ConvertDataType(&tInt8GPU, tGPU);
_ConvertDataType(&tMeInt8GPU, tMeGPU);
tUserGPU = ConvertDataType(tUserInt8GPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -133,6 +380,33 @@ bool TestScaleAndShift() ...@@ -133,6 +380,33 @@ bool TestScaleAndShift()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestScaleAndShift2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestScaleAndShift3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestScaleAndShift4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -29,66 +29,66 @@ bool Test() ...@@ -29,66 +29,66 @@ bool Test()
bool wrong = false; bool wrong = false;
XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n"); XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");
//wrong = !TestAbsolute() || wrong; // wrong = !TestAbsolute() || wrong;
//wrong = !TestClip() || wrong; //wrong = !TestClip() || wrong;
//wrong = !TestCompare() || wrong; // wrong = !TestCompare() || wrong;
//wrong = !TestConcatenate() || wrong; // wrong = !TestConcatenate() || wrong;
//wrong = !TestConcatenateSolely() || wrong; // wrong = !TestConcatenateSolely() || wrong;
//wrong = !TestCos() || wrong; // wrong = !TestCos() || wrong;
//wrong = !TestConvertDataType() || wrong; // wrong = !TestConvertDataType() || wrong;
//wrong = !TestCopyIndexed() || wrong; // wrong = !TestCopyIndexed() || wrong;
//wrong = !TestCopyValues() || wrong; // wrong = !TestCopyValues() || wrong;
//wrong = !TestDiv() || wrong; // wrong = !TestDiv() || wrong;
//wrong = !TestDivDim() || wrong; // wrong = !TestDivDim() || wrong;
//wrong = !TestExp() || wrong; // wrong = !TestExp() || wrong;
//wrong = !TestGather() || wrong; // wrong = !TestGather() || wrong;
//wrong = !TestLog() || wrong; // wrong = !TestLog() || wrong;
//wrong = !TestMatrixMul() || wrong; // wrong = !TestMatrixMul() || wrong;
//wrong = !TestMatrixMul2D() || wrong; // wrong = !TestMatrixMul2D() || wrong;
//wrong = !TestMatrixMul2DParallel() || wrong; // wrong = !TestMatrixMul2DParallel() || wrong;
//wrong = !TestMatrixMulBatched() || wrong; // wrong = !TestMatrixMulBatched() || wrong;
//wrong = !TestMerge() || wrong; // wrong = !TestMerge() || wrong;
//wrong = !TestMultiply() || wrong; // wrong = !TestMultiply() || wrong;
//wrong = !TestMultiplyDim() || wrong; //wrong = !TestMultiplyDim() || wrong;
//wrong = !TestNegate() || wrong; //wrong = !TestNegate() || wrong;
//wrong = !TestNormalize() || wrong; // wrong = !TestNormalize() || wrong;
//wrong = !TestPower() || wrong; // wrong = !TestPower() || wrong;
//wrong = !TestReduceMax() || wrong; // wrong = !TestReduceMax() || wrong;
//wrong = !TestReduceMean() || wrong; // wrong = !TestReduceMean() || wrong;
//wrong = !TestReduceSum() || wrong; // wrong = !TestReduceSum() || wrong;
//wrong = !TestReduceSumAll() || wrong; // wrong = !TestReduceSumAll() || wrong;
//wrong = !TestReduceSumSquared() || wrong; // wrong = !TestReduceSumSquared() || wrong;
//wrong = !TestReduceVariance() || wrong; // wrong = !TestReduceVariance() || wrong;
//wrong = !TestRound() || wrong; // wrong = !TestRound() || wrong;
//wrong = !TestScaleAndShift() || wrong; wrong = !TestScaleAndShift() || wrong;
//wrong = !TestSelect() || wrong; // wrong = !TestSelect() || wrong;
//wrong = !TestSetAscendingOrder() || wrong; // wrong = !TestSetAscendingOrder() || wrong;
//wrong = !TestSetData() || wrong; // wrong = !TestSetData() || wrong;
//wrong = !TestSign() || wrong; // wrong = !TestSign() || wrong;
//wrong = !TestSin() || wrong; // wrong = !TestSin() || wrong;
//wrong = !TestSort() || wrong; // wrong = !TestSort() || wrong;
//wrong = !TestSplit() || wrong; // wrong = !TestSplit() || wrong;
//wrong = !TestSpread() || wrong; // wrong = !TestSpread() || wrong;
//wrong = !TestSub() || wrong; // wrong = !TestSub() || wrong;
wrong = !TestSum() || wrong; //wrong = !TestSum() || wrong;
//wrong = !TestSumByColumnTV() || wrong; // wrong = !TestSumByColumnTV() || wrong;
//wrong = !TestSumByColumnVT() || wrong; // wrong = !TestSumByColumnVT() || wrong;
//wrong = !TestSumDim() || wrong; // wrong = !TestSumDim() || wrong;
//wrong = !TestTan() || wrong; // wrong = !TestTan() || wrong;
//wrong = !TestTranspose() || wrong; // wrong = !TestTranspose() || wrong;
//wrong = !TestTopK() || wrong; // //wrong = !TestTopK() || wrong;
//wrong = !TestUnsqueeze() || wrong; // wrong = !TestUnsqueeze() || wrong;
//wrong = !TestXMem() || wrong; // wrong = !TestXMem() || wrong;
//
//wrong = !TestCrossEntropy() || wrong; // wrong = !TestCrossEntropy() || wrong;
//wrong = !TestDropout() || wrong; ////wrong = !TestDropout() || wrong;
//wrong = !TestHardTanH() || wrong; // wrong = !TestHardTanH() || wrong;
//wrong = !TestIdentity() || wrong; // wrong = !TestIdentity() || wrong;
wrong = !TestLogSoftmax() || wrong; //wrong = !TestLogSoftmax() || wrong;
//wrong = !TestLoss() || wrong; // wrong = !TestLoss() || wrong;
//wrong = !TestRectify() || wrong; // wrong = !TestRectify() || wrong;
//wrong = !TestSigmoid() || wrong; // wrong = !TestSigmoid() || wrong;
//wrong = !TestSoftmax() || wrong; // wrong = !TestSoftmax() || wrong;
/* other test */ /* other test */
/* /*
......