Commit 3187918c by linye

update float16 datatype of Sign, Sub, SubDim, SumDim

parent d10087d8
......@@ -48,6 +48,7 @@ void ReduceSumFP16Test();
void LogSoftmaxFP16Test();
void ClipFP16Test();
void ScaleAndShiftFP16Test();
void InitTensorFP16Test();
using namespace nts;
using namespace fnnlm;
......@@ -87,6 +88,8 @@ int main(int argc, const char ** argv )
//return 0;
//ScaleAndShiftFP16Test();
//return 0;
//InitTensorFP16Test();
//return 0;
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
......@@ -106,6 +109,20 @@ int main(int argc, const char ** argv )
return 0;
}
void InitTensorFP16Test() {
XTensor a;
InitTensor2D(&a, 1, 10, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
XTensor halfA;
halfA = ConvertDataType(a, X_FLOAT16);
halfA.Dump(&halfA, stderr, "halfA:");
XTensor b;
InitTensor2D(&b, 1, 10, X_FLOAT16, 0);
_SetDataRand(&b, -10.0F, 10.0F);
b.Dump(&b, stderr, "b:");
}
void ScaleAndShiftFP16Test() {
XTensor a;
XTensor intA;
......
......@@ -189,7 +189,6 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
}
//XLossGrad lossGrad;
///* we start with the gradient with respect to the loss for output layers */
//for (int i = 0; i < roots.count; i++) {
// XTensor * root = (XTensor*)roots.Get(i);
......@@ -198,11 +197,9 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
// XLink &income = root->income;
// int funcID = income.typeID;
// void * params = income.params;
// /* we compute dE/dx if the output is generated by an activation function y = f(x).
// Note that we do not need to obtain dE/dy here because it is no use in the
// folloing process of back-propagation */
// if (gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)) {
// if (funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
// XTensor * x = income.tails[0];
......
......@@ -481,7 +481,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
///* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
/* automatic differentiation */
......
......@@ -17,6 +17,7 @@
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "../../XDevice.h"
......@@ -33,15 +34,16 @@ set each entry to its sign value (CUDA Kernel)
>> b - pointer to output data array
>> size - size of the data array
*/
template<class T>
__global__
void KernelSign(DTYPE * a, DTYPE * b, int size)
void KernelSign(T * a, T * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (a[i] > 0)
if (i < size){
if (a[i] > (T)0)
b[i] = 1.0F;
else if (a[i] == 0)
else if (a[i] == (T)0)
b[i] = 0.0F;
else
b[i] = -1.0F;
......@@ -49,19 +51,6 @@ void KernelSign(DTYPE * a, DTYPE * b, int size)
}
/*
set each entry to its sign value with float16 data type value (CUDA Kernel)
This is for float16 computation
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(__half * a, __half * b, int size)
{
return;
}
/*
set each entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
......@@ -83,10 +72,10 @@ void _CudaSign(const XTensor * a, XTensor * b)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
KernelSign<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
KernelSign<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -29,12 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its sign value (CUDA Kernel) */
template<class T>
__global__
void KernelSign(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
__global__
void KernelSign(__half * a, __half * b, int size);
void KernelSign(T * a, T * b, int size);
/* set each entry to its sign value */
void _CudaSign(const XTensor * a, XTensor * b);
......
......@@ -17,6 +17,7 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "../../XDevice.h"
......@@ -36,8 +37,9 @@ c = a - b * \beta
>> size - the size of a/b/c
>> beta - the coefficient
*/
template<class T>
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
void KernelSUB(T * a, T * b, T * c, int size, T beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -77,7 +79,20 @@ void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelSUB<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
else if (a->dataType == X_FLOAT16 &&
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
half beta1 = __float2half(beta);
KernelSUB<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, (__half*)c->data, a->unitNum, (__half)beta1);
}
else {
// TODO!!
......
......@@ -29,8 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* subtraction of data arrays (CUDA Kernel) */
template<class T>
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
void KernelSUB(T * a, T * b, T * c, int size, T beta = (T)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) */
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
......
......@@ -17,6 +17,7 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "SubDim.cuh"
......@@ -168,6 +169,34 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
half beta1 = __float2half(beta);
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1);
else
KernelSubWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, beta1);
else
KernelSubWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, beta1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
......
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "SumDim.cuh"
......@@ -170,6 +171,34 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
half beta1 = __float2half(beta);
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelAddWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1);
else
KernelAddWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelAddWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, beta1);
else
KernelAddWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, beta1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
......
......@@ -108,7 +108,7 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
else
KernelScaleAndShift<__half, false, false> << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
}
else if (a->dataType == X_INT) {
else if (a->dataType == X_INT){
int scale2 = int(scale);
int shift2 = int(shift);
......@@ -121,7 +121,7 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
else
KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
}
else if (a->dataType == X_INT8) {
else if (a->dataType == X_INT8){
__int8 scale2 = __int8(scale);
__int8 shift2 = __int8(shift);
......
......@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "TSign.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -110,6 +112,88 @@ bool TestSign1()
#endif // USE_CUDA
}
/*
case 2: float16 test Sign function.
Set every entry to its sign value.
*/
bool TestSign2()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor aMeHalfGPU;
XTensor bUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
aMeHalfGPU = ConvertDataType(*aMeGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
/* call Sign function */
_Sign(&aHalfGPU, &bHalfGPU);
_SignMe(&aMeHalfGPU);
bUserHalfGPU = Sign(aHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&bHalfGPU, bGPU);
_ConvertDataType(&aMeHalfGPU, aMeGPU);
bUserGPU = ConvertDataType(bUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -131,6 +215,16 @@ bool TestSign()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSign2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "TSub.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -214,6 +216,177 @@ bool TestSub2()
#endif // USE_CUDA
}
/* case 3: float16 tensor subtraction c = a - b * \beta */
bool TestSub3()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 5.0F, 8.0F},
{11.0F, 14.0F, 17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call Sub function */
_Sub(&aHalfGPU, &bHalfGPU, &cHalfGPU);
_SubMe(&cMeHalfGPU, &bHalfGPU);
cUserHalfGPU = Sub(aHalfGPU, bHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum, 1e-4F) &&
cMeGPU->CheckData(answer, unitNum, 1e-4F) &&
cUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: float16 tensor subtraction c = a - b * \beta */
bool TestSub4()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++) {
unitNum *= dimSize[i];
}
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {-0.5F, 1.5F, 3.5F, 5.5F},
{7.5F, 9.5F, 11.5F, 13.5F} };
float beta = 0.5F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call Sub function */
_Sub(&aHalfGPU, &bHalfGPU, &cHalfGPU, beta);
_SubMe(&cMeHalfGPU, &bHalfGPU, beta);
cUserHalfGPU = Sub(aHalfGPU, bHalfGPU, beta);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum, 1e-4F) &&
cMeGPU->CheckData(answer, unitNum, 1e-4F) &&
cUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -243,6 +416,24 @@ bool TestSub()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSub3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSub4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,11 +17,13 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "TSubDim.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -249,6 +251,206 @@ bool TestSubDim2()
#endif // USE_CUDA
}
/*
case 3: float16 tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
*/
bool TestSubDim3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sub function */
_SubDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_SubDim(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = SubDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: float16 tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
*/
bool TestSubDim4()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sub function */
_SubDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_SubDim(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = SubDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -278,6 +480,24 @@ bool TestSubDim()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSubDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSubDim4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -17,12 +17,14 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "TSumDim.h"
#include "../XTensor.h"
#include "../core/arithmetic/SumDim.h"
#include "../core/getandset/SetData.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -471,6 +473,310 @@ bool TestSumDim4()
#endif // USE_CUDA
}
/*
case 5: float16 tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case, (2, 4) + (2) = (2, 4), n = 0.
*/
bool TestSumDim5()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{3.0F, 4.0F, 5.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sum function */
_SumDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_SumDim(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = SumDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: float16 tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case, (2, 4) + (2, 2) = (2, 4), n = 1.
*/
bool TestSumDim6()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {1.0F, 0.0F, 1.0F, 4.0F},
{5.0F, 4.0F, 5.0F, 8.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sum function */
_SumDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_SumDim(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = SumDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 7: float16 tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
*/
bool TestSumDim7()
{
/* a tensor of size (20, 40, 4000) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 20;
aDimSize[1] = 40;
aDimSize[2] = 4000;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (40) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 40;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * answer = NewTensor(aOrder, aDimSize);
/* initialize variables */
_SetDataFixed(answer, 1.0F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetZeroAll();
cMeGPU->SetZeroAll();
_SetDataFixed(bGPU, 1.0F);
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sum function */
_SumDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_SumDim(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = SumDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -518,6 +824,33 @@ bool TestSumDim()
//else
// XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSumDim5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestSumDim6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestSumDim7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -63,17 +63,18 @@ bool Test()
//wrong = !TestScaleAndShift() || wrong;
//wrong = !TestSelect() || wrong;
//wrong = !TestSetAscendingOrder() || wrong;
wrong = !TestSetData() || wrong;
//wrong = !TestSetData() || wrong;
//wrong = !TestSign() || wrong;
//wrong = !TestSin() || wrong;
//wrong = !TestSort() || wrong;
//wrong = !TestSplit() || wrong;
//wrong = !TestSpread() || wrong;
//wrong = !TestSub() || wrong;
//wrong = !TestSubDim() || wrong;
//wrong = !TestSum() || wrong;
//wrong = !TestSumByColumnTV() || wrong;
//wrong = !TestSumByColumnVT() || wrong;
//wrong = !TestSumDim() || wrong;
wrong = !TestSumDim() || wrong;
//wrong = !TestTan() || wrong;
//wrong = !TestTranspose() || wrong;
//wrong = !TestTopK() || wrong;
......
......@@ -63,6 +63,7 @@
#include "TSplit.h"
#include "TSpread.h"
#include "TSub.h"
#include "TSubDim.h"
#include "TSum.h"
#include "TSumByColumnTV.h"
#include "TSumByColumnVT.h"
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论