Commit 9d7cb741 by ltb

pull from linye in order to fix the bug that the FNNLM ppl is NaN

parent 41dbf0a9
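The gist of the fix, as reconstructed from the diff below (a sketch, not the author's wording): the FNNLM forward pass now stays in FP32 and crosses into FP16 only immediately around the op under test, so exp/softmax no longer overflows half precision and the reported ppl stops degenerating to NaN. The pattern, using only names that appear in this commit (see ForwardAutoDiffLin further down):

    XTensor pre;
    pre = MMul(hidden, model.outputW) + model.outputB;   // compute in FP32
    XTensor pre16;
    pre16 = ConvertDataType(pre, X_FLOAT16);             // down-cast once
    XTensor out16;
    out16 = Softmax(pre16, 1);                           // the FP16 op being exercised
    output = ConvertDataType(out16, X_FLOAT);            // back to FP32 for CrossEntropy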
...
@@ -15,9 +15,9 @@
 * limitations under the License.
 */

/*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
 */

#include <stdio.h>
#include "XNet.h"
...
@@ -28,996 +28,219 @@
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"

//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
using namespace transformer;

void BackwardTest();
void TransposeTest();
void SumDimTest();
//void SplitBackwardTest();
void ReadFP16Test();
void MemTest();
//void xcTest();
void ConvertDataTypeTest();
void ConvertDataTypeBackwardTest();
void SumFP16Test();
void GatherFP16Test();
void HardTanHFP16Test();
void ReduceMaxFP16Test();
void ReduceSumFP16Test();
void LogSoftmaxFP16Test();
void ClipFP16Test();
void ScaleAndShiftFP16Test();
void InitTensorFP16Test();
void MultiplyDimTime();
void TimeTestGemm();
void TimeTest();
void TimeInt8AndFloat32();
void TestCPUhalf();
int main(int argc, const char ** argv)
{
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if (argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if (argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
else {
fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
}
//xcTest();
//return 0;
//MemTest();
//return 0;
//SplitBackwardTest();
//return 0;
//_CrtSetBreakAlloc(896);
//BackwardTest();
//return 0;
//Test();
//return 0;
//ConvertDataTypeTest();
//return 0;
//ConvertDataTypeBackwardTest();
//return 0;
//SumFP16Test();
//return 0;
//GatherFP16Test();
//return 0;
//HardTanHFP16Test();
//return 0;
//ReduceMaxFP16Test();
//return 0;
//ReduceSumFP16Test();
//return 0;
//LogSoftmaxFP16Test();
//return 0;
//ClipFP16Test();
//return 0;
//ScaleAndShiftFP16Test();
//return 0;
//InitTensorFP16Test();
//return 0;
//_CrtDumpMemoryLeaks();
return 0;
}
void TestCPUhalf() {
int memSize = 1024;
int devId = 0;
int dim1 = 1024;
int dim2 = 32;
XMem * mem;
mem = new XMem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);
XTensor a;
XTensor b;
XTensor c;
//XMem *mem = new XMem(devId, FREE_ON_THE_FLY, 128 * MILLION, 1024, 128 * MILLION);
//mem->SetDesiredSize(0,0,memSize*MILLION);
InitTensor2D(&a, dim1, dim1, X_FLOAT, devId);
InitTensor2D(&b, dim2, dim2, X_FLOAT, devId);
InitTensor2D(&c, dim1, dim1, X_FLOAT, devId);
}
void TimeInt8AndFloat32() {
XMem * mem;
int memSize = 1024;
int devId = 2;
int dim = 512;
mem = new XMem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, dim, dim, X_FLOAT, devId, mem);
InitTensor2D(&b, dim, dim, X_FLOAT, devId, mem);
InitTensor2D(&c, dim, dim, X_FLOAT, devId, mem);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
XTensor inta;
XTensor intb;
XTensor intc;
InitTensor2D(&inta, dim, dim, X_INT, devId, mem);
InitTensor2D(&intb, dim, dim, X_INT, devId, mem);
InitTensor2D(&intc, dim, dim, X_FLOAT, devId, mem);
XTensor tmp;
InitTensor2D(&tmp, dim, dim, X_FLOAT, devId, mem);
tmp.SetDataRand(-100000.0F, 100000.0F);
inta = ConvertDataType(tmp, X_INT8);
intb = ConvertDataType(tmp, X_INT8);
int repeat = 10000;
printf("test on matrixmul\n");
double start_matrixmul32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);
}
double elapsed_matrixmul32 = GetClockSec() - start_matrixmul32;
printf("elapsed_matrixmul32=%.2fs\n", elapsed_matrixmul32);
double start_int8 = GetClockSec();
for (int i = 0; i < repeat; i++) {
_MatrixMul(&inta, X_NOTRANS, &intb, X_NOTRANS, &intc);
}
double elapsed_int8 = GetClockSec() - start_int8;
printf("elapsed_int8=%.2fs\n", elapsed_int8);
}
void TimeTest() {
XMem * mem;
int memSize = 1024;
int devId = 0;
int dim = 512;
mem = new XMem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);
XTensor a;
XTensor b;
XTensor c;
XTensor halfa;
XTensor halfb;
XTensor halfc;
InitTensor2D(&a, dim, dim, X_FLOAT, devId, mem);
InitTensor2D(&b, dim, dim, X_FLOAT, devId, mem);
InitTensor2D(&c, dim, dim, X_FLOAT, devId, mem);
InitTensor2D(&halfc, dim, dim, X_FLOAT16, devId, mem);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
halfa = ConvertDataType(a, X_FLOAT16);
halfb = ConvertDataType(b, X_FLOAT16);
int repeat = 100000;
printf("=========================================\n");
printf("test on sum\n");
double start_sum32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = Sum(&a, &b);
}
double elapsed_sum32 = GetClockSec() - start_sum32;
printf("elapsed_sum32=%.2fs\n", elapsed_sum32);
double start_sum16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = Sum(&halfa, &halfb);
}
double elapsed_sum16 = GetClockSec() - start_sum16;
printf("elapsed_sum16=%.2fs\n", elapsed_sum16);
printf("=========================================\n");
/*printf("test on sub\n");
double start_sub32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = Sub(&a, &b);
}
double elapsed_sub32 = GetClockSec() - start_sub32;
printf("elapsed_sub32=%.2fs\n", elapsed_sub32);
double start_sub16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = Sub(&halfa, &halfb);
}
double elapsed_sub16 = GetClockSec() - start_sub16;
printf("elapsed_sub16=%.2fs\n", elapsed_sub16);
printf("=========================================\n");*/
/*printf("test on div\n");
double start_div32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = Div(&a, &b);
}
double elapsed_div32 = GetClockSec() - start_div32;
printf("elapsed_div32=%.2fs\n", elapsed_div32);
double start_div16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = Div(&halfa, &halfb);
}
double elapsed_div16 = GetClockSec() - start_div16;
printf("elapsed_div16=%.2fs\n", elapsed_div16);
printf("=========================================\n");*/
/*printf("test on multiply\n");
double start_multiply32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = Multiply(&a, &b);
}
double elapsed_multiply32 = GetClockSec() - start_multiply32;
printf("elapsed_multiply32=%.2fs\n", elapsed_multiply32);
double start_multiply16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = Multiply(&halfa, &halfb);
}
double elapsed_multiply16 = GetClockSec() - start_multiply16;
printf("elapsed_multiply16=%.2fs\n", elapsed_multiply16);
printf("=========================================\n");*/
printf("test on scaleandshift\n");
double start_scaleandshift32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = ScaleAndShift(&a, 1, 0);
}
double elapsed_scaleandshift32 = GetClockSec() - start_scaleandshift32;
printf("elapsed_scaleandshift32=%.2fs\n", elapsed_scaleandshift32);
double start_scaleandshift16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = ScaleAndShift(&halfa, 1, 0);
}
double elapsed_scaleandshift16 = GetClockSec() - start_scaleandshift16;
printf("elapsed_scaleandshift16=%.2fs\n", elapsed_scaleandshift16);
printf("=========================================\n");
printf("test on reducesum\n");
double start_reducesum32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = ReduceSum(&a, 1);
}
double elapsed_reducesum32 = GetClockSec() - start_reducesum32;
printf("elapsed_reducesum32=%.2fs\n", elapsed_reducesum32);
double start_reducesum16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = ReduceSum(&halfa, 1);
}
double elapsed_reducesum16 = GetClockSec() - start_reducesum16;
printf("elapsed_reducesum16=%.2fs\n", elapsed_reducesum16);
printf("=========================================\n");
printf("test on reducemax\n");
double start_reducemax32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = ReduceMax(&a, 1);
}
double elapsed_reducemax32 = GetClockSec() - start_reducemax32;
printf("elapsed_reducemax32=%.2fs\n", elapsed_reducemax32);
double start_reducemax16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = ReduceMax(&halfa, 1);
}
double elapsed_reducemax16 = GetClockSec() - start_reducemax16;
printf("elapsed_reducemax16=%.2fs\n", elapsed_reducemax16);
printf("=========================================\n");
printf("test on logsoftmax\n");
double start_logsoftmax32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = LogSoftmax(&a, 1);
}
double elapsed_logsoftmax32 = GetClockSec() - start_logsoftmax32;
printf("elapsed_logsoftmax32=%.2fs\n", elapsed_logsoftmax32);
double start_logsoftmax16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = LogSoftmax(&halfa, 1);
}
double elapsed_logsoftmax16 = GetClockSec() - start_logsoftmax16;
printf("elapsed_logsoftmax16=%.2fs\n", elapsed_logsoftmax16);
printf("=========================================\n");
printf("test on matrixmul\n");
double start_matrixmul32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = MatrixMul(&a, &b);
}
double elapsed_matrixmul32 = GetClockSec() - start_matrixmul32;
printf("elapsed_matrixmul32=%.2fs\n", elapsed_matrixmul32);
double start_matrixmul16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = MatrixMul(&halfa, &halfb);
}
double elapsed_matrixmul16 = GetClockSec() - start_matrixmul16;
printf("elapsed_matrixmul16=%.2fs\n", elapsed_matrixmul16);
printf("=========================================\n");
printf("test on convert\n");
double start_convert32to16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfa = ConvertDataType(a, X_FLOAT16);
}
double elapsed_convert32to16 = GetClockSec() - start_convert32to16;
printf("elapsed_convert32to16=%.2fs\n", elapsed_convert32to16);
double start_convert16to32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
a = ConvertDataType(halfa, X_FLOAT);
}
double elapsed_convert16to32 = GetClockSec() - start_convert16to32;
printf("elapsed_convert16to32=%.2fs\n", elapsed_convert16to32);
printf("=========================================\n");
delete mem;
}
void MultiplyDimTime() {
int memSize = 1024;
int devId = 0;
int dim1 = 1024;
int dim2 = 32;
XMem * mem;
mem = new XMem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);
XTensor a;
XTensor b;
XTensor c;
//XMem *mem = new XMem(devId, FREE_ON_THE_FLY, 128 * MILLION, 1024, 128 * MILLION);
//mem->SetDesiredSize(0,0,memSize*MILLION);
InitTensor2D(&a, dim1, dim1, X_FLOAT, devId);
InitTensor2D(&b, dim2, dim2, X_FLOAT, devId);
InitTensor2D(&c, dim1, dim1, X_FLOAT, devId);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRandn(-1.0F, 1.0F);
int repeat = 2000;
printf("test on MultiplyDim\n");
double start = GetClockSec();
for (int j = 0; j <= repeat; j++) {
c = MultiplyDim(&a, &b, 0);
}
double elapsed = GetClockSec() - start;
printf("elapsed_MultiplyDim32=%.4fs \n", elapsed);
XTensor halfa;
XTensor halfb;
XTensor halfc;
InitTensor2D(&halfc, dim1, dim1, X_FLOAT16, devId, mem);
halfa = ConvertDataType(a, X_FLOAT16);
halfb = ConvertDataType(b, X_FLOAT16);
double starthalf = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = MultiplyDim(&halfa, &halfb, 0);
}
double elapsedhalf = GetClockSec() - starthalf;
printf("elapsed_MultiplyDim16=%.4fs\n", elapsedhalf);
}
void TimeTestGemm() {
XMem * mem;
int memSize = 1024;
mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(0, 0, (MTYPE)memSize * MILLION);
XTensor a;
XTensor b;
XTensor c;
XTensor halfa;
XTensor halfb;
XTensor halfc;
int dim1 = 512;
int dim2 = 1024;
//InitTensor3D(&a, 86, 48, 256, X_FLOAT, 0, mem);
//InitTensor2D(&b, 256, 256, X_FLOAT, 0, mem);
//InitTensor4D(&a, 8, 86, 48, 48, X_FLOAT, 0, mem);
//InitTensor4D(&b, 8, 86, 48, 32, X_FLOAT, 0, mem);
InitTensor2D(&a, dim1, dim2, X_FLOAT, 0, mem);
InitTensor2D(&b, dim1, dim2, X_FLOAT, 0, mem);
//InitTensor4D(&a, 8, 86, 48, 32, X_FLOAT, 0);
//InitTensor4D(&b, 8, 86, 48, 32, X_FLOAT, 0);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
halfa = ConvertDataType(a, X_FLOAT16);
halfb = ConvertDataType(b, X_FLOAT16);
//a.Dump(&a, stderr, "a:", 10);
//b.Dump(&b, stderr, "b:", 10);
//halfa.Dump(&a, stderr, "halfa:", 10);
//halfb.Dump(&b, stderr, "halfb:", 10);
int repeat = 10000;
printf("=========================================\n");
double start_matrixmul16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = BMMul(halfa, X_NOTRANS, halfb, X_TRANS);
}
double elapsed_matrixmul16 = GetClockSec() - start_matrixmul16;
printf("elapsed_matrixmul16=%.4fs\n", elapsed_matrixmul16);
printf("------------------------------------------\n");
double start_matrixmul32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = BMMul(a, X_NOTRANS, b, X_TRANS);
}
double elapsed_matrixmul32 = GetClockSec() - start_matrixmul32;
printf("elapsed_matrixmul32=%.4fs\n", elapsed_matrixmul32);
printf("=========================================\n");
c.Dump(&c, stderr, "c:", 10);
halfc.Dump(&halfc, stderr, "halfc:", 10);
}
void InitTensorFP16Test() {
XTensor a;
InitTensor2D(&a, 1, 10, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
XTensor halfA;
halfA = ConvertDataType(a, X_FLOAT16);
halfA.Dump(&halfA, stderr, "halfA:");
XTensor b;
InitTensor2D(&b, 1, 10, X_FLOAT16, 0);
_SetDataRand(&b, -10.0F, 10.0F);
b.Dump(&b, stderr, "b:");
}
void ScaleAndShiftFP16Test() {
XTensor a;
XTensor intA;
XTensor b;
XTensor intB;
InitTensor2D(&a, 1, 10, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
a.Dump(stderr, "a:");
intA = ConvertDataType(a, X_INT);
intB = ScaleAndShift(intA, 2, 0);
b = ConvertDataType(intB, X_FLOAT);
b.Dump(stderr, "b:");
}
void ClipFP16Test() {
XTensor a;
XTensor intA;
XTensor b;
XTensor intB;
InitTensor2D(&a, 1, 10, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
a.Dump(stderr, "a:");
intA = ConvertDataType(a, X_INT);
intB = Clip(intA, -1, 1);
b = ConvertDataType(intB, X_FLOAT);
b.Dump(stderr, "b:");
}
void LogSoftmaxFP16Test() {
XTensor a;
XTensor halfA;
XTensor b;
XTensor halfB;
InitTensor3D(&a, 2, 2, 2, X_FLOAT, 0);
a.SetDataRand(-1.0F, 1.0F);
halfA = ConvertDataType(a, X_FLOAT16);
b = LogSoftmax(a, 1);
halfB = LogSoftmax(halfA, 1);
b.Dump(stderr, "sum:");
halfB.Dump(&halfB, stderr, "halfSum:");
}
void ReduceSumFP16Test()
{
XTensor a;
XTensor sum;
XTensor halfA;
XTensor halfSum;
InitTensor2D(&a, 10, 10, X_FLOAT, 0);
a.SetDataRand(-5.0F, 5.0F);
halfA = ConvertDataType(a, X_FLOAT16);
sum = ReduceSum(a, 1);
halfSum = ReduceSum(halfA, 1);
sum.Dump(stderr, "sum:");
halfSum.Dump(&halfSum, stderr, "halfSum:");
}
void ReduceMaxFP16Test()
{
XTensor a;
XTensor max;
XTensor halfA;
XTensor halfMax;
InitTensor2D(&a, 10, 10, X_FLOAT, 0);
a.SetDataRand(-5.0F, 5.0F);
halfA = ConvertDataType(a, X_FLOAT16);
max = ReduceMax(a, 1);
halfMax = ReduceMax(halfA, 1);
max.Dump(stderr, "max:");
halfMax.Dump(&halfMax, stderr, "halfMax:");
}
void HardTanHFP16Test()
{
XTensor a;
XTensor b;
XTensor halfA;
XTensor halfB;
InitTensor2D(&a, 5, 5, X_FLOAT, 0);
InitTensor2D(&b, 5, 5, X_FLOAT, 0);
a.SetDataRand(-1.0F, 4.0F);
b.SetDataRand(-1.0F, 4.0F);
halfA = ConvertDataType(a, X_FLOAT16);
halfB = ConvertDataType(b, X_FLOAT16);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
b = HardTanH(a);
halfB = HardTanH(halfA);
b.Dump(stderr, "b:");
halfB.Dump(&halfB, stderr, "halfB:");
}
void GatherFP16Test() {
XTensor a;
XTensor b;
XTensor srcIndex;
XTensor halfA;
XTensor halfB;
XTensor c;
InitTensor1D(&srcIndex, 2, X_INT, 0);
int m = 0;
int n = 1;
srcIndex.Set1DInt(m, 0);
srcIndex.Set1DInt(n, 1);
InitTensor2D(&a, 3, 2, X_FLOAT, 0);
InitTensor2D(&b, 2, 2, X_FLOAT, 0);
InitTensor2D(&halfB, 2, 2, X_FLOAT16, 0);
a.SetDataRand(-5.0F, 5.0F);
halfA = ConvertDataType(a, X_FLOAT16);
a.Dump(stderr, "a:");
_Gather(&a, &b, &srcIndex);
b.Dump(stderr, "b:");
_Gather(&halfA, &halfB, &srcIndex);
c = ConvertDataType(halfB, X_FLOAT);
c.Dump(stderr, "c:");
}
void SumFP16Test()
{
XTensor a;
XTensor b;
XTensor halfA;
XTensor halfB;
InitTensor2D(&a, 5, 5, X_FLOAT, 0);
InitTensor2D(&b, 5, 5, X_FLOAT, 0);
a.SetDataRand(-1.0F, 4.0F);
b.SetDataRand(-1.0F, 4.0F);
halfA = ConvertDataType(a, X_FLOAT16);
halfB = ConvertDataType(b, X_FLOAT16);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
b = Sum(a, b, -0.4F);
halfB = Sum(halfA, halfB, -0.4F);
    b.Dump(stderr, "b:");
    halfB.Dump(&halfB, stderr, "halfB:");
}

void ConvertDataTypeTest()
{
    int rnum = 0;
    for (int i = 0; i <= rnum; i++)
    {
        XTensor a;
        InitTensor2D(&a, 2, 2, X_FLOAT, 0);
        XTensor halfa;
        InitTensor2D(&halfa, 2, 2, X_FLOAT16, 0);

        XTensor a1;
        InitTensor2D(&a1, 2, 2, X_FLOAT, 0);

        a.SetDataRand(-10.0F, 10.0F);
        a.Dump(stderr, "a:");

        halfa = ConvertDataType(a, X_FLOAT16);
        a1 = ConvertDataType(halfa, X_FLOAT);
        a1.Dump(stderr, "halfa:");
    }
}

int main( int argc, const char ** argv )
{
    //_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
    //_CrtSetBreakAlloc(2708);

    //ReadFP16Test();
    //return 0;

    if(argc > 1 && !strcmp(argv[1], "-test"))
        Test();
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
        TransformerMain(argc - 1, argv + 1);
    else{
        fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
        fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
    }

    //_CrtDumpMemoryLeaks();

    return 0;
}
void ConvertDataTypeBackwardTest()
{
    int rnum = 0;
    for (int i = 0; i <= rnum; i++)
    {
        XTensor a;
        InitTensor2D(&a, 2, 2, X_FLOAT, 0);
        a.SetDataRand(2.0F, 2.0F);
        a.Dump(stderr, "a:");
        XTensor halfA;
        XTensor a1;

        halfA = ConvertDataType(a, X_FLOAT16);
        a1 = ConvertDataType(halfA, X_FLOAT);

        a1.grad = NewTensor(&a1);
        a1.grad->SetDataRand(3.0F, 3.0F);
        a1.grad->Dump(stderr, "a1.grad:");
        XNet testBackward;
        printf("1");
        testBackward.Backward(a1);
        printf("2");
        halfA.grad->Dump(stderr, "halfA.grad:");
        a.grad->Dump(stderr, "a.grad:");
    }
}

void ReadFP16Test()
{
    XTensor a;
    InitTensor2D(&a, 2, 3, X_FLOAT, 0);
    a.SetDataRand(1.0, 5.0);
    XTensor halfA;
    halfA = ConvertDataType(a, X_FLOAT16);

    XTensor halfB;
    InitTensor2D(&halfB, 2, 3, X_FLOAT16, 0);
    halfA.Dump(&halfA, stderr, "halfA");

    FILE* fOut1 = fopen("testReadFP16", "w");
    halfA.Dump(&halfA, fOut1, "test:");
    fflush(fOut1);
    fclose(fOut1);

    FILE* fOut2 = fopen("testReadFP16", "r");
    halfB.Read(&halfB, fOut2, "test:");
    fclose(fOut2);

    halfB.Dump(&halfB, stderr, "halfB:");
}
//XTensor * stack(XList& list, int leadingDim)
//{
// size_t size = list.count;
// if (list.count == 0)
// return NULL;
// XTensor * sample = (XTensor*)list.Get(0);
//
// XTensor merge_tensor;
// int order = sample->order;
// int * dim = new int[order];
// for (int i = 0; i < order; i++)
// dim[i] = sample->GetDim(i);
// dim[leadingDim] *= size;
//
// InitTensor(&merge_tensor, order, dim, DEFAULT_DTYPE, sample->denseRatio, sample->devID, sample->mem);
//
// _Merge(&list, &merge_tensor, leadingDim);
// delete[] dim;
//
// order += 1;
// dim = new int[order];
// dim[0] = size;
// for (size_t i = 1; i < order; i++) {
// if (i != leadingDim)
// dim[i] = sample->GetDim(i - 1);
// else
// dim[i] = sample->GetDim(i - 1) / size;
// }
//
// XTensor * split_tensor = new XTensor(order, dim, DEFAULT_DTYPE, sample->denseRatio, sample->devID, sample->mem);
// _Split(&merge_tensor, split_tensor, leadingDim, size);
// delete[] dim;
//
// return split_tensor;
//}
//void xcTest()
//{
// int * dimSize = new int[2];
// dimSize[0] = 2;
// dimSize[1] = 4;
//
// XTensor t1;
// InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
// XTensor t2;
// InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
// XTensor tensor;
//
// _SetDataFixed(&t1, 1.0F);
// _SetDataFixed(&t2, 2.0F);
//
// tensor = t1 + t2;
//
// XList smalls;
//
// XTensor first;
// XTensor second;
// InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
// InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
// smalls.Add(&t1);
// smalls.Add(&t2);
//
// XTensor* result = stack(smalls, 0);
// result->Dump(stderr, "", 100);
//}
void BackwardTest()
{
    XNet net;

    XTensor a;
    XTensor b;
    XTensor c;
    a.enableGrad = true;
    b.enableGrad = false;
    c.enableGrad = false;
    XTensor mean;
    XTensor origin;
    InitTensor2D(&a, 2, 3);
    InitTensor1D(&b, 2);

    a.SetZeroAll();
    b.SetZeroAll();

    a.Set2D(1.0F, 0, 0);
    a.Set2D(2.0F, 0, 1);
    a.Set2D(3.0F, 0, 2);
    a.Set2D(4.0F, 1, 0);
    a.Set2D(5.0F, 1, 1);
    a.Set2D(6.0F, 1, 2);

    b.Set1D(2.0F, 0);
    b.Set1D(1.0F, 1);

    DivDim(a, b, c, 0);
    c.Dump(stderr, "c:");

    auto loss = CrossEntropy(c, a);

    //XLink::ShowNetwork(stderr, &c);

    net.Backward(loss);
    a.grad->Dump(stderr);
}
void TransposeTest()
{
#ifdef USE_CUDA
    XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
    //XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);

    XTensor x;
    XTensor y;
    XTensor z;

    int loops = 2000;

    int B = 3 * 2 * 4;
    int K = 8 * 1;
    int N = 50;
    int H = 512 * 4;

    int nnn = GDevs.nGPU;

    InitTensor3D(&x, B, N, H, X_FLOAT, 0);
    InitTensor4D(&y, K, B, N, H/K, X_FLOAT, 0);
    InitTensor3D(&z, B, N, H, X_FLOAT, 0);

    cudaEvent_t ctime0;
    cudaEvent_t ctime1;
    cudaEvent_t ctime2;
    cudaEvent_t ctime3;
    cudaEvent_t ctime4;
    cudaEvent_t ctime5;

    float elapsedSplit = 0.0;
    float elapsedMerge = 0.0;
    float elapsedSum = 0.0;

    cudaEventCreate(&ctime0);
    cudaEventCreate(&ctime1);
    cudaEventCreate(&ctime2);
    cudaEventCreate(&ctime3);
    cudaEventCreate(&ctime4);
    cudaEventCreate(&ctime5);

    cudaEventRecord(ctime0, 0);
    double time0 = GetClock();
    for(int i = 0; i < loops; i++)
        _Split(&x, &y, 2, K);
    double time1 = GetClock();
    cudaEventRecord(ctime1, 0);
    cudaEventSynchronize(ctime1);
    cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);

    cudaEventRecord(ctime2, 0);
    double time2 = GetClock();
    for(int i = 0; i < loops; i++)
        _Merge(&y, &x, 3);
    double time3 = GetClock();
    cudaEventRecord(ctime3, 0);
    cudaEventSynchronize(ctime3);
    cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);

    cudaEventRecord(ctime4, 0);
    double time4 = GetClock();
    for(int i = 0; i < loops; i++)
        _Sum(&x, &z, &x);
    double time5 = GetClock();
    cudaEventRecord(ctime5, 0);
    cudaEventSynchronize(ctime5);
    cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);

    fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
    fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);
#endif
}
void SumDimTest()
{
    XTensor x;
    XTensor y;
    XTensor z;

    int a = 5;
    int b = 7;
    int c = 3;

    InitTensor3D(&x, a, b, c, X_FLOAT, -1);
    InitTensor1D(&y, c, X_FLOAT, -1);
    InitTensor3D(&z, a, b, c, X_FLOAT, -1);

    x.SetZeroAll();
    y.SetZeroAll();
    z.SetZeroAll();

    DTYPE * data = new DTYPE[x.unitNum];
    for(int i = 0; i < x.unitNum; i++)
        data[i] = (DTYPE)i;
    x.SetData(data, x.unitNum);

    for(int i = 0; i < y.unitNum; i++)
        data[i] = -(DTYPE)i;
    y.SetData(data, y.unitNum);

    _SumDim(&x, &y, &z, 2);

    z.Dump(stderr, "z:");

    delete[] data;
}
//void SplitBackwardTest()
//{
// int * dimSize = new int[2];
// dimSize[0] = 2;
// dimSize[1] = 4;
//
// XTensor t1;
// InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
// XTensor t2;
// InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
// XTensor tensor;
//
// //_SetDataFixedFloat(&t1, 1.0F);
// //_SetDataFixedFloat(&t2, 2.0F);
// t1.SetDataRand();
// t2.SetDataRand();
//
// tensor = t1 + t2;
//
// XList smalls;
//
// XTensor first;
// XTensor second;
// InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
// InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
// smalls.Add(&first);
// smalls.Add(&second);
//
// Split(tensor, smalls, 1, 2);
//
// XTensor mul;
// mul = Sum(first, second);
//
// XNet net;
// net.Backward(mul);
// net.Dump(stderr);
//
// printf("Done!");
//}
void MemTest()
{
XMem * mem;
mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
XTensor tensor;
InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
tensor.SetZeroAll();
tensor.Dump(stderr);
delete mem;
if (tensor.mem != NULL) {
printf("It isn't null!\n");
printf("%d\n", (int)tensor.mem->signature);
}
else {
printf("It's null\n");
}
tensor.Dump(stderr);
}
\ No newline at end of file
...
@@ -52,9 +52,11 @@ int sentBatch = 0; // batch size at the sentence level
int wordBatch = 1; // batch size at the word level
bool shuffled = false; // shuffled the training data file or not
bool autoDiff = false; // indicator of automatic differentiation
bool fp16 = false; // indicator of use of float16 computation
void LoadArgs(int argc, const char ** argv, FNNModel &model);
void Init(FNNModel &model);
void InitFp16(FNNModel &model);
void Check(FNNModel &model);
void Copy(FNNModel &tgt, FNNModel &src);
void Clear(FNNModel &model, bool isNodeGrad);
...
@@ -75,6 +77,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/*
entry of the program
...
@@ -121,11 +124,18 @@ int FNNLMMain(int argc, const char ** argv)
/* load arguments */
LoadArgs(argc, argv, model);
srand(1);
/* check the setting */
Check(model);

/* initialize model parameters */
if (!fp16) {
    Init(model);
}
else {
    InitFp16(model);
}
/* learn model parameters */
if(strcmp(trainFN, ""))
...
@@ -224,6 +234,10 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
autoDiff = true;
fprintf(stderr, " -autodiff=true\n");
}
if (!strcmp(argv[i], "-fp16")) {
fp16 = true;
fprintf(stderr, " -fp16=true\n");
}
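// Example invocation with the new flag (hypothetical corpus/model names, assuming
// the existing FNNLM options shown elsewhere in this file):
//     NiuTrans.Network -fnnlm -train train.txt -model model.bin -autodiff -fp16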
if(!strcmp(argv[i], "-dev") && i + 1 < argc){
model.devID = atoi(argv[i + 1]);
fprintf(stderr, " -dev=%d\n", model.devID);
...
@@ -303,6 +317,11 @@ void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
}
void InitModelTensor1DFp16(XTensor &tensor, int num, FNNModel &model)
{
InitTensor1DV2(&tensor, num, X_FLOAT16, model.devID);
}
/*
initialize a 2d tensor using the fnn model setting
>> tensor - the tensor to initialize
...
@@ -315,6 +334,10 @@ void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
}
void InitModelTensor2DFp16(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT16, model.devID);
}
/* initialize the model */
void Init(FNNModel &model)
...
@@ -357,6 +380,48 @@ void Init(FNNModel &model)
for(int i = 0; i < model.hDepth; i++)
model.hiddenB[i].SetZeroAll();
}
/* initialize the model with float16 data type */
void InitFp16(FNNModel &model)
{
/* create embedding parameter matrix: vSize * eSize */
InitModelTensor2DFp16(model.embeddingW, model.vSize, model.eSize, model);
model.embeddingW.SetVarFlag();
/* create hidden layer parameter matrices */
for (int i = 0; i < model.hDepth; i++) {
/* hidden layer parameter matrix: (n-1)eSize * hsize if it is the first layer
hsize * hsize otherwise */
if (i == 0)
InitModelTensor2DFp16(model.hiddenW[i], (model.n - 1) * model.eSize, model.hSize, model);
else
InitModelTensor2DFp16(model.hiddenW[i], model.hSize, model.hSize, model);
model.hiddenW[i].SetVarFlag();
/* bias term: a row vector of hSize entries */
InitModelTensor1DFp16(model.hiddenB[i], model.hSize, model);
model.hiddenB[i].SetVarFlag();
}
/* create the output layer parameter matrix and bias term */
int iSize = model.hDepth == 0 ? (model.n - 1) * model.eSize : model.hSize;
InitModelTensor2DFp16(model.outputW, iSize, model.vSize, model);
InitModelTensor1DFp16(model.outputB, model.vSize, model);
model.outputW.SetVarFlag();
model.outputB.SetVarFlag();
/* then, we initialize model parameters using a uniform distribution in range
of [-minmax, minmax] */
_SetDataRand(&model.embeddingW, -minmax, minmax);
_SetDataRand(&model.outputW, -minmax, minmax);
for (int i = 0; i < model.hDepth; i++)
_SetDataRand(&model.hiddenW[i], -minmax, minmax);
/* all bias terms are set to zero */
_SetDataFixed(&model.outputB, 0);
for (int i = 0; i < model.hDepth; i++)
_SetDataFixed(&model.hiddenB[i], 0);
}
/*
shuffle lines of the file shuffle lines of the file
...
@@ -415,19 +480,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
XNet autoDiffer;

double startT = GetClockSec();
double mkinput = 0.0;
double mkgold = 0.0;
double train_time = 0.0;
double clearModel = 0.0;
double forward=0.0;
double backward = 0.0;
double update = 0.0;
double end = 0.0;
double start = 0.0;
double time;
/* iterate for a number of epochs */
for(epoch = 0; epoch < nEpoch; epoch++){
...
@@ -438,6 +491,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
wordCount = 0;
loss = 0;
ngramNum = 1;

while(ngramNum > 0){

/* load a minibatch of ngrams */
...
@@ -458,25 +512,20 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* the loss tensor */
XTensor lossTensor;
start = GetClockSec();
/* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID);
mkinput += GetClockSec() - start;
start = GetClockSec();
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID);
mkgold += GetClockSec() - start;
time = GetClockSec();
if(!autoDiff){
/* prepare an empty network for building the fnn */
FNNNet net;

/* gradient = 0 */
Clear(grad, false);

/* forward computation */
Forward(inputs, output, model, net);
...
@@ -491,60 +540,63 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
loss -= prob;
}
else{
start = GetClockSec();
/* gradient = 0 */
Clear(model, true);
clearModel += GetClockSec() - start;
start = GetClockSec();
/* forward + backward process */
/* this is implemented by gather function */
//ForwardAutoDiff(ngrams, ngramNum, output, model);
ForwardAutoDiffLin(ngrams, ngramNum, output, model);
forward += GetClockSec() - start;

start = GetClockSec();

//XNet net;
//net.ShowNetwork(stdout, &output);
//FILE* fOut1 = fopen("test-output", "w");
//output.Dump(&output, fOut1, "output");
//fclose(fOut1);
//fflush(fOut1);
//if (step==216)
//{
//    exit(1);
//}

/* this is implemented by multiply function */
lossTensor = CrossEntropy(output, gold);
//FILE* fOut1 = fopen("test3", "a");
//fprintf(fOut1, "step=%d ", step);
//lossTensor.Dump(&lossTensor, fOut1, "lossTensor:");
//fclose(fOut1);
//fflush(fOut1);
int stepTmp = step+1;
/* automatic differentiation */
autoDiffer.Backward(lossTensor);
backward += GetClockSec() - start;
start = GetClockSec();
/* update model parameters */
Update(model, grad, learningRate, true);
update += GetClockSec() - start;
start = GetClockSec();
/* get probabilities */
float prob = ReduceSumAll(lossTensor);
loss += prob;
end += GetClockSec() - start;
}
train_time += GetClockSec() - time;
wordCount += ngramNum;
wordCountTotal += ngramNum;

if(++step >= nStep){
isEnd = true;
break;
}

if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
startT = GetClockSec();
XPRINT8(0, stderr, "[Time] mkinput=%.5lfs,mkgold=%.5lfs,train_time=%.5lfs,clearModel=%.5lfs,forward=%.5lfs, backward=%.5lf, update=%.5lf, end=%.5lf\n",
mkinput, mkgold, train_time, clearModel, forward, backward, update,end);
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
mkinput = 0.0;
mkgold = 0.0;
train_time = 0.0;
clearModel = 0.0;
forward = 0.0;
backward = 0.0;
update = 0.0;
end = 0.0;
}
}
...
@@ -611,13 +663,21 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
gradList.Add(model.embeddingW.grad);
}
//FILE* fOut1 = fopen("test-2", "a");
for (int i = 0; i < paraList.count; i++) {
XTensor * para = (XTensor*)paraList.GetItem(i);
XTensor * paraGrad = (XTensor*)gradList.GetItem(i);
//fprintf(fOut1, "id=%d ", para->id);
//para->Dump(para, fOut1, "para:", 50);
//paraGrad->Dump(paraGrad, fOut1, "paraGrad:", 50);
/* the delta rule */
_Sum(para, paraGrad, para, -epsilon);
}
//fprintf(fOut1, "\n");
//fclose(fOut1);
//fflush(fOut1);
}

/*
...
@@ -761,6 +821,23 @@ void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, in
tensor.Set2D(1.0F, rows[i], cols[i]);
}
void InitZeroOneTensor2DFp16(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID)
{
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT16, devID);
XTensor tensor1;
InitTensor2DV2(&tensor1, rowNum, colNum, X_FLOAT, devID);
tensor1.SetZeroAll();
/* set non-zero cells */
for (int i = 0; i < itemNum; i++)
tensor1.Set2D(1.0F, rows[i], cols[i]);
_ConvertDataType(&tensor1, &tensor);
}
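A minimal usage sketch for the helper above (hypothetical values; vSize and devID stand for the model settings, as in MakeWordBatch below). The one-hot pattern is first written into an FP32 staging tensor, because Set2D stores FP32 values, and then converted down in a single _ConvertDataType call:

    int rows[2] = { 0, 1 };
    int cols[2] = { 7, 3 };
    XTensor batch;
    InitZeroOneTensor2DFp16(batch, 2, vSize, rows, cols, 2, devID);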
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
...
@@ -780,7 +857,12 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
cols[i] = ngrams[i].words[n];
}
if (!fp16) {
InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID);
}
else {
InitZeroOneTensor2DFp16(batch, ngramNum, vSize, rows, cols, ngramNum, devID);
}
delete[] rows;
delete[] cols;
...
@@ -1046,14 +1128,152 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);

/* hidden layers */
for (int i = 0; i < depth; i++) {
//XTensor hiddenBefore;
//hiddenBefore = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];
//if (hiddenBefore.dataType == X_FLOAT16) {
// XTensor hiddenBeforeFp32;
// hiddenBeforeFp32 = ConvertDataType(hiddenBefore, X_FLOAT);
// XTensor hiddenFp32;
// hiddenFp32 = HardTanH(hiddenBeforeFp32);
// hidden = ConvertDataType(hiddenFp32, X_FLOAT16);
//}
//else {
// hidden = HardTanH(hiddenBefore);
//}
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
}

/* output layer */
//output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XTensor softmaxBefore;
//softmaxBefore = MMul(hidden, model.outputW) + model.outputB;
//if (softmaxBefore.dataType == X_FLOAT16) {
// XTensor softmaxBeforeFp32;
// softmaxBeforeFp32 = ConvertDataType(softmaxBefore, X_FLOAT);
// XTensor outputeFp32;
// outputeFp32 = Softmax(softmaxBeforeFp32, 1);
// output = ConvertDataType(outputeFp32, X_FLOAT16);
//}
//else {
// output = Softmax(softmaxBefore, 1);
//}
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
int size = batch * (n - 1);
int * index = new int[size];
for (int i = 0; i < batch; i++) {
for (int j = 0; j < n - 1; j++) {
int a = i * (n - 1) + j;
index[a] = ngrams[i].words[j];
}
}
InitTensor1DV2(&words, size, X_INT, model.devID);
words.SetData(index, size);
/*test for Gather float16 datatype backward*/
//XTensor embeddingW16;
//XTensor embeddingBig16;
//embeddingW16 = ConvertDataType(model.embeddingW, X_FLOAT16);
//embeddingBig16 = Gather(embeddingW16, words);
//embeddingBig = ConvertDataType(embeddingBig16, X_FLOAT);
embeddingBig = Gather(model.embeddingW, words);
delete[] index;
int dimSize[2];
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
/*test for Reshape float16 datatype backward*/
//XTensor embeddingBig16;
//XTensor hidden16;
//embeddingBig16 = ConvertDataType(embeddingBig, X_FLOAT16);
//hidden16 = Reshape(embeddingBig16, embeddingBig16.order, dimSize);
//hidden = ConvertDataType(hidden16, X_FLOAT);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */
for (int i = 0; i < depth; i++) {
/*test for MMul float16 backward*/
//XTensor hiddenW16;
//XTensor hidden16;
//XTensor hiddenBefore16;
//XTensor hiddenBefore;
//hiddenW16 = ConvertDataType(model.hiddenW[i], X_FLOAT16);
//hidden16 = ConvertDataType(hidden, X_FLOAT16);
//hiddenBefore16 = MMul(hidden16, hiddenW16);
//hiddenBefore = ConvertDataType(hiddenBefore16, X_FLOAT);
//hidden = HardTanH(hiddenBefore + model.hiddenB[i]);
/*test for HardTanH and Sum float16 backward*/
//XTensor hiddenBefore;
//XTensor hiddenBefore16;
//XTensor hiddenB16;
//XTensor hidden16;
//hiddenBefore = MMul(hidden, model.hiddenW[i]);
//hiddenBefore16 = ConvertDataType(hiddenBefore,X_FLOAT16);
//hiddenB16 = ConvertDataType(model.hiddenB[i], X_FLOAT16);
//hidden16 = HardTanH(hiddenBefore16 + hiddenB16);
//hidden = ConvertDataType(hidden16, X_FLOAT);
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
}
/* output layer */
/*test for MMul float16 backward*/
//XTensor outputW16;
//XTensor hidden16;
//XTensor outputBefore16;
//XTensor outputBefore;
//outputW16 = ConvertDataType(model.outputW, X_FLOAT16);
//hidden16 = ConvertDataType(hidden, X_FLOAT16);
//outputBefore16 = MMul(hidden16, outputW16);
//outputBefore = ConvertDataType(outputBefore16, X_FLOAT);
//output = Softmax(outputBefore + model.outputB, 1);
/*test for Sum float16 backward*/
//XTensor outputBefore;
//XTensor outputBefore16;
//XTensor outputB16;
//XTensor output16;
//XTensor softmaxBefore16;
//XTensor softmaxBefore;
//outputBefore = MMul(hidden, model.outputW);
//outputBefore16 = ConvertDataType(outputBefore, X_FLOAT16);
//outputB16 = ConvertDataType(model.outputB, X_FLOAT16);
//softmaxBefore16 = outputBefore16 + outputB16;
//softmaxBefore = ConvertDataType(softmaxBefore16, X_FLOAT);
//output = Softmax(softmaxBefore, 1);
/*test for Softmax and Sum float16 backward*/
XTensor softmaxBefore;
XTensor softmaxBefore16;
XTensor output16;
softmaxBefore = MMul(hidden, model.outputW) + model.outputB;
softmaxBefore16 = ConvertDataType(softmaxBefore, X_FLOAT16);
output16 = Softmax(softmaxBefore16, 1);
output = ConvertDataType(output16, X_FLOAT);
//output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
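Why the FP32 staging around Softmax matters (an illustration, not code from this repository): the largest finite FP16 value is 65504, so exp(x) already overflows half precision for x > ~11.09; a softmax evaluated fully in FP16 can then return inf/0/NaN probabilities, which is exactly how the ppl printed in Train() becomes nan. A self-contained check:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float fp16Max = 65504.0F;   // largest finite half value
        float logit = 12.0F;              // a plausible pre-softmax activation
        float e = expf(logit);            // about 162754.8 in FP32
        printf("exp(%.1f) = %.1f, overflows FP16: %s\n",
               logit, e, e > fp16Max ? "yes" : "no");
        // In FP16 the exponential (or the exp sum) saturates to inf, the
        // normalized probabilities come out as 0 or NaN, and the perplexity
        // exp(loss / wordCount) is reported as nan.
        return 0;
    }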
/*
forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations
...
@@ -1103,17 +1323,17 @@ void Dump(const char * fn, FNNModel &model)
FILE * file = fopen(fn, "wb");
CheckErrors(file, "Cannot open the model file");
model.embeddingW.Dump(&model.embeddingW, file, "embedding w:");
for (int i = 0; i < model.hDepth; i++) {
char name[MAX_NAME_LENGTH];
sprintf(name, "hidden %d w:", i);
model.hiddenW[i].Dump(&model.hiddenW[i], file, name);
sprintf(name, "hidden %d b:", i);
model.hiddenB[i].Dump(&model.hiddenB[i], file, name);
}
model.outputW.Dump(&model.outputW, file, "output w:");
model.outputB.Dump(&model.outputB, file, "output b:");

fclose(file);
...
@@ -1130,17 +1350,17 @@ void Read(const char * fn, FNNModel &model)
FILE * file = fopen(fn, "rb");
CheckErrors(file, "Cannot open the model file");
model.embeddingW.Read(&model.embeddingW, file, "embedding w:");
for (int i = 0; i < model.hDepth; i++) {
char name[MAX_NAME_LENGTH];
sprintf(name, "hidden %d w:", i);
model.hiddenW[i].Read(&model.hiddenW[i], file, name);
sprintf(name, "hidden %d b:", i);
model.hiddenB[i].Read(&model.hiddenB[i], file, name);
}
model.outputW.Read(&model.outputW, file, "output w:");
model.outputB.Read(&model.outputB, file, "output b:");

fclose(file);
......
...
@@ -148,14 +148,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
double startT = GetClockSec();
double mkinput = 0.0;
double train_time = 0.0;
double forward = 0.0;
double backward = 0.0;
double update = 0.0;
double start = 0.0;
double time = 0.0;
for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
if(isShuffled)
...
@@ -184,31 +176,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
//while (batchLoader.LoadBatch(file, model->isLM,
//    &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
//    NULL, vSize, vSizeTgt,
//    sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
while (true)
{
start = GetClockSec();
int batch = batchLoader.LoadBatch(file, model->isLM,
    &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
    NULL, vSize, vSizeTgt,
    sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true);
mkinput += GetClockSec() - start;
if (!batch) {
    break;
}

time = GetClockSec();

while (batchLoader.LoadBatch(file, model->isLM,
    &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
    NULL, vSize, vSizeTgt,
    sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");

/* output probabilities */
XTensor output;
start = GetClockSec();
/* make the network */
if(model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, true);
...
@@ -217,12 +196,11 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
else{
ShowNTErrors("Illegal model type!");
}
forward += GetClockSec() - start;
/* back-propagation for obtaining gradients */
//if (labelSmoothingP > 0)
//    LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
start = GetClockSec();
XTensor labelOnehot;
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
...
@@ -251,9 +229,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
net.Backward(lossTensor);
//net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
//net.Backward(output, label, labelSmoothingP, CROSSENTROPY);
backward += GetClockSec() - start;
start = GetClockSec();
gradStep += 1;
loss += prob;
wordCount += wc;
...
@@ -272,13 +248,11 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
gradStep = 0;
validStep++;
update += GetClockSec() - start;
}
}
else
nSkipped++;
train_time += GetClockSec() - time;
if(++step >= nstep){
isEnd = true;
break;
...
@@ -286,19 +260,11 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
startT = GetClockSec();
XPRINT6(0, stderr, "[Time] elapsed=%.5lfs,mkinput=%.5lfs,train_time=%.5lfs,forward=%.5lfs, backward=%.5lf, update=%.5lf\n",
elapsed, mkinput,train_time, forward, backward, update);
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
mkinput = 0.0;
train_time = 0.0;
forward = 0.0;
backward = 0.0;
update = 0.0;
}

if(nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint){
......
...
@@ -274,7 +274,7 @@ void T2TTest2()
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//myRead(probs, "probs.txt", " ");
_SetDataFixed(probs, 1.0F);

probs->Reshape(1, probs->unitNum);
......
...
@@ -25,7 +25,6 @@
 * $Update by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2017-11-18 bug fixes
 *
 */
#include "halfLib/half/half.hpp"
#include <stdio.h>
#include <stdlib.h>
...
@@ -51,11 +50,6 @@
#include "function/Identity.h"
#include "core/CHeader.h"
//#include "halfLib/HalfFloat/umHalf.h"
#ifdef USE_CUDA

// the CUDA stuff
...
@@ -382,7 +376,6 @@ XTensor& XTensor::operator= (const XTensor& tensor)
XMemCopy(data, devID, tensor.data, tensor.devID, size);
if(dataHost != NULL && tensor.dataHost != NULL)
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
}
else{
DestroyData();
...
@@ -1861,16 +1854,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
}
else if (dataType==X_FLOAT16) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) {
halfCPU f = ((halfCPU*)d)[i];
if (i == beg)
fprintf(file, "%hx", f);
else
fprintf(file, " %hx", f);
}
}
else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
...
@@ -2010,14 +1993,6 @@ void XTensor::Read(FILE * file, const char * label)
}
}
}
else if (dataType==X_FLOAT16){
for (int i = 0; i < unitNum; i++) {
halfCPU * f = ((halfCPU*)data) + i;
if (fscanf(file, "%hx", f) < 1) {
ShowNTErrors("Incorrect tensor format!");
}
}
}
else {
ShowNTErrors("TODO!");
}
...
@@ -2044,13 +2019,43 @@ void XTensor::Read(FILE * file, const char * label)
}
}
do {
c = fgetc(file);
} while (c != '\n' && c != EOF);

XMemCopy(dataBackup, devID, data, -1, GetDataSizeInChar());
data = dataBackup;

delete[](char*)dataBuf;
}
/*
read data from a file
>> tensor - the tensor for reading
>> file - where to load the data
>> label - label of the tensor
*/
void XTensor::Read(XTensor * tensor, FILE * file, const char * label)
{
if (tensor->dataType == X_FLOAT)
{
XTensor * a = NewTensor(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
a->Read(file, label);
_CopyValues(a, tensor);
delete a;
}
else if (tensor->dataType == X_FLOAT16)
{
XTensor * a = NewTensor(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
a->Read(file, label);
_ConvertDataType(a, tensor);
delete a;
}
else
{
ShowNTErrors("TO DO!");
}
} }
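A hedged usage sketch of the new static reader (the file name and label here are illustrative, not from the commit): it lets an FP16 tensor be filled from an FP32-format text dump via the temporary-tensor path above.
XTensor half;
InitTensor2D(&half, 4, 4, X_FLOAT16, 0);   /* FP16 tensor on device 0 */
FILE * f = fopen("weights.txt", "rb");     /* hypothetical FP32-format dump */
XTensor::Read(&half, f, "w1");             /* read as FP32, then convert to FP16 */
fclose(f);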
/* /*
......
...@@ -433,6 +433,10 @@ public: ...@@ -433,6 +433,10 @@ public:
/* read data from a file */ /* read data from a file */
void Read(FILE * file, const char * label = NULL); void Read(FILE * file, const char * label = NULL);
/* read data from a file (static variant that converts data types as needed) */
static
void Read(XTensor * tensor, FILE * file, const char * label = NULL);
/* flush the data to the target device */ /* flush the data to the target device */
void FlushToMem(XMem * targetMem); void FlushToMem(XMem * targetMem);
......
...@@ -34,8 +34,9 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) ...@@ -34,8 +34,9 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i)
>> c - result data array >> c - result data array
>> size - size of c >> size - size of c
*/ */
template <class T>
__global__ __global__
void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size) void KernelMulElementWise(T * a, T * b, T * c, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
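The kernel body is truncated by the diff here; a minimal sketch of what the templated element-wise multiply presumably looks like, assuming the usual bounds-check pattern (the sketch name is mine):
template <class T>
__global__
void KernelMulElementWiseSketch(T * a, T * b, T * c, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    /* one thread per element, guarded against overshoot */
    if (i < size)
        c[i] = a[i] * b[i];
}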
...@@ -51,8 +52,9 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha ...@@ -51,8 +52,9 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha
>> size - size of c >> size - size of c
>> alpha - the coefficient >> alpha - the coefficient
*/ */
template <class T>
__global__ __global__
void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha) void KernelMulElementWiseV2(T * a, T * b, T * c, int size, T alpha)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -75,13 +77,13 @@ where |a_lead| means the size of the leading dimension of a ...@@ -75,13 +77,13 @@ where |a_lead| means the size of the leading dimension of a
>> ldSizeC - size of the leading dimension of c >> ldSizeC - size of the leading dimension of c
>> blockNum - number of blocks >> blockNum - number of blocks
*/ */
template<int nonZeroAlpha> __global__ template<class T, int nonZeroAlpha> __global__
void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, void KernelMulElementWiseTensorDynamic(T * a, T * b, T * c, T alpha,
int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum) int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
{ {
__shared__ DTYPE* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y; int j = blockDim.y * blockIdx.y + threadIdx.y;
...@@ -160,26 +162,56 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph ...@@ -160,26 +162,56 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]); dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0) if (alpha == 0)
KernelMulElementWise << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, c->unitNum); KernelMulElementWise <<<blocks, threads >>>((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, c->unitNum);
else else
KernelMulElementWiseV2 << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, c->unitNum, alpha); KernelMulElementWiseV2 <<<blocks, threads >>>((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, c->unitNum, alpha);
} }
else { else {
GDevs.GetCudaThread2D(c->devID, stride * blockNum, dimensionSizeC, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(c->devID, stride * blockNum, dimensionSizeC, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (alpha == 0) { if (alpha == 0) {
KernelMulElementWiseTensorDynamic<0> << <blocks, threads >> > KernelMulElementWiseTensorDynamic<DTYPE, 0> <<<blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum); stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
} }
else { else {
KernelMulElementWiseTensorDynamic<1> << <blocks, threads >> > KernelMulElementWiseTensorDynamic<DTYPE, 1> <<<blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum); stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
} }
} }
} }
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
half alpha1 = __float2half(alpha);
int cudaGridSize[3];
int cudaBlockSize[3];
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelMulElementWise <<<blocks, threads >>>((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum);
else
KernelMulElementWiseV2 <<<blocks, threads >>>((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum, alpha1);
}
else {
GDevs.GetCudaThread2D(c->devID, stride * blockNum, dimensionSizeC, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (alpha == 0) {
KernelMulElementWiseTensorDynamic<__half, 0> <<<blocks, threads>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data, 0,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
else {
KernelMulElementWiseTensorDynamic<__half, 1> <<<blocks, threads>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data, alpha1,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
}
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
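The <DTYPE, 0> / <DTYPE, 1> template arguments above use a non-type template parameter so the alpha branch is decided per instantiation rather than per thread. A minimal self-contained sketch of the pattern (not the library's actual kernel):
template <class T, int nonZeroAlpha>
__global__
void KernelMulSketch(T * a, T * b, T * c, T alpha, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= size)
        return;
    if (nonZeroAlpha)                          /* constant-folded at compile time */
        c[i] = a[i] * b[i] + alpha * c[i];
    else
        c[i] = a[i] * b[i];
}
/* e.g. KernelMulSketch<float, 1><<<blocks, threads>>>(a, b, c, 0.5f, n); */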
...@@ -29,16 +29,18 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,16 +29,18 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) */ /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) */
template <class T>
__global__ __global__
void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size); void KernelMulElementWise(T * a, T * b, T * c, int size);
/* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) */ /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) */
template <class T>
__global__ __global__
void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha); void KernelMulElementWiseV2(T * a, T * b, T * c, int size, T alpha);
/* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i)+ \alpha*c(i) */ /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i)+ \alpha*c(i) */
template<int nonZeroAlpha>__global__ template<class T, int nonZeroAlpha>__global__
void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum); void KernelMulElementWiseTensorDynamic(T * a, T * b, T * c, T alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
/* element-wise product of two tensors */ /* element-wise product of two tensors */
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0); void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
......
...@@ -161,11 +161,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -161,11 +161,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS) else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_16F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH); cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
} }
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) { else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
...@@ -173,11 +173,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -173,11 +173,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS) else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, a, CUDA_R_16F, ma, (void*)&beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH); cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
} }
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) { else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) {
...@@ -193,11 +193,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -193,11 +193,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, &beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, &beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, &beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, &beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS) else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, &beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, &beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, &beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha, b, CUDA_R_8I, mb, a, CUDA_R_8I, ma, &beta, c, CUDA_R_32F, mc, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH); cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
} }
else { else {
...@@ -246,11 +246,11 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -246,11 +246,11 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS) else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH); cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
} }
else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) { else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT) {
...@@ -278,11 +278,11 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -278,11 +278,11 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, &beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, &beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, &beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, &beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS) else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, &beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, &beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, ma, &alpha, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, &beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha, b, CUDA_R_8I, mb, strideB, a, CUDA_R_8I, ma, strideA, &beta, c, CUDA_R_32F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH); cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
} }
else { else {
......
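A worked reading of the repeated ma-to-na fixes above (inferred from the diff itself; the commit message does not spell it out): for $C = \mathrm{op}(A)\,\mathrm{op}(B)$, the k argument of cublasGemm*Ex is the contracted dimension of A, which flips when A is transposed:
\[ k = \begin{cases} m_a, & \mathrm{op}(A) = A \\ n_a, & \mathrm{op}(A) = A^{\mathsf{T}} \end{cases} \]
Passing ma unconditionally made every transposed-A call contract over the wrong dimension; the edits correct this in the FP16, FP16-to-FP32, and INT8-to-FP32 paths of both the batched and strided-batched wrappers.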
...@@ -181,6 +181,11 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \ ...@@ -181,6 +181,11 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
Kernel##funcName<<<blocks, threads>>> \ Kernel##funcName<<<blocks, threads>>> \
((int*)a->data, (int*)b->data, a->unitNum); \ ((int*)a->data, (int*)b->data, a->unitNum); \
} \ } \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName<<<blocks, threads>>> \
((half*)a->data, \
(half*)b->data, a->unitNum); \
} \
else { \ else { \
ShowNTErrors("TODO!"); \ ShowNTErrors("TODO!"); \
} \ } \
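A hedged sketch of how the generated dispatcher reads once the new branch is in, expanded for a hypothetical funcName Absolute (the rest of the macro body is truncated in the diff):
void _CudaAbsolute(const XTensor * a, XTensor * b)
{
    int gridSize[3], blockSize[3];
    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
    dim3 blocks(gridSize[0]), threads(blockSize[0]);
    if (a->dataType == X_FLOAT)
        KernelAbsolute<<<blocks, threads>>>((float*)a->data, (float*)b->data, a->unitNum);
    else if (a->dataType == X_INT)
        KernelAbsolute<<<blocks, threads>>>((int*)a->data, (int*)b->data, a->unitNum);
    else if (a->dataType == X_FLOAT16)
        KernelAbsolute<<<blocks, threads>>>((half*)a->data, (half*)b->data, a->unitNum);
    else
        ShowNTErrors("TODO!");
}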
......
...@@ -234,7 +234,6 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) ...@@ -234,7 +234,6 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
int dim = 0; int dim = 0;
int order = source->order; int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!"); CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
CheckNTErrors(collection->unitNum/collection->GetDim(-1) == index->unitNum, CheckNTErrors(collection->unitNum/collection->GetDim(-1) == index->unitNum,
"Illegal dimension!"); "Illegal dimension!");
......
...@@ -330,12 +330,13 @@ Care of the operator "+=" instead of "=". ...@@ -330,12 +330,13 @@ Care of the operator "+=" instead of "=".
>> indexSize - the number of index >> indexSize - the number of index
>> stride - stride of a data block >> stride - stride of a data block
*/ */
template <class T, TENSOR_DATA_TYPE datatype>
__global__ __global__
void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int * srcIndex, void KernelSpreadForGather(T * sData, T * cData, int * srcIndex,
int indexSize, int stride) int indexSize, int stride)
{ {
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */ /* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -353,13 +354,18 @@ void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int * srcIndex, ...@@ -353,13 +354,18 @@ void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int * srcIndex,
__syncthreads(); __syncthreads();
DTYPE * s = sp[threadIdx.x]; T * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x]; T * c = cp[threadIdx.x];
//DTYPE * s = sData + srcIndex[i] * stride; //DTYPE * s = sData + srcIndex[i] * stride;
//DTYPE * c = cData + i * stride; //DTYPE * c = cData + i * stride;
atomicAdd(s + offset, c[offset]); if (datatype == X_FLOAT) {
atomicAdd(((DTYPE*)s + offset), *((DTYPE*)c + offset));
}
else if (datatype == X_FLOAT16) {
atomicAdd(((__half2*)s + offset), *((__half2*)c + offset));
}
} }
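atomicAdd on __half2 (the X_FLOAT16 branch above) is a single 32-bit atomic that adds both packed halves at once and requires compute capability 6.0 or newer; note that the __half2 pointers advance in two-element units, so each offset step covers a pair of FP16 values. A minimal sketch of the primitive:
#include <cuda_fp16.h>

__global__
void KernelHalf2AtomicSketch(__half2 * dst, const __half2 * src, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
        atomicAdd(dst + i, src[i]);   /* one atomic, two FP16 additions */
}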
/* /*
...@@ -372,6 +378,10 @@ And this is a special spread function for backward computation of gather functio ...@@ -372,6 +378,10 @@ And this is a special spread function for backward computation of gather functio
*/ */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex) void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex)
{ {
CheckNTErrors((source->dataType == X_FLOAT) ||
(source->dataType == X_FLOAT16),
"Unmatched tensors in gather!");
int devID = source->devID; int devID = source->devID;
XMem * mem = source->mem; XMem * mem = source->mem;
...@@ -384,8 +394,6 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI ...@@ -384,8 +394,6 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
int devIDBackup; int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup); ProtectCudaDev(source->devID, devIDBackup);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
int * sIndex = NULL; int * sIndex = NULL;
GDevs.GetCudaThread2D(devID, indexSize, stride, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(devID, indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
...@@ -402,7 +410,19 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI ...@@ -402,7 +410,19 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
else else
sIndex = (int *)srcIndex->data; sIndex = (int *)srcIndex->data;
KernelSpreadForGather<<<blocks, threads >>>(sData, cData, sIndex, indexSize, stride); if (source->dataType == DEFAULT_DTYPE && collection->dataType == DEFAULT_DTYPE)
{
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
KernelSpreadForGather<DTYPE, X_FLOAT> << <blocks, threads >> >(sData, cData, sIndex, indexSize, stride);
}
else if (source->dataType == X_FLOAT16 && collection->dataType == X_FLOAT16)
{
__half2 * sData = (__half2*)source->data;
__half2 * cData = (__half2*)collection->data;
KernelSpreadForGather<__half2, X_FLOAT16> << <blocks, threads >> >(sData, cData, sIndex, indexSize, stride);
}
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
if(mem != NULL) if(mem != NULL)
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "ReduceSumAll.h" #include "ReduceSumAll.h"
#include "ReduceSum.h" #include "ReduceSum.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
#include "../getandset/ConvertDataType.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -54,8 +55,19 @@ DTYPE _ReduceSumAll(const XTensor * source) ...@@ -54,8 +55,19 @@ DTYPE _ReduceSumAll(const XTensor * source)
_CopyValues(source, all); _CopyValues(source, all);
_ReduceSum(all, result, 1); _ReduceSum(all, result, 1);
XTensor result1(result->order, result->dimSize, X_FLOAT, result->denseRatio, result->devID, result->mem);
if (result->dataType == X_FLOAT)
{
_CopyValues(result, &result1);
}
else if (result->dataType == X_FLOAT16)
{
_ConvertDataType(result, &result1);
}
DTYPE r = result->Get1D(0); DTYPE r = result1.Get1D(0);
DelTensorBuf(result); DelTensorBuf(result);
DelTensorBuf(all); DelTensorBuf(all);
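A hedged usage sketch of the conversion above, in the style of the test programs later in this commit: _ReduceSumAll can now run on an FP16 tensor while still returning a host-side DTYPE (float) value.
XTensor a, ha;
InitTensor2D(&a, 4, 4, X_FLOAT, 0);
a.SetDataRand(-2.0F, 2.0F);
ha = ConvertDataType(a, X_FLOAT16);
DTYPE sum = _ReduceSumAll(&ha);   /* FP16 reduction, FP32 read through result1 */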
......
...@@ -269,6 +269,14 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -269,6 +269,14 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
KernelUnsqueezeByCol<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelUnsqueezeByCol<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockNumA, dSize, b->data); (a->data, blockNumA, dSize, b->data);
} }
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
if (cudaBlocks[1] == 1)
KernelUnsqueezeByColBigRow<half> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockNumA, dSize, b->data);
else
KernelUnsqueezeByCol<half> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockNumA, dSize, b->data);
}
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
......
...@@ -97,7 +97,7 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem) ...@@ -97,7 +97,7 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
/* copy the data from GPU memory to CPU memory */ /* copy the data from GPU memory to CPU memory */
void CudaGPUToCPUFlush(XTensor * tensor) void CudaGPUToCPUFlush(XTensor * tensor)
{ {
//CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type."); CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
if (tensor->dataHost != NULL) if (tensor->dataHost != NULL)
delete[](char*)tensor->dataHost; delete[](char*)tensor->dataHost;
......
...@@ -322,88 +322,88 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -322,88 +322,88 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int devIDBackup; int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup); ProtectCudaDev(x->devID, devIDBackup);
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){ //if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors((lossName == CROSSENTROPY || CheckNTErrors((lossName == CROSSENTROPY ||
lossName == SQUAREDERROR || lossName == SQUAREDERROR ||
lossName == ONEHOTERROR || lossName == ONEHOTERROR ||
lossName == NOLOSS), lossName == NOLOSS),
"Unknown loss function."); "Unknown loss function.");
if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){ if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
_Sum(y, gold, dedx, -1.0F); _Sum(y, gold, dedx, -1.0F);
if(padding != NULL) { if(padding != NULL) {
int paddingOrder = padding->order; int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder]; int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int)); memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum); padding->Reshape(padding->unitNum);
int order = dedx->order; int order = dedx->order;
int * dims = new int[order]; int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int)); memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n)); dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0); _MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims); padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims); dedx->Reshape(order, dims);
delete[] paddingDims; delete[] paddingDims;
delete[] dims; delete[] dims;
}
} }
else if(lossName == ONEHOTERROR){ }
ShowNTErrors("TODO!"); else if(lossName == ONEHOTERROR){
ShowNTErrors("TODO!");
}
else if(lossName == NOLOSS){
/*
for softmax:
y_i = e^{x_i} / \sum_{k} e^{x_k}
we have
dy_i/ds_j = y_i * (\delta(i,j) - y_j)
Then
dE/dx_j = \sum_i dE/dy_i * dy_i/dx_j
= \sum_i dE/dy_i * y_i * (\delta(i,j) - y_j)
= dE/dy_j * y_j - y_j * \beta
= y_j * (dE/dy_j - \beta)
where
\beta = \sum_i (dE/dy_i * y_i)
*/
int * dimSize = new int[y->order];
for(int i = 0; i < y->order; i++){
if(i < leadDim)
dimSize[i] = y->dimSize[i];
else if(i > leadDim)
dimSize[i - 1] = y->dimSize[i];
} }
else if(lossName == NOLOSS){
/*
for softmax:
y_i = e^{x_i} / \sum_{k} e^{x_k}
we have
dy_i/ds_j = y_i * (\delta(i,j) - y_j)
Then
dE/dx_j = \sum_i dE/dy_i * dy_i/dx_j
= \sum_i dE/dy_i * y_i * (\delta(i,j) - y_j)
= dE/dy_j * y_j - y_j * \beta
= y_j * (dE/dy_j - \beta)
where
\beta = \sum_i (dE/dy_i * y_i)
*/
int * dimSize = new int[y->order];
for(int i = 0; i < y->order; i++){
if(i < leadDim)
dimSize[i] = y->dimSize[i];
else if(i > leadDim)
dimSize[i - 1] = y->dimSize[i];
}
/* make a matrix of the same size as the y (i.e., y) */ /* make a matrix of the same size as the y (i.e., y) */
XTensor * ytmp = NewTensor(y); XTensor * ytmp = NewTensor(y);
/* make a matrix to keep \beta */ /* make a matrix to keep \beta */
XTensor * beta = NewTensor(y->order - 1, dimSize, y->dataType, y->denseRatio, y->devID, y->mem); XTensor * beta = NewTensor(y->order - 1, dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
/* \beta = \sum_i (dE/dy_i * y_i) */ /* \beta = \sum_i (dE/dy_i * y_i) */
_Multiply(dedy, y, ytmp, 0, 0); _Multiply(dedy, y, ytmp, 0, 0);
_ReduceSum(ytmp, beta, leadDim); _ReduceSum(ytmp, beta, leadDim);
/* ytmp = dE/dy_j - \beta */ /* ytmp = dE/dy_j - \beta */
_Unsqueeze(beta, ytmp, leadDim, y->dimSize[leadDim]); _Unsqueeze(beta, ytmp, leadDim, y->dimSize[leadDim]);
_Sum(dedy, ytmp, ytmp, -1.0F); _Sum(dedy, ytmp, ytmp, -1.0F);
/* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */ /* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */
_Multiply(y, ytmp, dedx, 0, 0); _Multiply(y, ytmp, dedx, 0, 0);
delete[] dimSize; delete[] dimSize;
delete ytmp; delete ytmp;
delete beta; delete beta;
}
else{
ShowNTErrors("TODO!");
}
} }
else else{
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
}
//}
//else
// ShowNTErrors("TODO!");
BacktoCudaDev(x->devID, devIDBackup); BacktoCudaDev(x->devID, devIDBackup);
} }
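The NOLOSS branch's comment block above, restated as a clean derivation: with
\[ y_i = \frac{e^{x_i}}{\sum_k e^{x_k}}, \qquad \frac{\partial y_i}{\partial x_j} = y_i\,(\delta_{ij} - y_j), \]
the chain rule gives
\[ \frac{\partial E}{\partial x_j} = \sum_i \frac{\partial E}{\partial y_i}\, y_i\,(\delta_{ij} - y_j) = y_j\left(\frac{\partial E}{\partial y_j} - \beta\right), \qquad \beta = \sum_i \frac{\partial E}{\partial y_i}\, y_i, \]
which is exactly the sequence the code runs: _Multiply and _ReduceSum form beta, _Unsqueeze and _Sum form (dE/dy - beta), and the final _Multiply by y yields dedx.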
......
#include <iostream>
#include <assert.h>
#include <direct.h>
#include "../core/utilities/FlushToMem.h"
#include "../core/getandset/ConvertDataType.h"
#include "../XTensor.h"
#include "umHalf.h"
using namespace nts;
//#define VALIDATE(x) if (!(x)){std::cout << "Failed: " << #x << std::endl;assert((x));}
int main(int argc, char* argv[])
{
char path[1024];
/* getcwd(NULL, 0) returns an exactly-sized allocation that the strcat below
   would overflow, so read into a fixed-size buffer instead */
if (getcwd(path, sizeof(path)) == NULL)
return 1;
strcat(path, "\\source\\tensor\\HalfFloat\\dump");
XTensor a;
XTensor halfa;
int dim = 4;
int devId = 0;
InitTensor2DV2(&a,dim,dim,X_FLOAT,devId);
a.SetDataRand(-2.0,2.0);
halfa = ConvertDataType(a, X_FLOAT16);
halfa.Dump(&halfa, stderr, "halfa:");
GPUToCPUFlush(&halfa);
FILE * file = fopen(path, "wb");
halfa.Dump(file, "halfa:");
fclose(file);
XTensor halfb;
InitTensor2DV2(&halfb, dim, dim, X_FLOAT16, devId);
FILE *read = fopen(path, "rb");
halfb.Read(read, "halfa:");
fclose(read);
halfb.Dump(&halfb, stderr, "halfb:");
//half h = 1.f, h2 = 2.f;
//--h2;
//++h2;
//--h;
//++h;
//h2 -= 1.f;
//float f = h2, f2 = h;
//VALIDATE(1.f == f && f == f2);
//half dddd = 15.5;
//float hhhh = 15.5;
//printf("%x\n", dddd);
//printf("%x\n", hhhh);
//h = h2;
//h2 = 15.5f;
//f = h2, f2 = h;
//VALIDATE(15.5f == f && 1.f == f2);
//h2 *= h;
//f = h2, f2 = h;
//VALIDATE(15.5f == f && 1.f == f2);
//h2 /= h;
//f = h2, f2 = h;
//VALIDATE(15.5f == f && 1.f == f2);
//h2 += h;
//f = h2, f2 = h;
//VALIDATE(16.5f == f && 1.f == f2);
//h++; h++; h++;
//h2 = -h2;
//h2 += 17.5f;
//h2 *= h;
//f = h2, f2 = h;
//VALIDATE(4.f == f && 4.f == f2);
//VALIDATE(h == h2);
//VALIDATE(h <= h2);
//--h;
//VALIDATE(h <= h2);
//h -= 250.f;
//VALIDATE(h < h2);
//h += 500.f;
//VALIDATE(h > h2);
//VALIDATE(h >= h2);
//f = h2, f2 = h;
//VALIDATE(h * h2 == (half)(f * f2));
//// addition
//// ****************************************************************************
//// identical exponents
//for (float f = 0.f; f < 1000.f; ++f)
//{
// half one = f;
// half two = f;
// half three = one + two;
// f2 = three;
// VALIDATE(f*2.f == f2);
//}
//// different exponents
//for (float f = 0.f, fp = 1000.f; f < 500.f; ++f, --fp)
//{
// half one = f;
// half two = fp;
// half three = one + two;
// f2 = three;
// VALIDATE(f + fp == f2);
//}
//// very small numbers - this is already beyond the accuracy of 16 bit floats.
//for (float f = 0.003f; f < 1000.f; f += 0.0005f)
//{
// half one = f;
// half two = f;
// half three = one + two;
// f2 = three;
// float m = f * 2.f;
// VALIDATE(f2 > (m - 0.05*m) && f2 < (m + 0.05*m));
//}
//// subtraction
//// ****************************************************************************
//// identical exponents
//for (float f = 0.f; f < 1000.f; ++f)
//{
// half one = f;
// half two = f;
// half three = one - two;
// f2 = three;
// VALIDATE(0.f == f2);
//}
//// different exponents
//for (float f = 0.f, fp = 1000.f; f < 500.f; ++f, --fp)
//{
// half one = f;
// half two = fp;
// half three = one - two;
// f2 = three;
// VALIDATE(f - fp == f2);
//}
return 0;
}
https://github.com/acgessler/half_float
C++ implementation of a 16-bit floating-point type that mimics most of the IEEE 754 behaviour. Compatible with the half data type used as a texture format by OpenGL/Direct3D.
\ No newline at end of file
halfa: order=2 dimsize=4,4 dtype=X_FLOAT16 dense=1.000000
be2c 3ffd bf2c 3c52 a8f6 3a6a afcf 3eca 3e47 3852 bf6e 3bc8 bff5 bc12 b266 31a4
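A worked decode of one code unit from the dump above (the values are raw IEEE 754 binary16 bit patterns, written with %hx): a normal half decodes as
\[ x = (-1)^{s}\, 2^{\,e-15}\!\left(1 + \frac{f}{1024}\right), \]
so 3c52 has $s = 0$, $e = 01111_2 = 15$, $f = \mathrm{0x052} = 82$, giving $x = 1 + 82/1024 \approx 1.080$, inside the SetDataRand(-2, 2) range as expected.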
#include <iostream>
#include <assert.h>
#include <direct.h>
#include "../../core/utilities/FlushToMem.h"
#include "../../core/getandset/ConvertDataType.h"
#include "../../XTensor.h"
#include "../../XGlobal.h"
#include "umHalf.h"
using namespace nts;
int main(int argc, char* argv[])
{
char path[1024];
/* getcwd(NULL, 0) returns an exactly-sized allocation that the strcat below
   would overflow, so read into a fixed-size buffer instead */
if (getcwd(path, sizeof(path)) == NULL)
return 1;
strcat(path, "\\source\\tensor\\halfLib\\HalfFloat\\dump");
XTensor a;
XTensor halfa;
int dim = 4;
int devId = 0;
InitTensor2DV2(&a, dim, dim, X_FLOAT, devId);
a.SetDataRand(-2.0, 2.0);
halfa = ConvertDataType(a, X_FLOAT16);
printf("============save model================\n");
halfa.Dump(&halfa, stderr, "halfa:");
GPUToCPUFlush(&halfa);
FILE * file = fopen(path, "wb");
halfa.Dump(file, "halfa:");
//a.Dump(file, "a");
fclose(file);
XTensor halfb;
InitTensor2DV2(&halfb, dim, dim, X_FLOAT16, devId);
XTensor b;
InitTensor2DV2(&b, dim, dim, X_FLOAT, devId);
printf("==============read model=============\n");
FILE *read = fopen(path, "rb");
halfb.Read(read, "halfa:");
//b.Read(read, "a");
fclose(read);
halfb.Dump(&halfb, stderr, "halfb:");
return 0;
}
\ No newline at end of file
// ISO C9x compliant stdint.h for Microsoft Visual Studio
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
//
// Copyright (c) 2006 Alexander Chemeris
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. The name of the author may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]
#ifndef _MSC_STDINT_H_ // [
#define _MSC_STDINT_H_
#if _MSC_VER > 1000
#pragma once
#endif
#include <limits.h>
// For Visual Studio 6 in C++ mode wrap <wchar.h> include with 'extern "C++" {}'
// or the compiler gives many errors like this:
// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
#if (_MSC_VER < 1300) && defined(__cplusplus)
extern "C++" {
#endif
# include <wchar.h>
#if (_MSC_VER < 1300) && defined(__cplusplus)
}
#endif
// 7.18.1 Integer types
// 7.18.1.1 Exact-width integer types
typedef __int8 int8_t;
typedef __int16 int16_t;
typedef __int32 int32_t;
typedef __int64 int64_t;
typedef unsigned __int8 uint8_t;
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
// 7.18.1.2 Minimum-width integer types
typedef int8_t int_least8_t;
typedef int16_t int_least16_t;
typedef int32_t int_least32_t;
typedef int64_t int_least64_t;
typedef uint8_t uint_least8_t;
typedef uint16_t uint_least16_t;
typedef uint32_t uint_least32_t;
typedef uint64_t uint_least64_t;
// 7.18.1.3 Fastest minimum-width integer types
typedef int8_t int_fast8_t;
typedef int16_t int_fast16_t;
typedef int32_t int_fast32_t;
typedef int64_t int_fast64_t;
typedef uint8_t uint_fast8_t;
typedef uint16_t uint_fast16_t;
typedef uint32_t uint_fast32_t;
typedef uint64_t uint_fast64_t;
// 7.18.1.4 Integer types capable of holding object pointers
#ifdef _WIN64 // [
typedef __int64 intptr_t;
typedef unsigned __int64 uintptr_t;
#else // _WIN64 ][
typedef int intptr_t;
typedef unsigned int uintptr_t;
#endif // _WIN64 ]
// 7.18.1.5 Greatest-width integer types
typedef int64_t intmax_t;
typedef uint64_t uintmax_t;
// 7.18.2 Limits of specified-width integer types
#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
// 7.18.2.1 Limits of exact-width integer types
#define INT8_MIN ((int8_t)_I8_MIN)
#define INT8_MAX _I8_MAX
#define INT16_MIN ((int16_t)_I16_MIN)
#define INT16_MAX _I16_MAX
#define INT32_MIN ((int32_t)_I32_MIN)
#define INT32_MAX _I32_MAX
#define INT64_MIN ((int64_t)_I64_MIN)
#define INT64_MAX _I64_MAX
#define UINT8_MAX _UI8_MAX
#define UINT16_MAX _UI16_MAX
#define UINT32_MAX _UI32_MAX
#define UINT64_MAX _UI64_MAX
// 7.18.2.2 Limits of minimum-width integer types
#define INT_LEAST8_MIN INT8_MIN
#define INT_LEAST8_MAX INT8_MAX
#define INT_LEAST16_MIN INT16_MIN
#define INT_LEAST16_MAX INT16_MAX
#define INT_LEAST32_MIN INT32_MIN
#define INT_LEAST32_MAX INT32_MAX
#define INT_LEAST64_MIN INT64_MIN
#define INT_LEAST64_MAX INT64_MAX
#define UINT_LEAST8_MAX UINT8_MAX
#define UINT_LEAST16_MAX UINT16_MAX
#define UINT_LEAST32_MAX UINT32_MAX
#define UINT_LEAST64_MAX UINT64_MAX
// 7.18.2.3 Limits of fastest minimum-width integer types
#define INT_FAST8_MIN INT8_MIN
#define INT_FAST8_MAX INT8_MAX
#define INT_FAST16_MIN INT16_MIN
#define INT_FAST16_MAX INT16_MAX
#define INT_FAST32_MIN INT32_MIN
#define INT_FAST32_MAX INT32_MAX
#define INT_FAST64_MIN INT64_MIN
#define INT_FAST64_MAX INT64_MAX
#define UINT_FAST8_MAX UINT8_MAX
#define UINT_FAST16_MAX UINT16_MAX
#define UINT_FAST32_MAX UINT32_MAX
#define UINT_FAST64_MAX UINT64_MAX
// 7.18.2.4 Limits of integer types capable of holding object pointers
#ifdef _WIN64 // [
# define INTPTR_MIN INT64_MIN
# define INTPTR_MAX INT64_MAX
# define UINTPTR_MAX UINT64_MAX
#else // _WIN64 ][
# define INTPTR_MIN INT32_MIN
# define INTPTR_MAX INT32_MAX
# define UINTPTR_MAX UINT32_MAX
#endif // _WIN64 ]
// 7.18.2.5 Limits of greatest-width integer types
#define INTMAX_MIN INT64_MIN
#define INTMAX_MAX INT64_MAX
#define UINTMAX_MAX UINT64_MAX
// 7.18.3 Limits of other integer types
#ifdef _WIN64 // [
# define PTRDIFF_MIN _I64_MIN
# define PTRDIFF_MAX _I64_MAX
#else // _WIN64 ][
# define PTRDIFF_MIN _I32_MIN
# define PTRDIFF_MAX _I32_MAX
#endif // _WIN64 ]
#define SIG_ATOMIC_MIN INT_MIN
#define SIG_ATOMIC_MAX INT_MAX
#ifndef SIZE_MAX // [
# ifdef _WIN64 // [
# define SIZE_MAX _UI64_MAX
# else // _WIN64 ][
# define SIZE_MAX _UI32_MAX
# endif // _WIN64 ]
#endif // SIZE_MAX ]
// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
#ifndef WCHAR_MIN // [
# define WCHAR_MIN 0
#endif // WCHAR_MIN ]
#ifndef WCHAR_MAX // [
# define WCHAR_MAX _UI16_MAX
#endif // WCHAR_MAX ]
#define WINT_MIN 0
#define WINT_MAX _UI16_MAX
#endif // __STDC_LIMIT_MACROS ]
// 7.18.4 Limits of other integer types
#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
// 7.18.4.1 Macros for minimum-width integer constants
#define INT8_C(val) val##i8
#define INT16_C(val) val##i16
#define INT32_C(val) val##i32
#define INT64_C(val) val##i64
#define UINT8_C(val) val##ui8
#define UINT16_C(val) val##ui16
#define UINT32_C(val) val##ui32
#define UINT64_C(val) val##ui64
// 7.18.4.2 Macros for greatest-width integer constants
#define INTMAX_C INT64_C
#define UINTMAX_C UINT64_C
#endif // __STDC_CONSTANT_MACROS ]
#endif // _MSC_STDINT_H_ ]
\ No newline at end of file
///////////////////////////////////////////////////////////////////////////////////
/*
Copyright (c) 2006-2008,
Chris "Krishty" Maiwald, Alexander "Aramis" Gessler
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* Neither the name of the class, nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior
written permission of the Development Team.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
///////////////////////////////////////////////////////////////////////////////////
#ifndef UM_HALF_H_INCLUDED
#define UM_HALF_H_INCLUDED
#include <limits>
#include <algorithm>
//#ifdef _MSC_VER
//#include "stdint.h"
//#else
//#include <stdint.h>
//#endif
#include<stdint.h>
#undef min
#undef max
///////////////////////////////////////////////////////////////////////////////////
/** 1. Represents a half-precision floating-point value (16 bits) that behaves
 * in near conformance to the IEEE 754 standard for floating-point computations.
 *
 * Not all operators have special implementations; most perform time-consuming
 * conversions from half to float and back again.
 * Differences to IEEE 754:
 * - no difference between qnan and snan
 * - no traps
 * - no well-defined rounding mode
 */
///////////////////////////////////////////////////////////////////////////////////
class HalfFloat
{
friend HalfFloat operator+ (HalfFloat, HalfFloat);
friend HalfFloat operator- (HalfFloat, HalfFloat);
friend HalfFloat operator* (HalfFloat, HalfFloat);
friend HalfFloat operator/ (HalfFloat, HalfFloat);
public:
enum { BITS_MANTISSA = 10 };
enum { BITS_EXPONENT = 5 };
enum { MAX_EXPONENT_VALUE = 31 };
enum { BIAS = MAX_EXPONENT_VALUE / 2 };
enum { MAX_EXPONENT = BIAS };
enum { MIN_EXPONENT = -BIAS };
enum { MAX_EXPONENT10 = 9 };
enum { MIN_EXPONENT10 = -9 };
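// A worked reading of the constants above (comment added for reference):
// the layout is 1 sign bit | 5 exponent bits | 10 mantissa bits, and a
// normal half decodes as (-1)^Sign * 2^(Exp - BIAS) * (1 + Frac/1024),
// with BIAS = MAX_EXPONENT_VALUE / 2 = 15.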
public:
/** Default constructor. Uninitialized by default.
*/
inline HalfFloat() {}
/** Construction from an existing half
*/
inline HalfFloat(const HalfFloat& other)
: bits(other.GetBits())
{}
/** Construction from existing values for mantissa, sign
* and exponent. No validation is performed.
* @note The exponent is unsigned and biased by #BIAS
*/
inline HalfFloat(uint16_t _m, uint16_t _e, uint16_t _s);
/** Construction from a single-precision float
*/
inline HalfFloat(float other);
/** Construction from a double-precision float
*/
inline HalfFloat(const double);
/** Conversion operator to convert from half to float
*/
inline operator float() const;
/** Conversion operator to convert from half to double
*/
inline operator double() const;
/** Assignment operator to assign another half to
* *this* object.
*/
inline HalfFloat& operator= (HalfFloat other);
inline HalfFloat& operator= (float other);
inline HalfFloat& operator= (const double other);
/** Comparison operators
*/
inline bool operator== (HalfFloat other) const;
inline bool operator!= (HalfFloat other) const;
/** Relational comparison operators
*/
inline bool operator< (HalfFloat other) const;
inline bool operator> (HalfFloat other) const;
inline bool operator<= (HalfFloat other) const;
inline bool operator>= (HalfFloat other) const;
inline bool operator< (float other) const;
inline bool operator> (float other) const;
inline bool operator<= (float other) const;
inline bool operator>= (float other) const;
/** Combined assignment operators
*/
inline HalfFloat& operator += (HalfFloat other);
inline HalfFloat& operator -= (HalfFloat other);
inline HalfFloat& operator *= (HalfFloat other);
inline HalfFloat& operator /= (HalfFloat other);
inline HalfFloat& operator += (float other);
inline HalfFloat& operator -= (float other);
inline HalfFloat& operator *= (float other);
inline HalfFloat& operator /= (float other);
/** Post and prefix increment operators
*/
inline HalfFloat& operator++();
inline HalfFloat operator++(int);
/** Post and prefix decrement operators
*/
inline HalfFloat& operator--();
inline HalfFloat operator--(int);
/** Unary minus operator
*/
inline HalfFloat operator-() const;
/** Provides direct access to the bits of a half float
*/
inline uint16_t GetBits() const;
inline uint16_t& GetBits();
/** Classification of floating-point types
*/
inline bool IsNaN() const;
inline bool IsInfinity() const;
inline bool IsDenorm() const;
/** Returns the sign of the floating-point value -
* true stands for positive.
*/
inline bool GetSign() const;
public:
union
{
uint16_t bits; // All bits
struct
{
uint16_t Frac : 10; // mantissa
uint16_t Exp : 5; // exponent
uint16_t Sign : 1; // sign
} IEEE;
};
union IEEESingle
{
float Float;
struct
{
uint32_t Frac : 23;
uint32_t Exp : 8;
uint32_t Sign : 1;
} IEEE;
};
union IEEEDouble
{
double Double;
struct {
uint64_t Frac : 52;
uint64_t Exp : 11;
uint64_t Sign : 1;
} IEEE;
};
// Enums cannot store 64-bit values, so we have to use static constants.
static const uint64_t IEEEDouble_MaxExpontent = 0x7FF;
static const uint64_t IEEEDouble_ExponentBias = IEEEDouble_MaxExpontent / 2;
};
/** 2. Binary operations
*/
inline HalfFloat operator+ (HalfFloat one, HalfFloat two);
inline HalfFloat operator- (HalfFloat one, HalfFloat two);
inline HalfFloat operator* (HalfFloat one, HalfFloat two);
inline HalfFloat operator/ (HalfFloat one, HalfFloat two);
inline float operator+ (HalfFloat one, float two);
inline float operator- (HalfFloat one, float two);
inline float operator* (HalfFloat one, float two);
inline float operator/ (HalfFloat one, float two);
inline float operator+ (float one, HalfFloat two);
inline float operator- (float one, HalfFloat two);
inline float operator* (float one, HalfFloat two);
inline float operator/ (float one, HalfFloat two);
///////////////////////////////////////////////////////////////////////////////////
/** 3. Specialization of std::numeric_limits for type half.
*/
///////////////////////////////////////////////////////////////////////////////////
namespace std {
template <>
class numeric_limits<HalfFloat> {
public:
// General -- meaningful for all specializations.
static const bool is_specialized = true;
static HalfFloat min()
{
return HalfFloat(0, 1, 0);
}
static HalfFloat max()
{
return HalfFloat(~0, HalfFloat::MAX_EXPONENT_VALUE - 1, 0);
}
static const int radix = 2;
static const int digits = 10; // conservative assumption
static const int digits10 = 2; // conservative assumption
static const bool is_signed = true;
static const bool is_integer = false; // half is a floating-point type, not an integer type
static const bool is_exact = false;
static const bool traps = false;
static const bool is_modulo = false;
static const bool is_bounded = true;
// Floating point specific.
static HalfFloat epsilon()
{
return HalfFloat(0.00097656f);
} // from OpenEXR, needs to be confirmed
static HalfFloat round_error()
{
return HalfFloat(0.00097656f / 2);
}
static const int min_exponent10 = HalfFloat::MIN_EXPONENT10;
static const int max_exponent10 = HalfFloat::MAX_EXPONENT10;
static const int min_exponent = HalfFloat::MIN_EXPONENT;
static const int max_exponent = HalfFloat::MAX_EXPONENT;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const bool is_iec559 = false;
static const bool has_denorm = denorm_present;
static const bool tinyness_before = false;
static const float_round_style round_style = round_to_nearest;
static HalfFloat denorm_min()
{
return HalfFloat(1, 0, 1);
}
static HalfFloat infinity()
{
return HalfFloat(0, HalfFloat::MAX_EXPONENT_VALUE, 0);
}
static HalfFloat quiet_NaN()
{
return HalfFloat(1, HalfFloat::MAX_EXPONENT_VALUE, 0);
}
static HalfFloat signaling_NaN()
{
return HalfFloat(1, HalfFloat::MAX_EXPONENT_VALUE, 0);
}
};
} // end namespace std
#include "./umHalf.inl"
#ifndef UM_HALF_NO_TYPEDEFS
typedef HalfFloat float16;
typedef HalfFloat halfCPU;
#endif
#endif // !! UM_HALF_H_INCLUDED
///////////////////////////////////////////////////////////////////////////////////
/*
Copyright (c) 2006-2008, Alexander Gessler
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* Neither the name of the ASSIMP team, nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior
written permission of the ASSIMP Development Team.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
///////////////////////////////////////////////////////////////////////////////////
#ifndef UM_HALF_INL_INCLUDED
#define UM_HALF_INL_INCLUDED
#ifdef _MSC_VER
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
#endif
// ------------------------------------------------------------------------------------------------
inline HalfFloat::HalfFloat(float other)
{
IEEESingle f;
f.Float = other;
IEEE.Sign = f.IEEE.Sign;
if (!f.IEEE.Exp)
{
IEEE.Frac = 0;
IEEE.Exp = 0;
}
else if (f.IEEE.Exp == 0xff)
{
// NaN or INF
IEEE.Frac = (f.IEEE.Frac != 0) ? 1 : 0;
IEEE.Exp = 31;
}
else
{
// regular number
int new_exp = f.IEEE.Exp - 127;
if (new_exp < -24)
{ // this maps to 0
IEEE.Frac = 0;
IEEE.Exp = 0;
}
else if (new_exp < -14)
{
// this maps to a denorm
IEEE.Exp = 0;
unsigned int exp_val = (unsigned int)(-14 - new_exp); // 2^-exp_val
switch (exp_val)
{
case 0:
IEEE.Frac = 0;
break;
case 1: IEEE.Frac = 512 + (f.IEEE.Frac >> 14); break;
case 2: IEEE.Frac = 256 + (f.IEEE.Frac >> 15); break;
case 3: IEEE.Frac = 128 + (f.IEEE.Frac >> 16); break;
case 4: IEEE.Frac = 64 + (f.IEEE.Frac >> 17); break;
case 5: IEEE.Frac = 32 + (f.IEEE.Frac >> 18); break;
case 6: IEEE.Frac = 16 + (f.IEEE.Frac >> 19); break;
case 7: IEEE.Frac = 8 + (f.IEEE.Frac >> 20); break;
case 8: IEEE.Frac = 4 + (f.IEEE.Frac >> 21); break;
case 9: IEEE.Frac = 2 + (f.IEEE.Frac >> 22); break;
case 10: IEEE.Frac = 1; break;
}
}
else if (new_exp > 15)
{ // map this value to infinity
IEEE.Frac = 0;
IEEE.Exp = 31;
}
else
{
IEEE.Exp = new_exp + 15;
IEEE.Frac = (f.IEEE.Frac >> 13);
}
}
}
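// Note (added): the switch above is the unrolled form of
//   IEEE.Frac = (1024 >> exp_val) + (f.IEEE.Frac >> (13 + exp_val));
// the double-precision constructor below computes the same mapping in closed
// form, shifting by (42 + exponent) instead, because a double's fraction is
// 42 bits wider than a half's.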
// ------------------------------------------------------------------------------------------------
inline HalfFloat::HalfFloat(const double p_Reference)
{
const IEEEDouble & l_Reference = reinterpret_cast<const IEEEDouble &>(p_Reference);
// Copy the sign bit.
this->IEEE.Sign = l_Reference.IEEE.Sign;
// Check for special values: Is the exponent zero?
if (0 == l_Reference.IEEE.Exp)
{
// A zero exponent indicates either a zero or a subnormal number. A subnormal float can not
// be represented as a half, so either one will be saved as a zero.
this->IEEE.Exp = 0;
this->IEEE.Frac = 0;
}
// Is the exponent all one?
else if (IEEEDouble_MaxExpontent == l_Reference.IEEE.Exp)
{
this->IEEE.Exp = MAX_EXPONENT_VALUE;
// A zero fraction indicates an Infinite value.
if (0 == l_Reference.IEEE.Frac)
this->IEEE.Frac = 0;
// A nonzero fraction indicates NaN. Such a fraction contains further information, e.g. to
// distinguish a QNaN from a SNaN. However, we can not just shift-copy the fraction:
// if the first five bits were zero we would save an infinite value, so we abandon the
// fraction information and set it to a nonzero value.
else
this->IEEE.Frac = 1;
}
// A usual value?
else {
// First, we have to adjust the exponent. It is stored as an unsigned int, to reconstruct
// its original value we have to subtract its bias (half of its range).
const int64_t l_AdjustedExponent = l_Reference.IEEE.Exp - IEEEDouble_ExponentBias;
// Very small values will be rounded to zero.
if (-24 > l_AdjustedExponent)
{
this->IEEE.Frac = 0;
this->IEEE.Exp = 0;
}
// Some small values can be stored as subnormal values.
else if (-14 > l_AdjustedExponent)
{
// The exponent of subnormal values is always zero.
this->IEEE.Exp = 0;
// The exponent will now be stored in the fraction.
const int16_t l_NewExponent = int16_t(-14 - l_AdjustedExponent); // 2 ^ -l_NewExponent
this->IEEE.Frac = (1024 >> l_NewExponent) + int16_t(l_Reference.IEEE.Frac >> (42 + l_NewExponent));
}
// Very large numbers will be rounded to infinity.
else if (15 < l_AdjustedExponent)
{
// Exponent all one, fraction zero.
this->IEEE.Exp = MAX_EXPONENT_VALUE;
this->IEEE.Frac = 0;
}
// All remaining numbers can be converted directly.
else
{
// We reconstructed the exponent by subtracting the bias. To store it as an unsigned
// int, we need to add the bias again.
this->IEEE.Exp = l_AdjustedExponent + BIAS;
// When storing the fraction, we abandon its least significant bits by right-shifting.
// The fraction of a double is 42 bits wider than that of a half, so we shift 42 bits.
this->IEEE.Frac = (l_Reference.IEEE.Frac >> 42);
}
} // else usual number
}
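// Worked example (added for clarity): for p_Reference = 2^-15 the adjusted
// exponent is -15, so l_NewExponent = 1 and the fraction becomes
// (1024 >> 1) + 0 = 512, i.e. the subnormal half (512/1024) * 2^-14 = 2^-15.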
// ------------------------------------------------------------------------------------------------
inline HalfFloat::HalfFloat(uint16_t _m, uint16_t _e, uint16_t _s)
{
IEEE.Frac = _m;
IEEE.Exp = _e;
IEEE.Sign = _s;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat::operator float() const
{
IEEESingle sng;
sng.IEEE.Sign = IEEE.Sign;
if (!IEEE.Exp)
{
if (!IEEE.Frac)
{
sng.IEEE.Frac = 0;
sng.IEEE.Exp = 0;
}
else
{
const float half_denorm = (1.0f / 16384.0f);
float mantissa = ((float)(IEEE.Frac)) / 1024.0f;
float sgn = (IEEE.Sign) ? -1.0f : 1.0f;
sng.Float = sgn * mantissa*half_denorm;
}
}
else if (31 == IEEE.Exp)
{
sng.IEEE.Exp = 0xff;
sng.IEEE.Frac = (IEEE.Frac != 0) ? 1 : 0;
}
else
{
sng.IEEE.Exp = IEEE.Exp + 112;
sng.IEEE.Frac = (IEEE.Frac << 13);
}
return sng.Float;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat::operator double(void) const
{
IEEEDouble l_Result;
// Copy the sign bit.
l_Result.IEEE.Sign = this->IEEE.Sign;
// In a zero, both the exponent and the fraction are zero.
if ((0 == this->IEEE.Exp) && (0 == this->IEEE.Frac))
{
l_Result.IEEE.Exp = 0;
l_Result.IEEE.Frac = 0;
}
// If the exponent is zero and the fraction is nonzero, the number is subnormal.
else if ((0 == this->IEEE.Exp) && (0 != this->IEEE.Frac))
{
// sign * 2^-14 * fraction
l_Result.Double = (this->IEEE.Sign ? -1.0 : +1.0) / 16384.0 * (double(this->IEEE.Frac) / 1024.0);
}
// Is the exponent all one?
else if (MAX_EXPONENT_VALUE == this->IEEE.Exp)
{
l_Result.IEEE.Exp = IEEEDouble_MaxExpontent;
// A zero fraction indicates an infinite value.
if (0 == this->IEEE.Frac)
l_Result.IEEE.Frac = 0;
// A nonzero fraction indicates a NaN. We can re-use the fraction information: a double
// fraction is 42 bits wider than a half fraction, so we can just left-shift it. Any
// information on QNaNs or SNaNs will be preserved.
else
l_Result.IEEE.Frac = uint64_t(this->IEEE.Frac) << 42;
}
// A usual value?
else
{
// The exponent is stored as an unsigned int. To reconstruct its original value, we have to
// subtract its bias. To re-store it in a wider bit field, we must add the bias of the new
// bit field.
l_Result.IEEE.Exp = uint64_t(this->IEEE.Exp) - BIAS + IEEEDouble_ExponentBias;
// A double fraction is 42 bits wider than a half fraction, so we can just left-shift it.
l_Result.IEEE.Frac = uint64_t(this->IEEE.Frac) << 42;
}
return l_Result.Double;
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::IsNaN() const
{
return IEEE.Frac != 0 && IEEE.Exp == MAX_EXPONENT_VALUE;
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::IsInfinity() const
{
return IEEE.Frac == 0 && IEEE.Exp == MAX_EXPONENT_VALUE;
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::IsDenorm() const
{
return IEEE.Exp == 0;
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::GetSign() const
{
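// note: returns true when the sign bit is clear, i.e. for positive values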
return IEEE.Sign == 0;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator= (HalfFloat other)
{
bits = other.GetBits();
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator= (float other)
{
*this = (HalfFloat)other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator= (const double p_Reference)
{
return (*this) = HalfFloat(p_Reference);
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::operator== (HalfFloat other) const
{
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))return true;
return bits == other.bits && !this->IsNaN();
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::operator!= (HalfFloat other) const
{
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))return false;
return bits != other.bits || this->IsNaN();
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::operator< (HalfFloat other) const
{
// NaN comparisons are always false
if (this->IsNaN() || other.IsNaN())
return false;
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))
return false;
// map the sign-magnitude patterns (segment order is s,e,m) to totally
// ordered integers: flip all bits of negative values and set the sign bit
// of positive ones. A plain signed reinterpretation would order two
// negative values in reverse.
uint16_t a = (bits & 0x8000) ? (uint16_t)~bits : (uint16_t)(bits | 0x8000);
uint16_t b = (other.GetBits() & 0x8000) ? (uint16_t)~other.GetBits() : (uint16_t)(other.GetBits() | 0x8000);
return a < b;
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::operator> (HalfFloat other) const
{
// NaN comparisons are always false (handled inside operator<)
return other < *this;
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::operator<= (HalfFloat other) const
{
// NaN comparisons are always false
if (this->IsNaN() || other.IsNaN())
return false;
return !(*this > other);
}
// ------------------------------------------------------------------------------------------------
inline bool HalfFloat::operator>= (HalfFloat other) const
{
// NaN comparisons are always false
if (this->IsNaN() || other.IsNaN())
return false;
return !(*this < other);
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator += (HalfFloat other)
{
*this = (*this) + other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator -= (HalfFloat other)
{
*this = (*this) - other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator *= (HalfFloat other)
{
*this = (float)(*this) * (float)other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator /= (HalfFloat other)
{
*this = (float)(*this) / (float)other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator += (float other)
{
*this = (*this) + (HalfFloat)other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator -= (float other)
{
*this = (*this) - (HalfFloat)other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator *= (float other)
{
*this = (float)(*this) * other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator /= (float other)
{
*this = (float)(*this) / other;
return *this;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator++()
{
// an exponent field equal to BIAS encodes 2^0; together with a zero
// mantissa this is exactly 1.0, so incrementing adds one
return this->operator+=(HalfFloat(0, BIAS, 0));
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat HalfFloat::operator++(int)
{
HalfFloat f = *this;
this->operator+=(HalfFloat(0, BIAS, 0));
return f;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat& HalfFloat::operator--()
{
return this->operator-=(HalfFloat(0, BIAS, 0));
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat HalfFloat::operator--(int)
{
HalfFloat f = *this;
this->operator-=(HalfFloat(0, BIAS, 0));
return f;
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat HalfFloat::operator-() const
{
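// ~Sign inverts all bits of the promoted value; the 1-bit Sign bit-field keeps only the (inverted) low bit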
return HalfFloat(IEEE.Frac, IEEE.Exp, ~IEEE.Sign);
}
// ------------------------------------------------------------------------------------------------
inline uint16_t HalfFloat::GetBits() const
{
return bits;
}
// ------------------------------------------------------------------------------------------------
inline uint16_t& HalfFloat::GetBits()
{
return bits;
}
// ------------------------------------------------------------------------------------------------
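// Half-precision addition: restore the hidden mantissa bits, align the smaller
// operand by the exponent difference, add the mantissas in two's complement,
// then renormalize, handling overflow to infinity and underflow to a denorm.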
inline HalfFloat operator+ (HalfFloat one, HalfFloat two)
{
#if (!defined HALFFLOAT_NO_CUSTOM_IMPLEMENTATIONS)
if (one.IEEE.Exp == HalfFloat::MAX_EXPONENT_VALUE)
{
// if one of the components is NaN the result becomes NaN, too.
if (0 != one.IEEE.Frac || two.IsNaN())
return HalfFloat(1, HalfFloat::MAX_EXPONENT_VALUE, 0);
// inf + (-inf) has no meaningful result and yields NaN as well
if (two.IsInfinity() && one.IEEE.Sign != two.IEEE.Sign)
return HalfFloat(1, HalfFloat::MAX_EXPONENT_VALUE, 0);
// otherwise the result is infinity with the sign of the infinite operand
return HalfFloat(0, HalfFloat::MAX_EXPONENT_VALUE, one.IEEE.Sign);
}
else if (two.IEEE.Exp == HalfFloat::MAX_EXPONENT_VALUE)
{
if (one.IsNaN() || 0 != two.IEEE.Frac)
return HalfFloat(1, HalfFloat::MAX_EXPONENT_VALUE, 0);
return HalfFloat(0, HalfFloat::MAX_EXPONENT_VALUE, two.IEEE.Sign);
}
HalfFloat out;
long m1, m2, temp;
// compute the difference between the two exponents. shifts with negative
// numbers are undefined, thus we need two code paths
int expDiff = one.IEEE.Exp - two.IEEE.Exp;
if (0 == expDiff)
{
// the exponents are equal, thus we must just add the hidden bit
temp = two.IEEE.Exp;
if (0 == one.IEEE.Exp)m1 = one.IEEE.Frac;
else m1 = (int)one.IEEE.Frac | (1 << HalfFloat::BITS_MANTISSA);
if (0 == two.IEEE.Exp)m2 = two.IEEE.Frac;
else m2 = (int)two.IEEE.Frac | (1 << HalfFloat::BITS_MANTISSA);
}
else
{
if (expDiff < 0)
{
expDiff = -expDiff;
std::swap(one, two);
}
m1 = (int)one.IEEE.Frac | (1 << HalfFloat::BITS_MANTISSA);
if (0 == two.IEEE.Exp)m2 = two.IEEE.Frac;
else m2 = (int)two.IEEE.Frac | (1 << HalfFloat::BITS_MANTISSA);
if (expDiff < ((sizeof(long) << 3) - (HalfFloat::BITS_MANTISSA + 1)))
{
m1 <<= expDiff;
temp = two.IEEE.Exp;
}
else
{
if (0 != two.IEEE.Exp)
{
// arithmetic underflow
if (expDiff > HalfFloat::BITS_MANTISSA)return HalfFloat(0, 0, 0);
else
{
m2 >>= expDiff;
}
}
temp = one.IEEE.Exp;
}
}
// convert from sign-bit to two's complement representation
if (one.IEEE.Sign)m1 = -m1;
if (two.IEEE.Sign)m2 = -m2;
m1 += m2;
if (m1 < 0)
{
out.IEEE.Sign = 1;
m1 = -m1;
}
else out.IEEE.Sign = 0;
// and renormalize the result to fit in a half
if (0 == m1)return HalfFloat(0, 0, 0);
#ifdef _MSC_VER
_BitScanReverse((unsigned long*)&m2, m1);
#else
// __builtin_clzl counts leading zeros; convert to the index of the
// highest set bit so both paths agree with _BitScanReverse.
m2 = (long)(sizeof(long) * 8 - 1) - __builtin_clzl(m1);
#endif
expDiff = m2 - HalfFloat::BITS_MANTISSA;
temp += expDiff;
if (expDiff >= HalfFloat::MAX_EXPONENT_VALUE)
{
// arithmetic overflow. return INF and keep the sign
return HalfFloat(0, HalfFloat::MAX_EXPONENT_VALUE, out.IEEE.Sign);
}
else if (temp <= 0)
{
// this maps to a denorm
m1 <<= (-expDiff - 1);
temp = 0;
}
else
{
// rebuild the normalized representation, take care of the hidden bit
if (expDiff < 0)m1 <<= (-expDiff);
else m1 >>= expDiff; // m1 >= 0
}
out.IEEE.Frac = m1;
out.IEEE.Exp = temp;
return out;
#else
return HalfFloat((float)one + (float)two);
#endif
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat operator- (HalfFloat one, HalfFloat two)
{
return HalfFloat(one + (-two));
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat operator* (HalfFloat one, HalfFloat two)
{
return HalfFloat((float)one * (float)two);
}
// ------------------------------------------------------------------------------------------------
inline HalfFloat operator/ (HalfFloat one, HalfFloat two)
{
return HalfFloat((float)one / (float)two);
}
// ------------------------------------------------------------------------------------------------
inline float operator+ (HalfFloat one, float two)
{
return (float)one + two;
}
// ------------------------------------------------------------------------------------------------
inline float operator- (HalfFloat one, float two)
{
return (float)one - two;
}
// ------------------------------------------------------------------------------------------------
inline float operator* (HalfFloat one, float two)
{
return (float)one * two;
}
// ------------------------------------------------------------------------------------------------
inline float operator/ (HalfFloat one, float two)
{
return (float)one / two;
}
// ------------------------------------------------------------------------------------------------
inline float operator+ (float one, HalfFloat two)
{
return two + one;
}
// ------------------------------------------------------------------------------------------------
inline float operator- (float one, HalfFloat two)
{
return one - (float)two;
}
// ------------------------------------------------------------------------------------------------
inline float operator* (float one, HalfFloat two)
{
return two * one;
}
// ------------------------------------------------------------------------------------------------
inline float operator/ (float one, HalfFloat two)
{
return one / (float)two;
}
#endif //!! UM_HALF_INL_INCLUDED
halfa: order=2 dimsize=4,4 dtype=X_FLOAT16 dense=1.000000
bc68 342d ae59 bcd7 b46a 3c1c 2c25 beb9 bcaf 3d72 3fc2 38d0 bd6b bce4 3854 ad13
#include <stdio.h>
#include <string.h>
#include <direct.h>
#include "../../core/CHeader.h"
#include "../../core/utilities/FlushToMem.h"
#include "../../core/getandset/ConvertDataType.h"
#include "../../XTensor.h"
#include "../../XGlobal.h"
using namespace nts;
int main(int argc, const char ** argv) {
// getcwd(NULL, 0) allocates a buffer sized exactly for the working
// directory, so appending to it would overflow; use a fixed buffer.
char path[1024];
getcwd(path, sizeof(path));
strcat(path, "\\source\\tensor\\halfLib\\half\\dump");
int dim = 4;
int devId = 0;
XTensor a;
XTensor b;
XTensor c;
XTensor halfa;
XTensor halfb;
XTensor halfc;
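// Round-trip an FP16 tensor through Dump/Read, then compare BMMul results
// computed in FP32 and FP16.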
InitTensor2DV2(&a, dim, dim, X_FLOAT, devId);
InitTensor2DV2(&c, dim, dim, X_FLOAT, devId);
InitTensor2DV2(&halfb, dim, dim, X_FLOAT16, devId);
a.SetDataRand(-2.0, 2.0);
c.SetDataRand(-2.0, 2.0);
halfa = ConvertDataType(a, X_FLOAT16);
halfc = ConvertDataType(c, X_FLOAT16);
printf("============save model================\n");
halfa.Dump(&halfa, stderr, "halfa:");
GPUToCPUFlush(&halfa);
FILE * file = fopen(path, "wb");
halfa.Dump(file, "halfa:");
//a.Dump(file, "a");
fclose(file);
printf("==============read model=============\n");
FILE *read = fopen(path, "rb");
halfb.Read(read, "halfa:");
//b.Read(read, "a");
fclose(read);
halfb.Dump(&halfb, stderr, "halfb:");
printf("==============BMMUL=============\n");
b = BMMul(a, X_NOTRANS, c, X_NOTRANS);
b.Dump(stderr,"b:");
printf("==============BMMUL-float=============\n");
halfa= BMMul(halfb, X_NOTRANS, halfc, X_NOTRANS);
halfa.Dump(&halfa, stderr, "halfla:");
return 0;
}
\ No newline at end of file
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <time.h>
#include <cuda_fp16.h>
//#ifndef HALF_ARITHMETIC_TYPE
//#define HALF_ARITHMETIC_TYPE
//#endif // !HALF_ARITHMETIC_TYPE
#include "half.hpp"
using half_float::halfFloat;
typedef half_float::halfFloat halfC;
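// elementwise product: each thread multiplies one pair of __half values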
__global__ void matrixMulKernel(__half *C, __half *A, __half *B) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
C[i] = A[i] * B[i];
}
void constantInit(halfC *data, int size, halfC val) {
for (int i = 0; i < size; ++i) {
data[i] = val;
}
}
void matrixMul() {
unsigned int N = 128;
unsigned int size = N * sizeof(halfC);
halfC *h_A = (halfC*)malloc(size);
halfC *h_B = (halfC*)malloc(size);
halfC *h_C = (halfC*)malloc(size);
halfC *h_D = (halfC*)malloc(size);
// Initialize host memory
const halfC valB = (halfC)0.01f;
constantInit(h_A, N, (halfC)1.0f);
constantInit(h_B, N, valB);
__half *d_A, *d_B, *d_C;
cudaMalloc((void**)&d_A, size);
cudaMalloc((void**)&d_B, size);
cudaMalloc((void**)&d_C, size);
//copy host memory to device
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// configure launch dimensions: one thread per element
dim3 block(16);
dim3 grid(N / block.x);
// Execute the kernel
matrixMulKernel << <grid, block >> > (d_C, d_A, d_B);
// Copy the memory from device to host
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
//printf("Checking computed result for correctness: ");
//bool correct = true;
//// test relative error by the formula
//// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
//double eps = 1.e-6; // machine zero
for (int k = 0; k < N; k++) {
h_D[k] = h_A[k] * h_B[k];
}
for (int i = 0; i < N; i++) {
// print the raw 16-bit patterns of the GPU and CPU results side by side
printf("%hx--%hx ", *(unsigned short*)&h_C[i], *(unsigned short*)&h_D[i]);
if ((i + 1) % 8 == 0)
printf("\n");
}
//for (int i = 0; i < width*height; i++) {
// double abs_err = fabs(h_C[i] - (width * valB));
// double dot_length = width;
// double abs_val = fabs(h_C[i]);
// double rel_err = abs_err / abs_val / dot_length;
// if (rel_err > eps)
// {
// printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], (float)(width*height), eps);
// correct = false;
// }
//}
//printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
// Free host and device memory
free(h_A);
free(h_B);
free(h_C);
free(h_D);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
int main() {
matrixMul();
}
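// Note (assumption): device-side __half arithmetic requires compute capability
// 5.3 or higher, so building this file with e.g. "nvcc -arch=sm_53" is assumed.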
//
//#define THREAD_NUM 256
//#define MATRIX_SIZE 4
//const int blocks_num = MATRIX_SIZE * (MATRIX_SIZE + THREAD_NUM - 1) / THREAD_NUM;
//
//__global__ static void matMultCUDA(const __half* a, const __half* b, __half* c, int n, clock_t* time)
//{
//
// // index of the current thread within its block (counting from 0)
// const int tid = threadIdx.x;
//
// // index of the block this thread belongs to (counting from 0)
// const int bid = blockIdx.x;
//
// // derive the row and column this thread should compute from bid and tid
// const int idx = bid * THREAD_NUM + tid;
// const int row = idx / n;
// const int column = idx % n;
//
// int i;
//
// // records the start time of the computation
// clock_t start;
//
// // timing is done only in thread 0 (threadIdx.x == 0); every block records its start and end time
// if (tid == 0)
// time[bid] = clock();
//
// // matrix multiplication
// if (row < n && column < n)
// {
// __half t = __half(0.0);
// for (i = 0; i < n; i++)
// {
// t += a[row * n + i] * b[i * n + column];
// }
// c[row * n + column] = t;
// }
//
// // record the end time; again only in thread 0, once per block
// if (tid == 0)
// {
// time[bid + blocks_num] = clock();
// }
//}
//
//bool InitCuda() {
// int count;
// int device;
// cudaGetDeviceCount(&count);
// if (count == 0) {
// fprintf(stderr, "There is no device !\n");
// }
// else
// device = 1;
// cudaSetDevice(device);
// return true;
//}
//template <class T >
//void matgen(T *a, int n) {
// int i, j;
// for (i = 0; i < n; i++) {
// for (j = 0; j < n; j++) {
// a[i * n + j] = (T)rand() / (0x7FFF) + (T)rand() / (0x7FFF * 0x7FFF);
// }
// }
//}
//
//
//int main(int argc, char **argv) {
//
// // initialize CUDA
// if (!InitCuda())
// return 0;
//
// // define the matrices
// halfC *a, *b, *c, *d;
//
// int n = MATRIX_SIZE;
//
// // allocate host memory
// a = (halfC*)malloc(sizeof(halfC)* n * n);
// b = (halfC*)malloc(sizeof(halfC)* n * n);
// c = (halfC*)malloc(sizeof(halfC)* n * n);
// d = (halfC*)malloc(sizeof(halfC)* n * n);
//
// // seed the random number generator
// srand(0);
//
// // generate random matrices
// matgen(a, n);
// matgen(b, n);
//
// for (int i = 0; i < n; i++)
// {
// for (int j = 0; j < n; j++)
// {
// printf("%x ", a[i * n + j]);
// }
// printf("\n");
// }
//
// /* copy the data into GPU memory */
// __half *cuda_a, *cuda_b, *cuda_c;
//
// clock_t* time;
//
// // cudaMalloc allocates a block of GPU memory
// cudaMalloc((void**)&cuda_a, sizeof(__half)* n * n);
// cudaMalloc((void**)&cuda_b, sizeof(__half)* n * n);
// cudaMalloc((void**)&cuda_c, sizeof(__half)* n * n);
//
// cudaMalloc((void**)&time, sizeof(clock_t)* blocks_num * 2);
//
// // cudaMemcpy copies the generated matrices into GPU memory
// // cudaMemcpyHostToDevice - copy from host memory to GPU memory
// // cudaMemcpyDeviceToHost - copy from GPU memory to host memory
// cudaMemcpy(cuda_a, a, sizeof(__half)* n * n, cudaMemcpyHostToDevice);
// cudaMemcpy(cuda_b, b, sizeof(__half)* n * n, cudaMemcpyHostToDevice);
//
// // launch the kernel; syntax: kernel<<<number of blocks, threads per block, shared memory size>>>(args...);
// matMultCUDA << < blocks_num, THREAD_NUM, 0 >> > (cuda_a, cuda_b, cuda_c, n, time);
//
// /* copy the result from the GPU back to main memory */
//
// clock_t time_use[blocks_num * 2];
//
// // cudaMemcpy copies the result from GPU memory back to main memory
// cudaMemcpy(c, cuda_c, sizeof(halfC)* n * n, cudaMemcpyDeviceToHost);
// cudaMemcpy(&time_use, time, sizeof(clock_t)* blocks_num * 2, cudaMemcpyDeviceToHost);
//
// for (int i = 0; i < n; i++)
// {
// for (int j = 0; j < n; j++)
// {
// printf("%x ", c[i * n + j]);
// }
// printf("\n");
// }
//
// //Free cuda
// cudaFree(cuda_a);
// cudaFree(cuda_b);
// cudaFree(cuda_c);
// cudaFree(time);
//// subtract each block's earliest start time from its latest end time to get the total run time
//clock_t min_start, max_end;
//min_start = time_use[0];
//max_end = time_use[blocks_num];
//for (int i = 1; i < blocks_num; i++)
//{
// if (min_start > time_use[i]) min_start = time_use[i];
// if (max_end < time_use[i + blocks_num]) max_end = time_use[i + blocks_num];
//}
//// kernel run time
//clock_t final_time = max_end - min_start;
//// CPU matrix multiplication, result stored into matrix d
//for (int i = 0; i < n; i++)
//{
// for (int j = 0; j < n; j++)
// {
// double t = 0;
// for (int k = 0; k < n; k++){
// t += a[i * n + k] * b[k * n + j];
// }
// d[i * n + j] = t;
// }
//}
//// verify correctness and accuracy
//halfC max_err = (halfC)0.0;
//halfC average_err = (halfC)0;
//for (int i = 0; i < n; i++)
//{
// for (int j = 0; j < n; j++)
// {
// if (d[i * n + j] != 0)
// {
// // fabs returns the absolute value of a floating-point number
// halfC err = fabs((c[i * n + j] - d[i * n + j]) / d[i * n + j]);
// if (max_err < err) max_err = err;
// average_err += err;
// }
// }
//}
//printf("Max error: %g Average error: %g\n", (float)max_err, (float)(average_err / (n * n)));
//printf("gputime: %d\n", (int)final_time);
//
// return 0;
//}
\ No newline at end of file
// test - Test application for half-precision floating point functionality.
//
// Copyright (c) 2012-2019 Christian Rau <rauy@users.sourceforge.net>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//#define HALF_ENABLE_F16C_INTRINSICS 1
//#define HALF_ARITHMETIC_TYPE float
#define HALF_ROUND_STYLE 1
#include "half.hpp"
#include <utility>
#include <vector>
#include <string>
#include <map>
#include <set>
#include <iostream>
#include <iomanip>
#include <memory>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <functional>
#include <fstream>
#include <random>
#include <bitset>
#include <limits>
#include <chrono>
#include <typeinfo>
#include <stdexcept>
#include <cstdint>
#include <cmath>
#if HALF_ENABLE_CPP11_HASH
#include <unordered_map>
#endif
#if HALF_ENABLE_CPP11_CMATH && !defined(HALF_ARITHMETIC_TYPE)
#include <cfenv>
#pragma STDC FENV_ACCESS ON
#endif
int ilog2(int i)
{
unsigned int l = 0;
for (; i > 0; i >>= 1, ++l);
return l;
}
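// note: for positive i this returns floor(log2(i)) + 1 (the bit length), and 0 for i <= 0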
#define UNARY_PERFORMANCE_TEST(func, x, N) { \
auto start = std::chrono::high_resolution_clock::now(); \
for(unsigned int i=0; i<N; ++i) for(unsigned int h=0; h<x.size(); ++h) results[h] = func(x[h]); \
auto tm = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()-start).count(); \
log_ << #func << "\tx " << N << ":\t" << tm << "\n\n"; if(csv_) *csv_ << #func << ';' << tm << '\n'; }
#define BINARY_PERFORMANCE_TEST(func, x, y, N) { \
auto start = std::chrono::high_resolution_clock::now(); \
for(unsigned int i=0; i<x.size(); i+=N) for(unsigned int j=0; j<y.size(); j+=N) results[j] = func(x[i], y[j]); \
auto tm = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()-start).count(); \
log_ << #func << "\t@ 1/" << (N*N) << ":\t" << tm << "\n\n"; if(csv_) *csv_ << #func << ';' << tm << '\n'; }
#define OPERATOR_PERFORMANCE_TEST(op, x, y, N) { \
auto start = std::chrono::high_resolution_clock::now(); \
for(unsigned int i=0; i<x.size(); i+=N) for(unsigned int j=0; j<y.size(); j+=N) results[j] = x[i] op y[j]; \
auto tm = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()-start).count(); \
log_ << #op << "\t@ 1/" << (N*N) << ":\t" << tm << "\n\n"; if(csv_) *csv_ << #op << ';' << tm << '\n'; }
#define TERNARY_PERFORMANCE_TEST(func, x, y, z, N) { \
auto start = std::chrono::high_resolution_clock::now(); \
for(unsigned int i=0; i<x.size(); i+=N) for(unsigned int j=0; j<y.size(); j+=N) for(unsigned int k=0; k<z.size(); k+=N) results[k] = func(x[i], y[j], z[k]); \
auto tm = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()-start).count(); \
log_ << #func << "\t@ 1/" << (N*N*N) << ":\t" << tm << "\n\n"; if(csv_) *csv_ << #func << ';' << tm << '\n'; }
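// Each *_PERFORMANCE_TEST macro times evaluations of the wrapped function over
// the prepared half vectors (the unary variant repeats N times, the binary and
// ternary variants sample every N-th element) and logs elapsed milliseconds,
// optionally also to the CSV stream.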
using half_float::half;
using half_float::half_cast;
#if HALF_ENABLE_CPP11_USER_LITERALS
using namespace half_float::literal;
#endif
half b2h(std::uint16_t bits)
{
return *reinterpret_cast<half*>(&bits);
}
std::uint16_t h2b(half h)
{
return *reinterpret_cast<std::uint16_t*>(&h);
}
bool comp(half a, half b)
{
return (isnan(a) && isnan(b)) || h2b(a) == h2b(b);
}
bool compz(half a, half b)
{
return (isnan(a) && isnan(b)) || a == b;
}
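// comp() compares halves bit-exactly (treating any two NaNs as equal), while
// compz() compares by value, so +0 and -0 also count as equal.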
template<std::float_round_style R> half select(const std::pair<half, half> &hh)
{
return (R == std::round_toward_zero && abs(hh.first) > abs(hh.second)) ||
(R == std::round_toward_infinity && hh.second > hh.first) ||
(R == std::round_toward_neg_infinity && hh.second <= hh.first) ?
hh.second : hh.first;
}
class half_test
{
public:
half_test(std::ostream &log, std::ostream *csv, bool fast, bool rough)
: tests_(0), log_(log), csv_(csv), fast_(fast), rough_(rough)
{
//prepare halfs
half_vector batch;
std::uint16_t u = 0;
halfs_.insert(std::make_pair("positive zero", half_vector(1, b2h(u++))));
for (; u < 0x400; ++u)
batch.push_back(b2h(u));
halfs_.insert(std::make_pair("positive subn", std::move(batch)));
batch.clear();
for (; u < 0x7C00; ++u)
batch.push_back(b2h(u));
halfs_.insert(std::make_pair("positive norm", std::move(batch)));
batch.clear();
halfs_.insert(std::make_pair("positive inft", half_vector(1, b2h(u++))));
for (; u < 0x8000; ++u)
batch.push_back(b2h(u));
halfs_.insert(std::make_pair("positive NaN", std::move(batch)));
batch.clear();
halfs_.insert(std::make_pair("negative zero", half_vector(1, b2h(u++))));
for (; u < 0x8400; ++u)
batch.push_back(b2h(u));
halfs_.insert(std::make_pair("negative subn", std::move(batch)));
batch.clear();
for (; u < 0xFC00; ++u)
batch.push_back(b2h(u));
halfs_.insert(std::make_pair("negative norm", std::move(batch)));
batch.clear();
halfs_.insert(std::make_pair("negative inft", half_vector(1, b2h(u++))));
for (; u != 0; ++u)
batch.push_back(b2h(u));
halfs_.insert(std::make_pair("negative NaN", std::move(batch)));
//set classes
classes_["positive zero"] = FP_ZERO;
classes_["positive subn"] = FP_SUBNORMAL;
classes_["positive norm"] = FP_NORMAL;
classes_["positive inft"] = FP_INFINITE;
classes_["positive NaN"] = FP_NAN;
classes_["negative zero"] = FP_ZERO;
classes_["negative subn"] = FP_SUBNORMAL;
classes_["negative norm"] = FP_NORMAL;
classes_["negative inft"] = FP_INFINITE;
classes_["negative NaN"] = FP_NAN;
}
unsigned int test()
{
/*
//test size
simple_test("size", []() { return sizeof(half)*CHAR_BIT >= 16; });
//test conversion
unary_test("float conversion", [](half arg) { return comp(half_cast<half>(half_cast<float>(arg)), arg); });
unary_test("double conversion", [](half arg) { return comp(half_cast<half>(half_cast<double>(arg)), arg); });
unary_test("long double conversion", [](half arg) { return comp(half_cast<half>(half_cast<long double>(arg)), arg); });
//test classification
class_test("fpclassify", [](half arg, int cls) { return fpclassify(arg) == cls; });
class_test("isfinite", [](half arg, int cls) { return isfinite(arg) == (cls!=FP_INFINITE&&cls!=FP_NAN); });
class_test("isinf", [](half arg, int cls) { return isinf(arg) == (cls==FP_INFINITE); });
class_test("isnan", [](half arg, int cls) { return isnan(arg) == (cls==FP_NAN); });
class_test("isnormal", [](half arg, int cls) { return isnormal(arg) == (cls==FP_NORMAL); });
unary_test("signbit", [](half arg) -> bool { double f = arg; return isnan(arg) || f==0.0 || (signbit(arg)==(f<0.0)); });
//test operators
unary_test("prefix increment", [](half arg) -> bool { double f = half_cast<double>(arg);
return comp(static_cast<half>(++f), ++arg) && comp(half_cast<half>(f), arg); });
unary_test("prefix decrement", [](half arg) -> bool { double f = half_cast<double>(arg);
return comp(static_cast<half>(--f), --arg) && comp(half_cast<half>(f), arg); });
unary_test("postfix increment", [](half arg) -> bool { double f = half_cast<double>(arg);
return comp(static_cast<half>(f++), arg++) && comp(half_cast<half>(f), arg); });
unary_test("postfix decrement", [](half arg) -> bool { double f = half_cast<double>(arg);
return comp(static_cast<half>(f--), arg--) && comp(half_cast<half>(f), arg); });
unary_test("unary plus", [](half arg) { return comp(+arg, arg); });
unary_test("unary minus", [](half arg) { return comp(-arg, half_cast<half>(-half_cast<double>(arg))); });
binary_test("addition", [](half a, half b) { return comp(a+b, half_cast<half>(half_cast<double>(a)+half_cast<double>(b))); });
binary_test("subtraction", [](half a, half b) { return comp(a-b, half_cast<half>(half_cast<double>(a)-half_cast<double>(b))); });
binary_test("multiplication", [](half a, half b) { return comp(a*b, half_cast<half>(half_cast<double>(a)*half_cast<double>(b))); });
binary_test("division", [](half a, half b) { return comp(a/b, half_cast<half>(half_cast<double>(a)/half_cast<double>(b))); });
binary_test("equal", [](half a, half b) { return (a==b) == (half_cast<double>(a)==half_cast<double>(b)); });
binary_test("not equal", [](half a, half b) { return (a!=b) == (half_cast<double>(a)!=half_cast<double>(b)); });
binary_test("less", [](half a, half b) { return (a<b) == (half_cast<double>(a)<half_cast<double>(b)); });
binary_test("greater", [](half a, half b) { return (a>b) == (half_cast<double>(a)>half_cast<double>(b)); });
binary_test("less equal", [](half a, half b) { return (a<=b) == (half_cast<double>(a)<=half_cast<double>(b)); });
binary_test("greater equal", [](half a, half b) { return (a>=b) == (half_cast<double>(a)>=half_cast<double>(b)); });
//test basic functions
unary_test("abs", [](half arg) { return comp(abs(arg), half_cast<half>(std::abs(half_cast<double>(arg)))); });
unary_test("fabs", [](half arg) { return comp(fabs(arg), half_cast<half>(std::fabs(half_cast<double>(arg)))); });
binary_test("fmod", [](half x, half y) { return comp(fmod(x, y), half_cast<half>(std::fmod(half_cast<double>(x), half_cast<double>(y)))); });
binary_test("fdim", [](half a, half b) -> bool { half c = fdim(a, b); return isnan(a) || isnan(b) ||
(isinf(a) && isinf(b) && signbit(a)==signbit(b)) || ((a>b) && comp(c, a-b)) || ((a<=b) && comp(c, half_cast<half>(0.0))); });
ternary_test("fma", [](half x, half y, half z) { return comp(fma(x, y, z), half_cast<half>(half_cast<double>(x)*half_cast<double>(y)+half_cast<double>(z))); });
// ternary_reference_test("fma", half_float::fma);
//test exponential functions
unary_reference_test("exp", half_float::exp);
unary_reference_test("exp2", half_float::exp2);
unary_reference_test("expm1", half_float::expm1);
unary_reference_test("log", half_float::log);
unary_reference_test("log10", half_float::log10);
unary_reference_test("log1p", half_float::log1p);
unary_reference_test("log2", half_float::log2);
//test power functions
unary_reference_test("sqrt", half_float::sqrt);
unary_reference_test("cbrt", half_float::cbrt);
binary_reference_test("pow", half_float::pow);
binary_reference_test<half(half,half)>("hypot", half_float::hypot);
// ternary_reference_test<half(half,half,half)>("hypot3", half_float::hypot);
//test trig functions
unary_reference_test("sin", half_float::sin);
unary_reference_test("cos", half_float::cos);
unary_reference_test("tan", half_float::tan);
unary_reference_test("asin", half_float::asin);
unary_reference_test("acos", half_float::acos);
unary_reference_test("atan", half_float::atan);
binary_reference_test("atan2", half_float::atan2);
//test hyp functions
unary_reference_test("sinh", half_float::sinh);
unary_reference_test("cosh", half_float::cosh);
unary_reference_test("tanh", half_float::tanh);
unary_reference_test("asinh", half_float::asinh);
unary_reference_test("acosh", half_float::acosh);
unary_reference_test("atanh", half_float::atanh);
//test err functions
unary_reference_test("erf", half_float::erf);
unary_reference_test("erfc", half_float::erfc);
unary_reference_test("lgamma", half_float::lgamma);
unary_reference_test("tgamma", half_float::tgamma);
//test round functions
unary_test("ceil", [](half arg) { return comp(ceil(arg), half_cast<half>(std::ceil(half_cast<double>(arg)))); });
unary_test("floor", [](half arg) { return comp(floor(arg), half_cast<half>(std::floor(half_cast<double>(arg)))); });
unary_test("trunc", [](half arg) { return !isfinite(arg) || compz(trunc(arg), half_cast<half>(static_cast<int>(arg))); });
unary_test("round", [](half arg) { return !isfinite(arg) || compz(round(arg),
half_cast<half>(static_cast<int>(static_cast<double>(arg)+(signbit(arg) ? -0.5 : 0.5)))); });
unary_test("lround", [](half arg) { return !isfinite(arg) || lround(arg) ==
static_cast<long>(static_cast<double>(arg)+(signbit(arg) ? -0.5 : 0.5)); });
unary_test("nearbyint", [](half arg) { return !isfinite(arg) || compz(nearbyint(arg), half_cast<half>(half_cast<int>(arg))); });
unary_test("rint", [](half arg) { return !isfinite(arg) || compz(rint(arg), half_cast<half>(half_cast<int>(arg))); });
unary_test("lrint", [](half arg) { return !isfinite(arg) || lrint(arg) == half_cast<long>(arg); });
#if HALF_ENABLE_CPP11_LONG_LONG
unary_test("llround", [](half arg) { return !isfinite(arg) || llround(arg) ==
static_cast<long long>(static_cast<double>(arg)+(signbit(arg) ? -0.5 : 0.5)); });
unary_test("llrint", [](half arg) { return !isfinite(arg) || llrint(arg) == half_cast<long long>(arg); });
#endif
//test float functions
unary_test("frexp", [](half arg) -> bool { int eh, ef; bool eq = comp(frexp(arg, &eh),
static_cast<half>(std::frexp(static_cast<double>(arg), &ef))); return eq && (!isfinite(arg) || eh==ef); });
unary_test("ldexp", [](half arg) -> bool { unsigned int passed = 0; for(int i=-50; i<50; ++i) passed +=
comp(ldexp(arg, i), static_cast<half>(std::ldexp(static_cast<double>(arg), i))); return passed==100; });
unary_test("modf", [](half arg) -> bool { half h; double f; return comp(modf(arg, &h), static_cast<half>(
std::modf(static_cast<double>(arg), &f))) && comp(h, static_cast<half>(f)); });
binary_test("nextafter", [](half a, half b) -> bool { half c = nextafter(a, b); std::int16_t d = std::abs(
static_cast<std::int16_t>(h2b(a)-h2b(c))); return ((isnan(a) || isnan(b)) && isnan(c)) ||
(compz(a, b) && compz(b, c)) || ((d==1||d==0x7FFF) && (a<b)==(a<c)); });
binary_test("nexttoward", [](half a, half b) -> bool { half c = nexttoward(a, static_cast<long double>(b)); std::int16_t d = std::abs(
static_cast<std::int16_t>(h2b(a)-h2b(c))); return ((isnan(a) || isnan(b)) && isnan(c)) ||
(compz(a, b) && compz(b, c)) || ((d==1||d==0x7FFF) && (a<b)==(a<c)); });
binary_test("copysign", [](half a, half b) -> bool { half h = copysign(a, b); return comp(abs(h), abs(a)) && signbit(h)==signbit(b); });
#if HALF_ENABLE_CPP11_CMATH
//test basic functions
binary_test("remainder", [](half x, half y) { return comp(remainder(x, y), half_cast<half>(std::remainder(half_cast<double>(x), half_cast<double>(y)))); });
binary_test("remquo", [](half a, half b) -> bool { int qh = 0, qf = 0; return comp(remquo(a, b, &qh),
half_cast<half>(std::remquo(static_cast<double>(a), static_cast<double>(b), &qf))) && (qh&7)==(qf&7); });
binary_test("fmin", [](half x, half y) { return comp(fmin(x, y), half_cast<half>(std::fmin(half_cast<double>(x), half_cast<double>(y)))); });
binary_test("fmax", [](half x, half y) { return comp(fmax(x, y), half_cast<half>(std::fmax(half_cast<double>(x), half_cast<double>(y)))); });
binary_test("fdim", [](half x, half y) { return comp(fdim(x, y), half_cast<half>(std::fdim(half_cast<double>(x), half_cast<double>(y)))); });
ternary_test("fma", [](half x, half y, half z) { return comp(fma(x, y, z), half_cast<half>(std::fma(half_cast<double>(x), half_cast<double>(y), half_cast<double>(z)))); });
//test round functions
unary_test("trunc", [](half arg) { return comp(trunc(arg), half_cast<half>(std::trunc(half_cast<double>(arg)))); });
unary_test("round", [](half arg) { return comp(round(arg), half_cast<half>(std::round(half_cast<double>(arg)))); });
unary_test("lround", [](half arg) { return !isfinite(arg) || lround(arg) == std::lround(static_cast<double>(arg)); });
unary_test("llround", [](half arg) { return !isfinite(arg) || llround(arg) == std::llround(static_cast<double>(arg)); });
#if HALF_ROUND_STYLE == 1
unary_test("nearbyint", [](half arg) { return comp(nearbyint(arg), half_cast<half>(std::nearbyint(half_cast<double>(arg)))); });
unary_test("rint", [](half arg) { return comp(rint(arg), half_cast<half>(std::rint(half_cast<double>(arg)))); });
unary_test("lrint", [](half arg) { return !isfinite(arg) || half_float::lrint(arg) == std::lrint(static_cast<double>(arg)); });
unary_test("llrint", [](half arg) { return !isfinite(arg) || llrint(arg) == std::llrint(static_cast<double>(arg)); });
#endif
//test float functions
unary_test("scalbn", [](half arg) -> bool { unsigned int passed = 0; for(int i=-50; i<50; ++i) passed +=
comp(scalbn(arg, i), static_cast<half>(std::scalbn(static_cast<double>(arg), i))); return passed==100; });
unary_test("scalbln", [](half arg) -> bool { unsigned int passed = 0; for(long i=-50; i<50; ++i) passed +=
comp(scalbln(arg, i), static_cast<half>(std::scalbln(static_cast<double>(arg), i))); return passed==100; });
unary_test("ilogb", [](half arg) { return ilogb(arg) == std::ilogb(static_cast<double>(arg)); });
unary_test("logb", [](half arg) { return comp(logb(arg), static_cast<half>(std::logb(static_cast<double>(arg)))); });
binary_test("copysign", [](half a, half b) { return comp(copysign(a, b),
static_cast<half>(std::copysign(static_cast<double>(a), static_cast<double>(b)))); });
//test classification functions
unary_test("fpclassify", [](half arg) -> bool { int ch=fpclassify(arg), cf=std::fpclassify(
static_cast<double>(arg)); return ch==cf || (ch==FP_SUBNORMAL && cf==FP_NORMAL); });
unary_test("isfinite", [](half arg) { return isfinite(arg) == std::isfinite(static_cast<double>(arg)); });
unary_test("isinf", [](half arg) { return isinf(arg) == std::isinf(static_cast<double>(arg)); });
unary_test("isnan", [](half arg) { return isnan(arg) == std::isnan(static_cast<double>(arg)); });
unary_test("isnormal", [](half arg) { return isnormal(arg) == std::isnormal(static_cast<double>(arg)) ||
(!isnormal(arg) && fpclassify(arg)==FP_SUBNORMAL); });
unary_test("signbit", [](half arg) { return signbit(arg) == std::signbit(static_cast<double>(arg)); });
//test comparison functions
binary_test("isgreater", [](half a, half b) { return isgreater(a, b) == std::isgreater(static_cast<double>(a), static_cast<double>(b)); });
binary_test("isgreaterequal", [](half a, half b) { return isgreaterequal(a, b) == std::isgreaterequal(static_cast<double>(a), static_cast<double>(b)); });
binary_test("isless", [](half a, half b) { return isless(a, b) == std::isless(static_cast<double>(a), static_cast<double>(b)); });
binary_test("islessequal", [](half a, half b) { return islessequal(a, b) == std::islessequal(static_cast<double>(a), static_cast<double>(b)); });
binary_test("islessgreater", [](half a, half b) { return islessgreater(a, b) == std::islessgreater(static_cast<double>(a), static_cast<double>(b)); });
binary_test("isunordered", [](half a, half b) { return isunordered(a, b) == std::isunordered(static_cast<double>(a), static_cast<double>(b)); });
#endif
//test rounding
float_test("round_to_nearest", [](float f) -> bool { half a = half_cast<half,std::round_indeterminate>(f),
b(nextafter(a, copysign(std::numeric_limits<half>::infinity(), a))), h = half_cast<half,std::round_to_nearest>(f);
float af(a), bf(b), hf(h); return half_float::detail::builtin_isnan(f) || (std::abs(hf)>std::abs(f)&&comp(h, b)&&((std::abs(f-af)>std::abs(bf-f) ||
(std::abs(f-af)==std::abs(bf-f)&&!(h2b(h)&1)))||isinf(h))) || (std::abs(hf)<=std::abs(f)&&comp(h, a)&&((std::abs(f-af)<std::abs(bf-f) ||
(std::abs(f-af)==std::abs(bf-f)&&!(h2b(h)&1)))||isinf(h))); });
float_test("round_toward_zero", [](float f) -> bool { half a = half_cast<half,std::round_indeterminate>(f),
h = half_cast<half,std::round_toward_zero>(f); float af(a), hf(h); return half_float::detail::builtin_isnan(f) || isinf(a) || af == hf; });
float_test("round_toward_infinity", [](float f) -> bool { half a = half_cast<half,std::round_toward_zero>(f),
b(nextafter(a, copysign(std::numeric_limits<half>::infinity(), a))), h = half_cast<half,std::round_toward_infinity>(f);
float hf(h); return half_float::detail::builtin_isnan(f) || (comp(h, a)&&(signbit(h)||hf==f)) || (comp(h, b)&&!signbit(h)&&hf>f); });
float_test("round_toward_neg_infinity", [](float f) -> bool { half a = half_cast<half,std::round_toward_zero>(f),
b(nextafter(a, copysign(std::numeric_limits<half>::infinity(), a))), h = half_cast<half,std::round_toward_neg_infinity>(f);
float hf(h); return half_float::detail::builtin_isnan(f) || (comp(h, a)&&(!signbit(h)||hf==f)) || (comp(h, b)&&signbit(h)&&hf<f); });
//test float casting
auto rand23 = std::bind(std::uniform_int_distribution<std::uint32_t>(0, (1<<23)-1), std::default_random_engine());
unary_test("half_cast<float>", [](half arg) -> bool { float a = half_cast<float>(arg), b = static_cast<float>(arg);
return *reinterpret_cast<std::uint32_t*>(&a) == *reinterpret_cast<std::uint32_t*>(&b); });
unary_test("half_cast<round_to_nearest>(float)", [&rand23](half arg) -> bool { float f = half_cast<float>(arg);
std::uint32_t n=rand23(), m=1<<13; if(fpclassify(arg)==FP_SUBNORMAL) m <<= std::min(std::max(-ilogb(arg)-14, 0), 10);
*reinterpret_cast<std::uint32_t*>(&f) |= n&(m-1)&-isfinite(arg); return fpclassify(arg)==FP_ZERO ||
comp(half_cast<half,std::round_to_nearest>(f), ((n&(m>>1)) && ((n&((m>>1)-1)) || (h2b(arg)&1)))
? nextafter(arg, copysign(std::numeric_limits<half>::infinity(), arg)) : arg); });
unary_test("half_cast<round_toward_zero>(float)", [&rand23](half arg) -> bool { float f = half_cast<float>(arg);
std::uint32_t n=rand23(), m=1<<13; if(fpclassify(arg)==FP_SUBNORMAL) m <<= std::min(std::max(-ilogb(arg)-14, 0), 10);
*reinterpret_cast<std::uint32_t*>(&f) |= n&(m-1)&-isfinite(arg); return comp(half_cast<half,std::round_toward_zero>(f), arg); });
unary_test("half_cast<round_toward_infinity>(float)", [&rand23](half arg) -> bool { float f = half_cast<float>(arg);
std::uint32_t n=rand23(), m=1<<13; if(fpclassify(arg)==FP_SUBNORMAL) m <<= std::min(std::max(-ilogb(arg)-14, 0), 10);
*reinterpret_cast<std::uint32_t*>(&f) |= n&(m-1)&-isfinite(arg); return comp(half_cast<half,std::round_toward_infinity>(f),
(!signbit(arg)&&(n&(m-1))) ? nextafter(arg, copysign(std::numeric_limits<half>::infinity(), arg)) : arg); });
unary_test("half_cast<round_toward_neg_infinity>(float)", [&rand23](half arg) -> bool { float f = half_cast<float>(arg);
std::uint32_t n=rand23(), m=1<<13; if(fpclassify(arg)==FP_SUBNORMAL) m <<= std::min(std::max(-ilogb(arg)-14, 0), 10);
*reinterpret_cast<std::uint32_t*>(&f) |= n&(m-1)&-isfinite(arg); return comp(half_cast<half,std::round_toward_neg_infinity>(f),
(signbit(arg)&&(n&(m-1))) ? nextafter(arg, copysign(std::numeric_limits<half>::infinity(), arg)) : arg); });
//test double casting
auto rand52 = std::bind(std::uniform_int_distribution<std::uint64_t>(0, (1ULL<<52)-1), std::default_random_engine());
unary_test("half_cast<double>", [](half arg) -> bool { double a = half_cast<double>(arg), b = static_cast<float>(arg);
return isnan(arg) || *reinterpret_cast<std::uint64_t*>(&a) == *reinterpret_cast<std::uint64_t*>(&b); });
unary_test("half_cast<round_to_nearest>(double)", [&rand52](half arg) -> bool { double f = half_cast<double>(arg);
std::uint64_t n=rand52(), m=1ULL<<42; if(fpclassify(arg)==FP_SUBNORMAL) m <<= std::min(std::max(-ilogb(arg)-14, 0), 10);
*reinterpret_cast<std::uint64_t*>(&f) |= n&(m-1)&-isfinite(arg); return fpclassify(arg)==FP_ZERO ||
comp(half_cast<half,std::round_to_nearest>(f), ((n&(m>>1)) && ((n&((m>>1)-1)) || (h2b(arg)&1)))
? nextafter(arg, copysign(std::numeric_limits<half>::infinity(), arg)) : arg); });
unary_test("half_cast<round_toward_zero>(double)", [&rand52](half arg) -> bool { double f = half_cast<double>(arg);
std::uint64_t n=rand52(), m=1ULL<<42; if(fpclassify(arg)==FP_SUBNORMAL) m <<= std::min(std::max(-ilogb(arg)-14, 0), 10);
*reinterpret_cast<std::uint64_t*>(&f) |= n&(m-1)&-isfinite(arg); return comp(half_cast<half,std::round_toward_zero>(f), arg); });
unary_test("half_cast<round_toward_infinity>(double)", [&rand52](half arg) -> bool { double f = half_cast<double>(arg);
std::uint64_t n=rand52(), m=1ULL<<42; if(fpclassify(arg)==FP_SUBNORMAL) m <<= std::min(std::max(-ilogb(arg)-14, 0), 10);
*reinterpret_cast<std::uint64_t*>(&f) |= n&(m-1)&-isfinite(arg); return comp(half_cast<half,std::round_toward_infinity>(f),
(!signbit(arg)&&(n&(m-1))) ? nextafter(arg, copysign(std::numeric_limits<half>::infinity(), arg)) : arg); });
unary_test("half_cast<round_toward_neg_infinity>(double)", [&rand52](half arg) -> bool { double f = half_cast<double>(arg);
std::uint64_t n=rand52(), m=1ULL<<42; if(fpclassify(arg)==FP_SUBNORMAL) m <<= std::min(std::max(-ilogb(arg)-14, 0), 10);
*reinterpret_cast<std::uint64_t*>(&f) |= n&(m-1)&-isfinite(arg); return comp(half_cast<half,std::round_toward_neg_infinity>(f),
(signbit(arg)&&(n&(m-1))) ? nextafter(arg, copysign(std::numeric_limits<half>::infinity(), arg)) : arg); });
//test casting to int
#if HALF_ENABLE_CPP11_CMATH
unary_test("half_cast<int>", [](half arg) -> bool { return !isfinite(arg) || half_cast<int>(arg) == static_cast<int>(nearbyint(arg)); });
#endif
unary_test("half_cast<int,round_to_nearest>", [](half arg) -> bool { float fi, ff = std::abs(std::modf(static_cast<float>(arg), &fi));
int i = static_cast<int>(fi); i += (-2*signbit(arg)+1) * (ff>0.5f || (ff==0.5f && i&1));
return !isfinite(arg) || half_cast<int,std::round_to_nearest>(arg) == i;
});
unary_test("half_cast<int,round_toward_zero>", [](half arg) -> bool { return !isfinite(arg) || half_cast<int,std::round_toward_zero>(arg) == static_cast<int>(arg); });
unary_test("half_cast<int,round_toward_infinity>", [](half arg) -> bool { float fi, ff = std::modf(static_cast<float>(arg), &fi);
return !isfinite(arg) || half_cast<int,std::round_toward_infinity>(arg) == (static_cast<int>(fi)+(ff>0.0f)); });
unary_test("half_cast<int,round_toward_neg_infinity>", [](half arg) -> bool { float fi, ff = std::modf(static_cast<float>(arg), &fi);
return !isfinite(arg) || half_cast<int,std::round_toward_neg_infinity>(arg) == (static_cast<int>(fi)-(ff<0.0f)); });
//test casting from int
int_test("half_cast<>(int)", [](int i) -> bool { return comp(half_cast<half>(i), half_cast<half>(static_cast<float>(i))); });
int_test("half_cast<round_to_nearest>(int)", [](int i) -> bool {
return comp(half_cast<half,std::round_to_nearest>(i), half_cast<half,std::round_to_nearest>(static_cast<float>(i))); });
int_test("half_cast<round_toward_zero>(int)", [](int i) -> bool {
return comp(half_cast<half,std::round_toward_zero>(i), half_cast<half,std::round_toward_zero>(static_cast<float>(i))); });
int_test("half_cast<round_toward_infinity>(int)", [](int i) -> bool {
return comp(half_cast<half,std::round_toward_infinity>(i), half_cast<half,std::round_toward_infinity>(static_cast<float>(i))); });
int_test("half_cast<round_toward_neg_infinity>(int)", [](int i) -> bool {
return comp(half_cast<half,std::round_toward_neg_infinity>(i), half_cast<half,std::round_toward_neg_infinity>(static_cast<float>(i))); });
//test numeric limits
unary_test("numeric_limits::min", [](half arg) { return !isnormal(arg) || signbit(arg) || arg>=std::numeric_limits<half>::min(); });
unary_test("numeric_limits::lowest", [](half arg) { return !isfinite(arg) || arg>=std::numeric_limits<half>::lowest(); });
unary_test("numeric_limits::max", [](half arg) { return !isfinite(arg) || arg<=std::numeric_limits<half>::max(); });
unary_test("numeric_limits::denorm_min", [](half arg) { return !isfinite(arg) ||
signbit(arg) || arg==static_cast<half>(0.0f) || arg>=std::numeric_limits<half>::denorm_min(); });
simple_test("numeric_limits::infinity", []() { return isinf(std::numeric_limits<half>::infinity()) &&
!signbit(std::numeric_limits<half>::infinity()); });
simple_test("numeric_limits::quiet_NaN", []() { return isnan(std::numeric_limits<half>::quiet_NaN()); });
simple_test("numeric_limits::signaling_NaN", []() { return isnan(std::numeric_limits<half>::signaling_NaN()); });
simple_test("numeric_limits::epsilon", []() { return nextafter(static_cast<half>(1.0f),
std::numeric_limits<half>::infinity())-static_cast<half>(1.0f) == std::numeric_limits<half>::epsilon(); });
binary_test("numeric_limits::round_error", [](half a, half b) -> bool { double c = static_cast<double>(a) +
static_cast<double>(b); return !isfinite(a) || !isfinite(b) || c>static_cast<double>(std::numeric_limits<half>::max()) ||
c<static_cast<double>(std::numeric_limits<half>::lowest()) || std::abs(c-static_cast<double>(
static_cast<half>(c)))<=std::ldexp(static_cast<double>(std::numeric_limits<half>::round_error()),
ilogb(static_cast<half>(c))-std::numeric_limits<half>::digits+1); });
#if HALF_ENABLE_CPP11_HASH
//test hash
binary_test("hash function", [](half a, half b) { return a != b || std::hash<half>()(a) == std::hash<half>()(b); });
struct { bool operator()(half a, half b) const { return h2b(a) == h2b(b); } } bincomp;
std::unordered_map<half,const half*,std::hash<half>,decltype(bincomp)> map(65536, std::hash<half>(), bincomp);
unary_test("hash insert", [&map](const half &arg) { return map.insert(std::make_pair(arg, &arg)).second; });
unary_test("hash retrieve", [&map](const half &arg) { return map[arg] == &arg; });
#endif
#if HALF_ENABLE_CPP11_USER_LITERALS
//test literals
simple_test("literals", []() -> bool { using namespace half_float::literal; return comp(0.0_h, half(0.0f)) && comp(-1.0_h, half(-1.0f)) &&
comp(+3.14159265359_h, half(3.14159265359f)) && comp(1e-2_h, half(1e-2f)) && comp(-4.2e3_h, half(-4.2e3f)); });
#endif
*/
if (failed_.empty())
log_ << "all tests passed\n";
else
{
log_ << (failed_.size()) << " OF " << tests_ << " FAILED:\n ";
std::copy(failed_.begin(), failed_.end(), std::ostream_iterator<std::string>(log_, "\n "));
log_ << '\n';
}
return failed_.size();
}
void performance_test()
{
std::vector<half> finite, positive, one2one, one2inf, neg2inf;
for (std::uint16_t u = 0; u < 0x7C00; ++u)
{
finite.push_back(b2h(u));
finite.push_back(b2h(u | 0x8000));
positive.push_back(b2h(u));
neg2inf.push_back(b2h(u));
if (u <= 0x3C00)
{
one2one.push_back(b2h(u));
one2one.push_back(b2h(u | 0x8000));
neg2inf.push_back(b2h(u | 0x8000));
}
else
one2inf.push_back(b2h(u));
}
std::vector<half> xs(finite), ys(finite), zs(finite), results(finite.size());
std::default_random_engine g;
std::shuffle(finite.begin(), finite.end(), g);
std::shuffle(positive.begin(), positive.end(), g);
std::shuffle(one2one.begin(), one2one.end(), g);
std::shuffle(one2inf.begin(), one2inf.end(), g);
std::shuffle(neg2inf.begin(), neg2inf.end(), g);
std::shuffle(xs.begin(), xs.end(), g);
std::shuffle(ys.begin(), ys.end(), g);
std::shuffle(zs.begin(), zs.end(), g);
/*
OPERATOR_PERFORMANCE_TEST(+, xs, ys, 4);
OPERATOR_PERFORMANCE_TEST(-, xs, ys, 4);
OPERATOR_PERFORMANCE_TEST(*, xs, ys, 4);
OPERATOR_PERFORMANCE_TEST(/, xs, ys, 4);
BINARY_PERFORMANCE_TEST(fdim, xs, ys, 8);
TERNARY_PERFORMANCE_TEST(fma, xs, ys, zs, 64);
UNARY_PERFORMANCE_TEST(exp, finite, 1000);
UNARY_PERFORMANCE_TEST(exp2, finite, 1000);
UNARY_PERFORMANCE_TEST(expm1, finite, 1000);
UNARY_PERFORMANCE_TEST(log, positive, 1000);
UNARY_PERFORMANCE_TEST(log10, positive, 1000);
UNARY_PERFORMANCE_TEST(log1p, neg2inf, 1000);
UNARY_PERFORMANCE_TEST(log2, positive, 1000);
UNARY_PERFORMANCE_TEST(sqrt, positive, 1000);
UNARY_PERFORMANCE_TEST(cbrt, finite, 1000);
BINARY_PERFORMANCE_TEST(pow, xs, ys, 8);
BINARY_PERFORMANCE_TEST(hypot, xs, ys, 8);
UNARY_PERFORMANCE_TEST(sin, finite, 1000);
UNARY_PERFORMANCE_TEST(cos, finite, 1000);
UNARY_PERFORMANCE_TEST(tan, finite, 1000);
UNARY_PERFORMANCE_TEST(asin, one2one, 1000);
UNARY_PERFORMANCE_TEST(acos, one2one, 1000);
UNARY_PERFORMANCE_TEST(atan, finite, 1000);
BINARY_PERFORMANCE_TEST(atan2, xs, ys, 8);
UNARY_PERFORMANCE_TEST(sinh, finite, 1000);
UNARY_PERFORMANCE_TEST(cosh, finite, 1000);
UNARY_PERFORMANCE_TEST(tanh, finite, 1000);
UNARY_PERFORMANCE_TEST(asinh, finite, 1000);
UNARY_PERFORMANCE_TEST(acosh, one2inf, 1000);
UNARY_PERFORMANCE_TEST(atanh, one2one, 1000);
UNARY_PERFORMANCE_TEST(erf, finite, 1000);
UNARY_PERFORMANCE_TEST(erfc, finite, 1000);
UNARY_PERFORMANCE_TEST(lgamma, finite, 1000);
UNARY_PERFORMANCE_TEST(tgamma, finite, 1000);
*/
}
private:
typedef std::vector<half> half_vector;
typedef std::map<std::string, half_vector> test_map;
typedef std::map<std::string, int> class_map;
template<typename F> bool class_test(const std::string &name, F &&test)
{
unsigned int count = 0;
log_ << "testing " << name << ":\n";
for (auto iterB = halfs_.begin(); iterB != halfs_.end(); ++iterB)
{
unsigned int passed = 0;
int fpclass = classes_[iterB->first];
for (auto iterH = iterB->second.begin(); iterH != iterB->second.end(); ++iterH)
passed += test(*iterH, fpclass);
log_ << " " << iterB->first << ": ";
if (passed == iterB->second.size())
{
log_ << "all passed\n";
++count;
}
else
log_ << (iterB->second.size() - passed) << " of " << iterB->second.size() << " FAILED\n";
}
log_ << '\n';
++tests_;
if (count == halfs_.size())
return true;
failed_.push_back(name);
return false;
}
template<typename F> bool simple_test(const std::string &name, F &&test)
{
log_ << "testing " << name << ": ";
bool passed = test();
log_ << (passed ? "passed" : "FAILED") << "\n\n";
++tests_;
if (!passed)
failed_.push_back(name);
return passed;
}
template<typename F> bool unary_test(const std::string &name, F &&test)
{
unsigned int count = 0, failed = 0;
log_ << "testing " << name << ":\n";
for (auto iterB = halfs_.begin(); iterB != halfs_.end(); ++iterB)
{
unsigned int passed = 0;
for (auto iterH = iterB->second.begin(); iterH != iterB->second.end(); ++iterH)
passed += test(*iterH);
log_ << " " << iterB->first << ": ";
if (passed == iterB->second.size())
{
log_ << "all passed\n";
++count;
}
else
{
failed += iterB->second.size() - passed;
log_ << (iterB->second.size() - passed) << " of " << iterB->second.size() << " FAILED\n";
}
}
if (csv_)
*csv_ << name << ";" << failed << '\n';
if (failed)
log_ << failed << " FAILED\n\n";
else
log_ << '\n';
++tests_;
if (count == halfs_.size())
return true;
failed_.push_back(name);
return false;
}
template<typename F> bool binary_test(const std::string &name, F &&test)
{
unsigned long tests = 0, count = 0, step = fast_ ? 64 : 1;
auto rand = std::bind(std::uniform_int_distribution<std::uint16_t>(0, step - 1), std::default_random_engine());
std::set<std::string> failed_tests;
log_ << "testing " << name << (fast_ ? ": " : ":\n");
for (auto iterB1 = halfs_.begin(); iterB1 != halfs_.end(); ++iterB1)
{
unsigned int end1 = /*(iterB1->first.find("NaN")==std::string::npos) ?*/ iterB1->second.size() /*: 1*/;
for (auto iterB2 = halfs_.begin(); iterB2 != halfs_.end(); ++iterB2)
{
if (!fast_)
std::cout << iterB1->first << " x " << iterB2->first;
bool failed = false;
unsigned int end2 = /*(iterB2->first.find("NaN")==std::string::npos) ?*/ iterB2->second.size() /*: 1*/;
for (unsigned int i = 0; i < end1; i += step)
{
half a = iterB1->second[i];
if (fast_ && end1 >= step)
a = b2h(h2b(a) | rand());
for (unsigned int j = 0; j < end2; j += step)
{
half b = iterB2->second[j];
if (fast_ && end2 >= step)
b = b2h(h2b(b) | rand());
bool success = test(a, b);
count += success;
failed = failed || !success;
++tests;
}
}
if (!fast_)
std::cout << " done\n";
if (failed)
failed_tests.insert(iterB1->first + " x " + iterB2->first);
}
}
bool passed = count == tests;
if (csv_)
*csv_ << name << ";" << (tests - count) << '\n';
if (passed)
log_ << "all passed\n\n";
else
{
log_ << (tests - count) << " of " << tests << " FAILED\n";
for (auto &&s : failed_tests)
log_ << s << " FAILED\n";
log_ << '\n';
failed_.push_back(name);
}
++tests_;
return passed;
}
template<typename F> bool ternary_test(const std::string &name, F &&test)
{
unsigned int tests = 0, count = 0, step = fast_ ? 256 : 1;
auto rand = std::bind(std::uniform_int_distribution<std::uint16_t>(0, step - 1), std::default_random_engine());
std::set<std::string> failed_tests;
log_ << "testing " << name << ": ";
for (auto iterB1 = halfs_.begin(); iterB1 != halfs_.end(); ++iterB1)
{
unsigned int end1 = /*(iterB1->first.find("NaN")==std::string::npos) ?*/ iterB1->second.size() /*: 1*/;
for (auto iterB2 = halfs_.begin(); iterB2 != halfs_.end(); ++iterB2)
{
unsigned int end2 = /*(iterB2->first.find("NaN")==std::string::npos) ?*/ iterB2->second.size() /*: 1*/;
for (auto iterB3 = halfs_.begin(); iterB3 != halfs_.end(); ++iterB3)
{
bool failed = false;
unsigned int end3 = /*(iterB3->first.find("NaN")==std::string::npos) ?*/ iterB3->second.size() /*: 1*/;
for (unsigned int i = 0; i < end1; i += step)
{
half a = iterB1->second[i];
if (fast_ && end1 >= step)
a = b2h(h2b(a) | rand());
for (unsigned int j = 0; j < end2; j += step)
{
half b = iterB2->second[j];
if (fast_ && end2 >= step)
b = b2h(h2b(b) | rand());
for (unsigned int k = 0; k < end3; k += step)
{
half c = iterB3->second[k];
if (fast_ && end3 >= step)
c = b2h(h2b(c) | rand());
bool success = test(a, b, c);
count += success;
failed = failed || !success;
++tests;
}
}
}
if (failed)
failed_tests.insert(iterB1->first + " x " + iterB2->first + " x " + iterB3->first);
}
}
}
bool passed = count == tests;
if (csv_)
*csv_ << name << ";" << (tests - count) << '\n';
if (passed)
log_ << "all passed\n\n";
else
{
log_ << (tests - count) << " of " << tests << " FAILED\n";
for (auto &&s : failed_tests)
log_ << s << " FAILED\n";
log_ << '\n';
failed_.push_back(name);
}
++tests_;
return passed;
}
template<typename F> bool float_test(const std::string &name, F &&test)
{
auto rand32 = std::bind(std::uniform_int_distribution<std::uint32_t>(0, std::numeric_limits<std::uint32_t>::max()), std::default_random_engine());
unsigned long long count = 0, tests = fast_ ? 1e6 : (1ULL << 32);
log_ << "testing " << name << ": ";
if (fast_)
{
for (unsigned long long i = 0; i < tests; ++i)
{
std::uint32_t u = rand32();
count += test(*reinterpret_cast<float*>(&u));
}
}
else
{
// exhaustive mode: visit every 32-bit pattern exactly once; a do-while
// is needed because the uint32 counter wraps back to 0 after 0xFFFFFFFF,
// and the original for-loop condition (i++ > 0 with i starting at 0)
// would never execute
std::uint32_t i = 0;
do
count += test(*reinterpret_cast<float*>(&i));
while (++i != 0);
}
bool passed = count == tests;
if (passed)
log_ << "all passed\n\n";
else
{
log_ << (tests - count) << " of " << tests << " FAILED\n\n";
failed_.push_back(name);
}
++tests_;
return passed;
}
template<typename F> bool int_test(const std::string &name, F &&test)
{
unsigned int count = 0, tests = (1 << 17) + 1;
log_ << "testing " << name << ": ";
for (int i = -(1 << 16); i <= (1 << 16); ++i)
count += test(i);
bool passed = count == tests;
if (passed)
log_ << "all passed\n\n";
else
{
log_ << (tests - count) << " of " << tests << " FAILED\n\n";
failed_.push_back(name);
}
++tests_;
return passed;
}
template<typename F> bool unary_reference_test(const std::string &name, F &&fn)
{
std::vector<std::pair<half, half>> reference(std::numeric_limits<std::uint16_t>::max() + 1);
std::ifstream in("reference/" + name, std::ios_base::in | std::ios_base::binary);
if (!in)
throw std::runtime_error("cannot open reference file for " + name);
in.read(reinterpret_cast<char*>(reference.data()), reference.size() * sizeof(reference.front()));
double err = 0.0, rel = 0.0; int bin = 0;
bool success = unary_test(name, [&, this](half arg) -> bool {
auto ref = reference[h2b(arg)];
half a = fn(arg), b = select<std::numeric_limits<half>::round_style>(ref);
bool equal = (rough_ || std::numeric_limits<half>::round_style == std::round_indeterminate) ? (comp(a, ref.first) || comp(a, ref.second)) : comp(a, b);
if (!equal)
{
double error = std::abs(static_cast<double>(a) - static_cast<double>(b));
// if(std::abs(h2b(a)-h2b(b)) > 1)
// if(std::isinf(error/std::abs(b)))
// std::cerr << arg << '(' << std::hex << h2b(arg) << ") = " << a << '(' << std::hex << h2b(a) << "), " << b << '(' << h2b(b) << ") -> " << error << '\n' << std::dec;
err = std::max(err, error); rel = std::max(rel, error / std::abs(b)); bin = std::max(bin, std::abs(h2b(a) - h2b(b)));
}
return equal;
});
if (err != 0.0 || rel != 0.0)
std::cout << name << " max error: " << err << ", max relative error: " << rel << ", max ulp error: " << /*ilog2*/(bin) << '\n';
return success;
}
template<typename F> bool binary_reference_test(const std::string &name, F &&fn)
{
struct record { half x, y; std::pair<half, half> result; };
std::ifstream in("reference/" + name, std::ios_base::in | std::ios_base::binary | std::ios_base::ate);
if (!in)
throw std::runtime_error("cannot open reference file for " + name);
unsigned int passed = 0, count = in.tellg() / sizeof(record);
std::vector<record> reference(count);
in.seekg(0, std::ios_base::beg);
in.clear();
in.read(reinterpret_cast<char*>(reference.data()), reference.size() * sizeof(reference.front()));
double err = 0.0, rel = 0.0; int bin = 0;
bool success = simple_test(name, [&, this]() -> bool {
for (unsigned int i = 0; i < count; ++i)
{
auto ref = reference[i];
half x = ref.x, y = ref.y, a = fn(x, y), b = select<std::numeric_limits<half>::round_style>(ref.result);
bool equal = (rough_ || std::numeric_limits<half>::round_style == std::round_indeterminate) ? (comp(a, ref.result.first) || comp(a, ref.result.second)) : comp(a, b);
if (!equal)
{
double error = std::abs(static_cast<double>(a) - static_cast<double>(b));
// if(std::abs(h2b(a)-h2b(b)) > 1)
// std::cerr << x << ", " << y << " = " << a << '(' << std::hex << h2b(a) << "), " << b << '(' << h2b(b) << ") -> " << error << '\n' << std::dec;
err = std::max(err, error); rel = std::max(rel, error / std::abs(b)); bin = std::max(bin, std::abs(h2b(a) - h2b(b)));
}
passed += equal;
}
if (csv_)
*csv_ << name << ";" << (count - passed) << '\n';
return passed == count;
});
if (passed != count)
std::cout << name << ": " << (count - passed) << " of " << count << " failed\n";
if (err != 0.0 || rel != 0.0)
std::cout << name << " max error: " << err << ", max relative error: " << rel << ", max ulp error: " << /*ilog2*/(bin) << '\n';
return success;
}
template<typename F> bool ternary_reference_test(const std::string &name, F &&fn)
{
struct record { half x, y, z; std::pair<half, half> result; };
std::ifstream in("reference/" + name, std::ios_base::in | std::ios_base::binary | std::ios_base::ate);
if (!in)
throw std::runtime_error("cannot open reference file for " + name);
unsigned int passed = 0, count = in.tellg() / sizeof(record);
std::vector<record> reference(count);
in.seekg(0, std::ios_base::beg);
in.clear();
in.read(reinterpret_cast<char*>(reference.data()), reference.size() * sizeof(reference.front()));
double err = 0.0, rel = 0.0; int bin = 0;
bool success = simple_test(name, [&, this]() -> bool {
for (unsigned int i = 0; i < count; ++i)
{
auto ref = reference[i];
half x = ref.x, y = ref.y, z = ref.z, a = fn(x, y, z), b = select<std::numeric_limits<half>::round_style>(ref.result);
bool equal = (rough_ || std::numeric_limits<half>::round_style == std::round_indeterminate) ? (comp(a, ref.result.first) || comp(a, ref.result.second)) : comp(a, b);
if (!equal)
{
double error = std::abs(static_cast<double>(a) - static_cast<double>(b));
// std::cerr << x << ", " << y << ", " << z << " = " << a << '(' << std::hex << h2b(a) << "), " << b << '(' << h2b(b) << ") -> " << error << '\n' << std::dec;
err = std::max(err, error); rel = std::max(rel, error / std::abs(b)); bin = std::max(bin, std::abs(h2b(a) - h2b(b)));
}
passed += equal;
}
if (csv_)
*csv_ << name << ";" << (count - passed) << '\n';
return passed == count;
});
if (passed != count)
std::cout << name << ": " << (count - passed) << " of " << count << " failed\n";
if (err != 0.0 || rel != 0.0)
std::cout << name << " max error: " << err << ", max relative error: " << rel << ", max ulp error: " << /*ilog2*/(bin) << '\n';
return success;
}
test_map halfs_;
class_map classes_;
unsigned int tests_;
std::vector<std::string> failed_;
std::ostream &log_;
std::ostream *csv_;
bool fast_;
bool rough_;
};
struct timer
{
timer() : start_(std::chrono::high_resolution_clock::now()) {}
~timer() { std::cout << "time: " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start_).count() << " ms\n"; }
private:
std::chrono::time_point<std::chrono::high_resolution_clock> start_;
};
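/*
A minimal usage sketch for the scope-based timer above: construct it at
the top of the block to be measured and its destructor prints the elapsed
milliseconds on scope exit (run_workload is a placeholder name, not part
of the original code).
*/
static void run_workload() { /* placeholder for the work being timed */ }
static void timed_example()
{
timer time; // clock starts here
run_workload(); // measured work
} // ~timer() prints "time: ... ms" here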
int main(int argc, char *argv[]) try
{
#ifndef HALF_ARITHMETIC_TYPE
switch (std::numeric_limits<half>::round_style)
{
#ifdef _WIN32
case std::round_to_nearest: _controlfp(_MCW_RC, _RC_NEAR); break;
case std::round_toward_zero: _controlfp(_MCW_RC, _RC_CHOP); break;
case std::round_toward_infinity: _controlfp(_MCW_RC, _RC_UP); break;
case std::round_toward_neg_infinity: _controlfp(_MCW_RC, _RC_DOWN); break;
#else
case std::round_to_nearest: std::fesetround(FE_TONEAREST); break;
case std::round_toward_zero: std::fesetround(FE_TOWARDZERO); break;
case std::round_toward_infinity: std::fesetround(FE_UPWARD); break;
case std::round_toward_neg_infinity: std::fesetround(FE_DOWNWARD); break;
#endif
}
#endif
/*
auto rand_abs = std::bind(std::uniform_int_distribution<std::uint32_t>(0x00000000, 0x7F100000), std::default_random_engine());
auto rand_sign = std::bind(std::uniform_int_distribution<std::uint32_t>(0, 1), std::default_random_engine());
std::vector<float> floats;
for(unsigned int i=0; i<1e8; ++i)
{
auto bits = rand_abs() | (rand_sign()<<31);
floats.push_back(*reinterpret_cast<float*>(&bits));
}
std::shuffle(floats.begin(), floats.end(), std::default_random_engine());
std::vector<half> halfs(floats.size());
{
timer time;
for(std::size_t i=0; i<floats.size(); ++i)
halfs[i] = half_cast<half,std::round_to_nearest>(floats[i]);
}
return 0;
half pi = half_cast<half>(3.1415926535897932384626433832795l);
std::cout << "Pi: " << pi << " - 0x" << std::hex << std::setfill('0') << std::setw(4) << h2b(pi) << std::dec
<< " - " << std::bitset<16>(static_cast<unsigned long long>(h2b(pi))).to_string() << std::endl;
half e = half_cast<half>(2.7182818284590452353602874713527l);
std::cout << "e: " << e << " - 0x" << std::hex << std::setfill('0') << std::setw(4) << h2b(e) << std::dec
<< " - " << std::bitset<16>(static_cast<unsigned long long>(h2b(e))).to_string() << std::endl;
static const long double logs[] = {
1.0000000000000000000000000000000000000000000000000000000000000000000000000000L, 0.5849625007211561814537389439478165087598144076924810604557526545410982276485L,
0.3219280948873623478703194294893901758648313930245806120547563958159347765589L, 0.1699250014423123629074778878956330175196288153849621209115053090821964552970L,
0.0874628412503394082540660108104043540112672823448206881266090643866965081686L, 0.0443941193584534376531019906736094674630459333742491317685543002674288465967L,
0.0223678130284545082671320837460849094932677948156179815932199216587899627785L, 0.0112272554232541203378805844158839407281095943600297940811823651462712311786L,
0.0056245491938781069198591026740666017211096815383520359072957784732489771013L, 0.0028150156070540381547362547502839489729507927389771959487826944878598909400L,
0.0014081943928083889066101665016890524233311715793462235597709051792834906001L, 0.0007042690112466432585379340422201964456668872087249334581924550139514213168L,
0.0003521774803010272377989609925281744988670304302127133979341729842842377649L, 0.0001760994864425060348637509459678580940163670081839283659942864068257522373L,
0.0000880524301221769086378699983597183301490534085738474534831071719854721939L, 0.0000440268868273167176441087067175806394819146645511899503059774914593663365L,
0.0000220136113603404964890728830697555571275493801909791504158295359319433723L, 0.0000110068476674814423006223021573490183469930819844945565597452748333526464L,
0.0000055034343306486037230640321058826431606183125807276574241540303833251704L, 0.0000027517197895612831123023958331509538486493412831626219340570294203116559L,
0.0000013758605508411382010566802834037147561973553922354232704569052932922954L, 0.0000006879304394358496786728937442939160483304056131990916985043387874690617L,
0.0000003439652607217645360118314743718005315334062644619363447395987584138324L, 0.0000001719826406118446361936972479533123619972434705828085978955697643547921L,
0.0000000859913228686632156462565208266682841603921494181830811515318381744650L, 0.0000000429956620750168703982940244684787907148132725669106053076409624949917L,
0.0000000214978311976797556164155504126645192380395989504741781512309853438587L, 0.0000000107489156388827085092095702361647949603617203979413516082280717515504L,
0.0000000053744578294520620044408178949217773318785601260677517784797554422804L, 0.0000000026872289172287079490026152352638891824761667284401180026908031182361L,
0.0000000013436144592400232123622589569799954658536700992739887706412976115422L, 0.0000000006718072297764289157920422846078078155859484240808550018085324187007L };
std::ofstream out("logs.txt");
for(auto val : logs)
out << "0x" << std::hex << std::uppercase << std::setfill('0') << std::setw(8) << std::llrint(std::ldexp(val, 27)) << ", \n";
return 0;
using namespace half_float::literal;
std::cout << "0x" << std::hex << std::uppercase << std::setfill('0') << std::setw(8) << std::llrint(std::ldexp(0.6072529350088812561694l, 30)) << '\n';
std::ofstream out("atans.txt");
for(int i=0; i<32; ++i)
out << "0x" << std::hex << std::uppercase << std::setfill('0') << std::setw(8) << std::llrint(std::ldexp(std::atan(std::ldexp(1.0l, -i)), 30)) << ", \n";
return 0;
for(std::uint16_t i=0x3C00; i<0x7C00; ++i)
{
half x = b2h(i), y = half_cast<half,std::round_toward_neg_infinity>(std::erfc(half_cast<double>(x)));
std::cout << x << " (" << std::hex << std::uppercase << std::setfill('0') << std::setw(4) << i << std::dec << ")\t= " << y << '\n';
if(y == 0.0_h)
return 0;
}
std::cout << std::hex << std::uppercase << std::setfill('0') << std::setw(9) << std::llrint(std::ldexp(3.15l, 31-1)) << '\n';
std::cout << std::hex << std::uppercase << std::setfill('0') << std::setw(9) << std::llrint(std::ldexp(3.85l, 31-1)) << '\n';
std::cout << std::hex << std::uppercase << std::setfill('0') << std::setw(9) << std::llrint(std::ldexp(4.65l, 31-2)) << '\n';
return 0;
for(std::uint16_t i=0xBC00; i<0xFC00; ++i)
{
half x = b2h(i), y = half_cast<half, std::round_to_nearest>(std::exp2(half_cast<double>(x)));
std::cout << x << " (" << std::hex << std::uppercase << std::setfill('0') << std::setw(4) << i << std::dec << ")\t= " << y << '\n';
if(y == 0.0_h)
return 0;
}
*/
std::vector<std::string> args(argv + 1, argv + argc);
std::unique_ptr<std::ostream> file, csv;
bool fast = false, rough = false;
for (auto &&arg : args)
{
if (arg == "-fast")
fast = true;
else if (arg == "-rough")
rough = true;
else if (arg.length() > 4 && arg.substr(arg.length() - 4) == ".csv")
csv.reset(new std::ofstream(arg));
else
file.reset(new std::ofstream(arg));
}
half_test test(file ? *file : std::cout, csv.get(), fast, rough);
test.performance_test();
timer time;
return test.test();
}
catch (const std::exception &e)
{
std::cerr << "ERROR: " << e.what() << '\n';
return -1;
}
/*
* This implementation is extracted from PyTorch:
* Repo: github.com/pytorch/pytorch
* File: torch/lib/TH/THHalf.c
* Commit ID: 92481b59d31199df57420d4b14912348cc780d1d
* Functions are made "static inline" for performance
*/
/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */
// Host functions for converting between FP32 and FP16 formats
static inline void TH_halfbits2float(unsigned short* src, float* res)
{
unsigned h = *src;
unsigned sign = ((h >> 15) & 1);
unsigned exponent = ((h >> 10) & 0x1f);
unsigned mantissa = ((h & 0x3ff) << 13);
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
exponent = 0xff;
}
else if (!exponent) { /* Denorm or Zero */
if (mantissa) {
unsigned int msb;
exponent = 0x71;
do {
msb = (mantissa & 0x400000);
mantissa <<= 1; /* normalize */
--exponent;
} while (!msb);
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
}
}
else {
exponent += 0x70;
}
*(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa);
}
static inline void TH_float2halfbits(float* src, unsigned short* dest)
{
unsigned x = *(unsigned*)src;
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned sign, exponent, mantissa;
// Get rid of +NaN/-NaN case first.
if (u > 0x7f800000) {
*dest = 0x7fffU;
return;
}
sign = ((x >> 16) & 0x8000);
// Get rid of +Inf/-Inf, +0/-0.
if (u > 0x477fefff) {
*dest = sign | 0x7c00U;
return;
}
if (u < 0x33000001) {
*dest = (sign | 0x0000);
return;
}
exponent = ((u >> 23) & 0xff);
mantissa = (u & 0x7fffff);
if (exponent > 0x70) {
shift = 13;
exponent -= 0x70;
}
else {
shift = 0x7e - exponent;
exponent = 0;
mantissa |= 0x800000;
}
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
// Round to nearest even.
remainder = (mantissa & lsb_m1);
mantissa >>= shift;
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
*dest = (sign | (exponent << 10) | mantissa);
}
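/*
A minimal round-trip sketch for the two TH helpers above (illustrative
only; th_half_roundtrip is not part of the extracted code): values that
are exactly representable in fp16 survive the conversion unchanged.
*/
static inline float th_half_roundtrip(float value)
{
unsigned short bits;
float restored;
TH_float2halfbits(&value, &bits); /* float -> half bits, e.g. 1.0f -> 0x3C00 */
TH_halfbits2float(&bits, &restored); /* half bits -> float, 0x3C00 -> 1.0f */
return restored;
}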
/*
* This implementation is extracted from Eigen:
* Repo: bitbucket.org/eigen/eigen
* File: Eigen/src/Core/arch/CUDA/Half.h
* Commit ID: 96e0f73a35de54f675d825bef5339b2f08e77eb4
*
* Removed a lot of redundant and cuda-specific code.
*/
#define EIGEN_STRONG_INLINE static inline
#define EIGEN_DEVICE_FUNC
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The conversion routines are Copyright (c) Fabian Giesen, 2016.
// The original license follows:
//
// Copyright (c) Fabian Giesen, 2016
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
#ifndef EIGEN_HALF_CUDA_H
#define EIGEN_HALF_CUDA_H
namespace Eigen {
namespace half_impl {
// Make our own __half definition that is similar to CUDA's.
struct __half {
EIGEN_DEVICE_FUNC __half() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
unsigned short x;
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
__half h;
h.x = x;
return h;
}
union FP32 {
unsigned int u;
float f;
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
return __float2half(ff);
#elif defined(EIGEN_HAS_FP16_C)
__half h;
h.x = _cvtss_sh(ff, 0);
return h;
#else
FP32 f; f.f = ff;
const FP32 f32infty = { 255 << 23 };
const FP32 f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
__half o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
f.u ^= sign;
// NOTE all the integer compares in this function can be safely
// compiled into signed compares since all operands are below
// 0x80000000. Important if you want fast straight SSE2 code
// (since there's no unsigned PCMPGTD).
if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
}
else { // (De)normalized number or zero
if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
// use a magic value to align our 10 mantissa bits at the bottom of
// the float. as long as FP addition is round-to-nearest-even this
// just works.
f.f += denorm_magic.f;
// and one integer subtract of the bias later, we have our final float!
o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
}
else {
unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
// update exponent, rounding bias part 1
f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
// rounding bias part 2
f.u += mant_odd;
// take the bits!
o.x = static_cast<unsigned short>(f.u >> 13);
}
}
o.x |= static_cast<unsigned short>(sign >> 16);
return o;
#endif
}
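/*
A worked example of the denorm_magic path above (illustrative only; this
helper is an assumption for illustration, not Eigen code): 2^-24 is the
smallest positive fp16 subnormal. Adding denorm_magic.f (which is 0.5f)
absorbs it into 0.5f's significand as the lowest mantissa bit, and
subtracting denorm_magic.u then leaves exactly 0x0001, the half bit
pattern of 2^-24.
*/
static inline bool float_to_half_denorm_example() {
const float tiny = 5.9604644775390625e-8f; // exactly 2^-24
return float_to_half_rtne(tiny).x == 0x0001;
}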
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x);
#else
const FP32 magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) { // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
}
else if (exp == 0) { // Zero/Denormal?
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}
o.u |= (h.x & 0x8000) << 16; // sign bit
return o.f;
#endif
}
} // end namespace half_impl
} // end namespace Eigen
#endif // EIGEN_HALF_CUDA_H
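/*
A minimal round-trip sketch for the Eigen routines above (illustrative
only; eigen_half_roundtrip is not part of the extracted code).
*/
static inline float eigen_half_roundtrip(float value)
{
Eigen::half_impl::__half h = Eigen::half_impl::float_to_half_rtne(value);
return Eigen::half_impl::half_to_float(h); // exact if value is representable in fp16
}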
#pragma once
#include <stdint.h>
/*
* This code snippet posted by user Phernost on
* https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
*
* compress and decompress methods are made "inline" for performance
*/
class Float16Compressor
{
union Bits
{
float f;
int32_t si;
uint32_t ui;
};
static int const shift = 13;
static int const shiftSign = 16;
static int32_t const infN = 0x7F800000; // flt32 infinity
static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32
static int32_t const minN = 0x38800000; // min flt16 normal as a flt32
static int32_t const signN = 0x80000000; // flt32 sign bit
static int32_t const infC = infN >> shift;
static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
static int32_t const maxC = maxN >> shift;
static int32_t const minC = minN >> shift;
static int32_t const signC = signN >> shiftSign; // flt16 sign bit
static int32_t const mulN = 0x52000000; // (1 << 23) / minN
static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift))
static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted
static int32_t const norC = 0x00400; // min flt32 normal down shifted
static int32_t const maxD = infC - maxC - 1;
static int32_t const minD = minC - subC - 1;
public:
inline static uint16_t compress(float value)
{
Bits v, s;
v.f = value;
uint32_t sign = v.si & signN;
v.si ^= sign;
sign >>= shiftSign; // logical shift
s.si = mulN;
s.si = s.f * v.f; // correct subnormals
v.si ^= (s.si ^ v.si) & -(minN > v.si);
v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
v.ui >>= shift; // logical shift
v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
return v.ui | sign;
}
inline static float decompress(uint16_t value)
{
Bits v;
v.ui = value;
int32_t sign = v.si & signC;
v.si ^= sign;
sign <<= shiftSign;
v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
Bits s;
s.si = mulC;
s.f *= v.si;
int32_t mask = -(norC > v.si);
v.si <<= shift;
v.si ^= (s.si ^ v.si) & mask;
v.si |= sign;
return v.f;
}
};
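/*
A minimal usage sketch for the compressor above (illustrative only):
fp16-representable values survive the round trip exactly, while values
above the fp16 range are mapped to the half infinity/NaN encodings by
the branch-free clamping in compress().
*/
inline float float16_roundtrip(float value)
{
uint16_t packed = Float16Compressor::compress(value); // e.g. 1.0f -> 0x3C00
return Float16Compressor::decompress(packed);
}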
/*
* This implementation is extracted from numpy:
* Repo: github.com/numpy/numpy
* File: numpy/core/src/npymath/halffloat.c
* Commit ID: 25c23f1d956104a072a95355ffaa7a38b53710b7
* Functions are made "static inline" for performance; non-conversion
* functions are removed and the generation of floating-point
* exceptions is disabled.
*/
#include <cstdint>
typedef uint16_t npy_uint16;
typedef uint32_t npy_uint32;
typedef uint64_t npy_uint64;
/*
* This chooses between 'ties to even' and 'ties away from zero'.
*/
#define NPY_HALF_ROUND_TIES_TO_EVEN 1
/*
* If these are 1, the conversions try to trigger underflow,
* overflow, and invalid exceptions in the FP system when needed.
*/
#define NPY_HALF_GENERATE_OVERFLOW 0
#define NPY_HALF_GENERATE_UNDERFLOW 0
#define NPY_HALF_GENERATE_INVALID 0
/*
********************************************************************
* BIT-LEVEL CONVERSIONS *
********************************************************************
*/
static inline npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f)
{
npy_uint32 f_exp, f_sig;
npy_uint16 h_sgn, h_exp, h_sig;
h_sgn = (npy_uint16)((f & 0x80000000u) >> 16);
f_exp = (f & 0x7f800000u);
/* Exponent overflow/NaN converts to signed inf/NaN */
if (f_exp >= 0x47800000u) {
if (f_exp == 0x7f800000u) {
/* Inf or NaN */
f_sig = (f & 0x007fffffu);
if (f_sig != 0) {
/* NaN - propagate the flag in the significand... */
npy_uint16 ret = (npy_uint16)(0x7c00u + (f_sig >> 13));
/* ...but make sure it stays a NaN */
if (ret == 0x7c00u) {
ret++;
}
return h_sgn + ret;
}
else {
/* signed inf */
return (npy_uint16)(h_sgn + 0x7c00u);
}
}
else {
/* overflow to signed inf */
#if NPY_HALF_GENERATE_OVERFLOW
npy_set_floatstatus_overflow();
#endif
return (npy_uint16)(h_sgn + 0x7c00u);
}
}
/* Exponent underflow converts to a subnormal half or signed zero */
if (f_exp <= 0x38000000u) {
/*
* Signed zeros, subnormal floats, and floats with small
* exponents all convert to signed zero halfs.
*/
if (f_exp < 0x33000000u) {
#if NPY_HALF_GENERATE_UNDERFLOW
/* If f != 0, it underflowed to 0 */
if ((f & 0x7fffffff) != 0) {
npy_set_floatstatus_underflow();
}
#endif
return h_sgn;
}
/* Make the subnormal significand */
f_exp >>= 23;
f_sig = (0x00800000u + (f & 0x007fffffu));
#if NPY_HALF_GENERATE_UNDERFLOW
/* If it's not exactly represented, it underflowed */
if ((f_sig&(((npy_uint32)1 << (126 - f_exp)) - 1)) != 0) {
npy_set_floatstatus_underflow();
}
#endif
f_sig >>= (113 - f_exp);
/* Handle rounding by adding 1 to the bit beyond half precision */
#if NPY_HALF_ROUND_TIES_TO_EVEN
/*
* If the last bit in the half significand is 0 (already even), and
* the remaining bit pattern is 1000...0, then we do not add one
* to the bit after the half significand. In all other cases, we do.
*/
if ((f_sig & 0x00003fffu) != 0x00001000u) {
f_sig += 0x00001000u;
}
#else
f_sig += 0x00001000u;
#endif
h_sig = (npy_uint16)(f_sig >> 13);
/*
* If the rounding causes a bit to spill into h_exp, it will
* increment h_exp from zero to one and h_sig will be zero.
* This is the correct result.
*/
return (npy_uint16)(h_sgn + h_sig);
}
/* Regular case with no overflow or underflow */
h_exp = (npy_uint16)((f_exp - 0x38000000u) >> 13);
/* Handle rounding by adding 1 to the bit beyond half precision */
f_sig = (f & 0x007fffffu);
#if NPY_HALF_ROUND_TIES_TO_EVEN
/*
* If the last bit in the half significand is 0 (already even), and
* the remaining bit pattern is 1000...0, then we do not add one
* to the bit after the half significand. In all other cases, we do.
*/
if ((f_sig & 0x00003fffu) != 0x00001000u) {
f_sig += 0x00001000u;
}
#else
f_sig += 0x00001000u;
#endif
h_sig = (npy_uint16)(f_sig >> 13);
/*
* If the rounding causes a bit to spill into h_exp, it will
* increment h_exp by one and h_sig will be zero. This is the
* correct result. h_exp may increment to 15, at greatest, in
* which case the result overflows to a signed inf.
*/
#if NPY_HALF_GENERATE_OVERFLOW
h_sig += h_exp;
if (h_sig == 0x7c00u) {
npy_set_floatstatus_overflow();
}
return h_sgn + h_sig;
#else
return h_sgn + h_exp + h_sig;
#endif
}
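/*
A worked ties-to-even example for the routine above (illustrative only):
0x3f801000 (1.00048828125f) lies exactly halfway between half 0x3c00
(1.0) and half 0x3c01 (1.0009765625). Its f_sig is 0x001000, so
(f_sig & 0x00003fffu) == 0x00001000u, the rounding increment is skipped,
and the tie is broken toward the even significand: the result is 0x3c00.
*/
static inline int npy_ties_to_even_example(void)
{
return npy_floatbits_to_halfbits(0x3f801000u) == 0x3c00u;
}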
static inline npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
{
npy_uint64 d_exp, d_sig;
npy_uint16 h_sgn, h_exp, h_sig;
h_sgn = (d & 0x8000000000000000ULL) >> 48;
d_exp = (d & 0x7ff0000000000000ULL);
/* Exponent overflow/NaN converts to signed inf/NaN */
if (d_exp >= 0x40f0000000000000ULL) {
if (d_exp == 0x7ff0000000000000ULL) {
/* Inf or NaN */
d_sig = (d & 0x000fffffffffffffULL);
if (d_sig != 0) {
/* NaN - propagate the flag in the significand... */
npy_uint16 ret = (npy_uint16)(0x7c00u + (d_sig >> 42));
/* ...but make sure it stays a NaN */
if (ret == 0x7c00u) {
ret++;
}
return h_sgn + ret;
}
else {
/* signed inf */
return h_sgn + 0x7c00u;
}
}
else {
/* overflow to signed inf */
#if NPY_HALF_GENERATE_OVERFLOW
npy_set_floatstatus_overflow();
#endif
return h_sgn + 0x7c00u;
}
}
/* Exponent underflow converts to subnormal half or signed zero */
if (d_exp <= 0x3f00000000000000ULL) {
/*
* Signed zeros, subnormal floats, and floats with small
* exponents all convert to signed zero halfs.
*/
if (d_exp < 0x3e60000000000000ULL) {
#if NPY_HALF_GENERATE_UNDERFLOW
/* If d != 0, it underflowed to 0 */
if ((d & 0x7fffffffffffffffULL) != 0) {
npy_set_floatstatus_underflow();
}
#endif
return h_sgn;
}
/* Make the subnormal significand */
d_exp >>= 52;
d_sig = (0x0010000000000000ULL + (d & 0x000fffffffffffffULL));
#if NPY_HALF_GENERATE_UNDERFLOW
/* If it's not exactly represented, it underflowed */
if ((d_sig&(((npy_uint64)1 << (1051 - d_exp)) - 1)) != 0) {
npy_set_floatstatus_underflow();
}
#endif
d_sig >>= (1009 - d_exp);
/* Handle rounding by adding 1 to the bit beyond half precision */
#if NPY_HALF_ROUND_TIES_TO_EVEN
/*
* If the last bit in the half significand is 0 (already even), and
* the remaining bit pattern is 1000...0, then we do not add one
* to the bit after the half significand. In all other cases, we do.
*/
if ((d_sig & 0x000007ffffffffffULL) != 0x0000020000000000ULL) {
d_sig += 0x0000020000000000ULL;
}
#else
d_sig += 0x0000020000000000ULL;
#endif
h_sig = (npy_uint16)(d_sig >> 42);
/*
* If the rounding causes a bit to spill into h_exp, it will
* increment h_exp from zero to one and h_sig will be zero.
* This is the correct result.
*/
return h_sgn + h_sig;
}
/* Regular case with no overflow or underflow */
h_exp = (npy_uint16)((d_exp - 0x3f00000000000000ULL) >> 42);
/* Handle rounding by adding 1 to the bit beyond half precision */
d_sig = (d & 0x000fffffffffffffULL);
#if NPY_HALF_ROUND_TIES_TO_EVEN
/*
* If the last bit in the half significand is 0 (already even), and
* the remaining bit pattern is 1000...0, then we do not add one
* to the bit after the half significand. In all other cases, we do.
*/
if ((d_sig & 0x000007ffffffffffULL) != 0x0000020000000000ULL) {
d_sig += 0x0000020000000000ULL;
}
#else
d_sig += 0x0000020000000000ULL;
#endif
h_sig = (npy_uint16)(d_sig >> 42);
/*
* If the rounding causes a bit to spill into h_exp, it will
* increment h_exp by one and h_sig will be zero. This is the
* correct result. h_exp may increment to 15, at greatest, in
* which case the result overflows to a signed inf.
*/
#if NPY_HALF_GENERATE_OVERFLOW
h_sig += h_exp;
if (h_sig == 0x7c00u) {
npy_set_floatstatus_overflow();
}
return h_sgn + h_sig;
#else
return h_sgn + h_exp + h_sig;
#endif
}
static inline npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h)
{
npy_uint16 h_exp, h_sig;
npy_uint32 f_sgn, f_exp, f_sig;
h_exp = (h & 0x7c00u);
f_sgn = ((npy_uint32)h & 0x8000u) << 16;
switch (h_exp) {
case 0x0000u: /* 0 or subnormal */
h_sig = (h & 0x03ffu);
/* Signed zero */
if (h_sig == 0) {
return f_sgn;
}
/* Subnormal */
h_sig <<= 1;
while ((h_sig & 0x0400u) == 0) {
h_sig <<= 1;
h_exp++;
}
f_exp = ((npy_uint32)(127 - 15 - h_exp)) << 23;
f_sig = ((npy_uint32)(h_sig & 0x03ffu)) << 13;
return f_sgn + f_exp + f_sig;
case 0x7c00u: /* inf or NaN */
/* All-ones exponent and a copy of the significand */
return f_sgn + 0x7f800000u + (((npy_uint32)(h & 0x03ffu)) << 13);
default: /* normalized */
/* Just need to adjust the exponent and shift */
return f_sgn + (((npy_uint32)(h & 0x7fffu) + 0x1c000u) << 13);
}
}
static inline npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h)
{
npy_uint16 h_exp, h_sig;
npy_uint64 d_sgn, d_exp, d_sig;
h_exp = (h & 0x7c00u);
d_sgn = ((npy_uint64)h & 0x8000u) << 48;
switch (h_exp) {
case 0x0000u: /* 0 or subnormal */
h_sig = (h & 0x03ffu);
/* Signed zero */
if (h_sig == 0) {
return d_sgn;
}
/* Subnormal */
h_sig <<= 1;
while ((h_sig & 0x0400u) == 0) {
h_sig <<= 1;
h_exp++;
}
d_exp = ((npy_uint64)(1023 - 15 - h_exp)) << 52;
d_sig = ((npy_uint64)(h_sig & 0x03ffu)) << 42;
return d_sgn + d_exp + d_sig;
case 0x7c00u: /* inf or NaN */
/* All-ones exponent and a copy of the significand */
return d_sgn + 0x7ff0000000000000ULL +
(((npy_uint64)(h & 0x03ffu)) << 42);
default: /* normalized */
/* Just need to adjust the exponent and shift */
return d_sgn + (((npy_uint64)(h & 0x7fffu) + 0xfc000u) << 42);
}
}
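/*
A small bit-level round trip through the numpy routines above
(illustrative only): 1.0f is 0x3f800000 in fp32; re-biasing its exponent
field from 0x7f to 0x0f gives the half pattern 0x3c00, and converting
back restores 0x3f800000 exactly.
*/
static inline int npy_half_roundtrip_example(void)
{
npy_uint16 hbits = npy_floatbits_to_halfbits(0x3f800000u); /* -> 0x3c00 */
npy_uint32 fbits = npy_halfbits_to_floatbits(hbits); /* -> 0x3f800000 */
return hbits == 0x3c00u && fbits == 0x3f800000u;
}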
@@ -60,7 +60,7 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
//CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * inter = NewTensor(output);
@@ -564,8 +564,6 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
......
@@ -17,11 +17,13 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16/int/int8 added
*/
#include "../XTensor.h"
#include "../core/math/Clip.h"
#include "TClip.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -31,6 +33,98 @@ Set every entry to its clip value.
*/
bool TestClip1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Clip function */
_Clip(a, b, -1.0, 1.0);
_ClipMe(aMe, -1.0, 1.0);
bUser = Clip(*a, -1.0, 1.0);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Clip function */
_Clip(aGPU, bGPU, -1.0, 1.0);
_ClipMe(aMeGPU, -1.0, 1.0);
bUserGPU = Clip(*aGPU, -1.0, 1.0);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: float16 test Clip function.
Set every entry to its clip value.
*/
bool TestClip2()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
@@ -46,30 +140,91 @@ bool TestClip1()
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor aMeHalfGPU;
XTensor bUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
aMeHalfGPU = ConvertDataType(*aMeGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
/* call clip function */
_Clip(&aHalfGPU, &bHalfGPU, -1.0, 1.0);
_ClipMe(&aMeHalfGPU, -1.0, 1.0);
bUserHalfGPU = Clip(aHalfGPU, -1.0, 1.0);
/* convert data type from float16 to float */
_ConvertDataType(&bHalfGPU, bGPU);
_ConvertDataType(&aMeHalfGPU, aMeGPU);
bUserGPU = ConvertDataType(bUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: int32 test Clip function.
Set every entry to its clip value.
*/
bool TestClip3()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
@@ -81,24 +236,118 @@ bool TestClip1()
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create int32 tensor */
XTensor aInt32GPU;
XTensor bInt32GPU;
XTensor aMeInt32GPU;
XTensor bUserInt32GPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to int32 */
aInt32GPU = ConvertDataType(*aGPU, X_INT);
aMeInt32GPU = ConvertDataType(*aMeGPU, X_INT);
bInt32GPU = ConvertDataType(*bGPU, X_INT);
/* call clip function */
_Clip(&aInt32GPU, &bInt32GPU, -1.0, 1.0);
_ClipMe(&aMeInt32GPU, -1.0, 1.0);
bUserInt32GPU = Clip(aInt32GPU, -1.0, 1.0);
/* convert data type from int32 to float */
_ConvertDataType(&bInt32GPU, bGPU);
_ConvertDataType(&aMeInt32GPU, aMeGPU);
bUserGPU = ConvertDataType(bUserInt32GPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: int8 test Clip function.
Set every entry to its clip value.
*/
bool TestClip4()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create int8 tensor */
XTensor aInt8GPU;
XTensor bInt8GPU;
XTensor aMeInt8GPU;
XTensor bUserInt8GPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to int8 */
aInt8GPU = ConvertDataType(*aGPU, X_INT8);
aMeInt8GPU = ConvertDataType(*aMeGPU, X_INT8);
bInt8GPU = ConvertDataType(*bGPU, X_INT8);
/* call clip function */
_Clip(&aInt8GPU, &bInt8GPU, -1.0, 1.0);
_ClipMe(&aMeInt8GPU, -1.0, 1.0);
bUserInt8GPU = Clip(aInt8GPU, -1.0, 1.0);
/* convert data type from int8 to float */
_ConvertDataType(&bInt8GPU, bGPU);
_ConvertDataType(&aMeInt8GPU, aMeGPU);
bUserGPU = ConvertDataType(bUserInt8GPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
@@ -107,15 +356,13 @@ bool TestClip1()
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
@@ -124,33 +371,63 @@ TODO!!
/* test for Clip Function */
bool TestClip()
{
XPRINT(0, stdout, "[TEST Clip] set every entry to its clip value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestClip1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestClip2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestClip3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestClip4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
@@ -17,6 +17,7 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 int8 added
*/
#include "TConvertDataType.h"
@@ -26,77 +27,77 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test ConvertDataType function.
In this case, the float32 data type is converted to int32 data type.
*/
bool TestConvertDataType1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, 2.0F},
{0.5F, 4.0F},
{5.0F, 6.0F} };
int answer[3][2] = { {1, 2},
{0, 4},
{5, 6} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize, X_INT);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetZeroAll();
/* call ConvertDataType function */
_ConvertDataType(a, b);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call ConvertDataType function */
_ConvertDataType(aGPU, bGPU);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
@@ -106,78 +107,78 @@ In this case, the int32 data type is converted to float32 data type.
*/
bool TestConvertDataType2()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
int aData[3][2] = { {1, 2},
{0, 4},
{5, 6} };
DTYPE answer[3][2] = { {1.0F, 2.0F},
{0.0F, 4.0F},
{5.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize, X_INT);
XTensor * b = NewTensor(aOrder, aDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetZeroAll();
/* call ConvertDataType function */
_ConvertDataType(a, b);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call ConvertDataType function */
_ConvertDataType(aGPU, bGPU);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: test ConvertDataType function.
In this case, the float32 data type is converted to float16 data type.
*/
bool TestConvertDataType3()
{
@@ -290,6 +291,130 @@ bool TestConvertDataType3()
#endif // USE_CUDA
}
/*
case 4: test ConvertDataType function.
In this case, the float32 data type is converted to int8 data type.
*/
bool TestConvertDataType4()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, 2.0F},
{0.5F, 4.0F},
{5.0F, 6.0F} };
int answer[3][2] = { {1, 2},
{0, 4},
{5, 6} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT8, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * dGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call ConvertDataType function */
_ConvertDataType(aGPU, bGPU);
_ConvertDataType(bGPU, cGPU);
_ConvertDataType(cGPU, dGPU);
/* check results */
gpuTest = dGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete dGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: test ConvertDataType function.
In this case, the int data type is converted to int8 data type.
*/
bool TestConvertDataType5()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
int aData[3][2] = { {1, 2},
{0, 4},
{5, 6} };
int answer[3][2] = { {1, 2},
{0, 4},
{5, 6} };
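/* every value here fits in the int8 range [-128, 127], so the
int -> int8 -> int round trip is expected to be lossless */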
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT8, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call ConvertDataType function */
_ConvertDataType(aGPU, bGPU);
_ConvertDataType(bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -298,53 +423,73 @@ TODO!!
/* test for ConvertDataType Function */
bool TestConvertDataType()
{
XPRINT(0, stdout, "[TEST ConvertDataType] convert data type \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestConvertDataType1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestConvertDataType2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestConvertDataType3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestConvertDataType4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestConvertDataType5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -23,6 +23,7 @@
#include "TCrossEntropy.h"
#include "../loss/CrossEntropy.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -406,6 +407,329 @@ bool TestCrossEntropy4()
#endif // USE_CUDA
}
/*
case 5: float16 test CrossEntropy function.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output.
*/
bool TestCrossEntropy5()
{
/* a tensor of size (1, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE outputData[4] = {0.25F, 0.25F, 0.25F, 0.25F};
DTYPE goldData[4] = {0.5F, 0.5F, 0.0F, 0.0F};
DTYPE answer = 1.3863F;
DTYPE error1;
DTYPE error2;
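/* sanity check on the expected loss: the two zero-weighted terms vanish, so
loss = -(0.5 * log(0.25) + 0.5 * log(0.25)) = log(4) = 1.3863 */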
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * lossGPU = NewTensor1D(1, X_FLOAT, 0);
/* create float16 tensor */
XTensor outputHalfGPU;
XTensor goldHalfGPU;
XTensor lossHalfGPU;
/* Initialize variables */
outputGPU->SetData(outputData, unitNum);
goldGPU->SetData(goldData, unitNum);
/* convert data type from float to float16 */
outputHalfGPU = ConvertDataType(*outputGPU, X_FLOAT16);
goldHalfGPU = ConvertDataType(*goldGPU, X_FLOAT16);
lossHalfGPU = ConvertDataType(*lossGPU, X_FLOAT16);
/* call CrossEntropy function */
_CrossEntropyFast(&outputHalfGPU, &goldHalfGPU, &lossHalfGPU);
error2 = _CrossEntropy(&outputHalfGPU, &goldHalfGPU, REDUCE_SUM);
/* convert data type from float16 to float */
_ConvertDataType(&lossHalfGPU, lossGPU);
error1 = lossGPU->Get1D(0);
/* check results */
gpuTest = (fabs(error1 - answer) < 1e-3F &&
fabs(error2 - answer) < 1e-3F);
/* destroy variables */
delete outputGPU;
delete goldGPU;
delete lossGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: float16 test CrossEntropy function.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output.
*/
bool TestCrossEntropy6()
{
/* a tensor of size (4, 10) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 4;
dimSize[1] = 10;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE outputData[4][10] = { {0.5F, 2.6F, 0.3F, 1.7F, 0.6F,
0.1F, 0.7F, 1.3F, 0.4F, 0.6F},
{0.5F, 1.6F, 0.2F, 1.1F, 0.3F,
0.8F, 2.2F, 0.1F, 0.1F, 0.8F},
{0.2F, 0.5F, 1.1F, 1.2F, 0.6F,
0.1F, 0.2F, 0.7F, 0.5F, 0.7F},
{0.2F, 1.7F, 0.6F, 1.5F, 0.8F,
0.1F, 0.8F, 0.1F, 0.6F, 0.2F} };
DTYPE answer1 = 4.3275F;
DTYPE answer2 = 1.0818F;
DTYPE error1;
DTYPE error2;
DTYPE error3;
DTYPE error4;
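/* sanity check: with one-hot gold rows the loss reduces to -log(y) at the
gold positions, i.e. -(log(0.6) + log(0.1) + log(1.1) + log(0.2)) = 4.3275
for REDUCE_SUM, and 4.3275 / 4 = 1.0818 for REDUCE_MEAN over the 4 rows */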
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensor */
XTensor outputHalfGPU;
XTensor goldHalfGPU;
/* Initialize variables */
outputGPU->SetData(outputData, unitNum);
goldGPU->SetZeroAll();
goldGPU->Set2D(1.0F, 0, 9);
goldGPU->Set2D(1.0F, 1, 7);
goldGPU->Set2D(1.0F, 2, 2);
goldGPU->Set2D(1.0F, 3, 9);
/* convert data type from float to float16 */
outputHalfGPU = ConvertDataType(*outputGPU, X_FLOAT16);
goldHalfGPU = ConvertDataType(*goldGPU, X_FLOAT16);
/* call CrossEntropy function */
error1 = _CrossEntropy(&outputHalfGPU, &goldHalfGPU, REDUCE_SUM);
error2 = _CrossEntropy(&outputHalfGPU, &goldHalfGPU, REDUCE_MEAN);
error3 = _CrossEntropyFast(&outputHalfGPU, &goldHalfGPU, REDUCE_SUM);
error4 = _CrossEntropyFast(&outputHalfGPU, &goldHalfGPU, REDUCE_MEAN);
/* check results */
gpuTest = (fabs(error1 - answer1) < 1e-2F &&
fabs(error2 - answer2) < 1e-2F &&
fabs(error3 - answer1) < 1e-2F &&
fabs(error4 - answer2) < 1e-2F);
/* destroy variables */
delete outputGPU;
delete goldGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 7: float16 test CrossEntropy function.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output.
In this case, the cross entropy is computed with a weight vector.
*/
bool TestCrossEntropy7()
{
/* a output tensor of size (4, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 4;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* a weight tensor of size (4) */
int wOrder = 1;
int * wDimSize = new int[wOrder];
wDimSize[0] = 4;
int wUnitNum = 1;
for (int i = 0; i < wOrder; i++)
wUnitNum *= wDimSize[i];
DTYPE outputData[4][4] = { {0.3F, 0.2F, 0.3F, 0.2F},
{0.1F, 0.4F, 0.2F, 0.3F},
{0.7F, 0.1F, 0.1F, 0.1F},
{0.5F, 0.1F, 0.2F, 0.2F} };
DTYPE weightData[4] = {2.0F, 1.0F, 5.0F, 0.0F};
DTYPE answer[4] = {2.4079F, 0.9163F, 11.5129F, 0.0F};
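/* sanity check: each row contributes -weight * log(y) at its one-hot gold
position: -2*log(0.3) = 2.4079, -1*log(0.4) = 0.9163, -5*log(0.1) = 11.5129
and -0*log(0.2) = 0, matching the expected loss vector above */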
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * lossGPU = NewTensor1D(4, X_FLOAT, 0);
XTensor * weightGPU = NewTensor(wOrder, wDimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensor */
XTensor outputHalfGPU;
XTensor goldHalfGPU;
XTensor lossHalfGPU;
XTensor weightHalfGPU;
/* Initialize variables */
outputGPU->SetData(outputData, unitNum);
weightGPU->SetData(weightData, wUnitNum);
goldGPU->SetZeroAll();
goldGPU->Set2D(1.0F, 0, 0);
goldGPU->Set2D(1.0F, 1, 1);
goldGPU->Set2D(1.0F, 2, 2);
goldGPU->Set2D(1.0F, 3, 3);
/* convert data type from float to float16 */
outputHalfGPU = ConvertDataType(*outputGPU, X_FLOAT16);
goldHalfGPU = ConvertDataType(*goldGPU, X_FLOAT16);
lossHalfGPU = ConvertDataType(*lossGPU, X_FLOAT16);
weightHalfGPU = ConvertDataType(*weightGPU, X_FLOAT16);
/* call CrossEntropy function */
_CrossEntropyFast(&outputHalfGPU, &goldHalfGPU, &lossHalfGPU, &weightHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&lossHalfGPU, lossGPU);
/* check results */
gpuTest = lossGPU->CheckData(answer, 4, 1e-2F);
/* destroy variables */
delete outputGPU;
delete goldGPU;
delete lossGPU;
delete weightGPU;
delete[] dimSize;
delete[] wDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
delete[] wDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 8: float16 test CrossEntropy function.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output.
*/
bool TestCrossEntropy8()
{
/* a tensor of size (10, 1) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 10;
dimSize[1] = 1;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
DTYPE answer = 0.0F;
DTYPE error;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensor */
XTensor outputHalfGPU;
XTensor goldHalfGPU;
/* Initialize variables */
outputGPU->SetZeroAll();
goldGPU->SetZeroAll();
_ScaleAndShiftMe(outputGPU, 1, 1);
_ScaleAndShiftMe(goldGPU, 1, 2);
/* convert data type from float to float16 */
outputHalfGPU = ConvertDataType(*outputGPU, X_FLOAT16);
goldHalfGPU = ConvertDataType(*goldGPU, X_FLOAT16);
/* call CrossEntropy function */
error = _CrossEntropyFast(&outputHalfGPU, &goldHalfGPU);
/* check results */
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete outputGPU;
delete goldGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -453,6 +777,42 @@ bool TestCrossEntropy() ...@@ -453,6 +777,42 @@ bool TestCrossEntropy()
else else
XPRINT(0, stdout, ">> case 4 passed!\n"); XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestCrossEntropy5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestCrossEntropy6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestCrossEntropy7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* case 8 test */
caseFlag = TestCrossEntropy8();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 8 failed!\n");
}
else
XPRINT(0, stdout, ">> case 8 passed!\n");
///* other cases test */
///*
//TODO!!
...
...@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TDiv.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -30,6 +32,132 @@ In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/
bool TestDiv1()
{
/* a source tensor of size (2, 2) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 2;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][2] = { {0.0F, 1.0F},
{2.0F, 3.0F} };
DTYPE sData2[2][2] = { {1.0F, 1.0F},
{4.0F, 9.0F} };
DTYPE answer[2][2] = { {0.0F, 1.0F},
{0.5F, 0.3333F} };
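/* the expected values are just the element-wise quotients:
0/1 = 0, 1/1 = 1, 2/4 = 0.5, 3/9 = 0.3333 (alpha = 0 leaves c untouched) */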
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * tMe = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
tMe->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call Div function */
_Div(s1, s2, t, 0, 0);
_DivMe(tMe, s2, 0, 0);
tUser = Div(*s1, *s2, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum, 1e-4F) &&
tMe->CheckData(answer, tUnitNum, 1e-4F) &&
tUser.CheckData(answer, tUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call Div function */
_Div(sGPU1, sGPU2, tGPU, 0, 0);
_DivMe(tMeGPU, sGPU2, 0, 0);
tUserGPU = Div(*sGPU1, *sGPU2, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F) &&
tMeGPU->CheckData(answer, tUnitNum, 1e-4F) &&
tUserGPU.CheckData(answer, tUnitNum, 1e-4F);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete tMe;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete tMeGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete tMe;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: float16 element-wise division of two tensors
c(i) = a(i)/b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/
bool TestDiv2()
{
/* a source tensor of size (2, 2) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
...@@ -70,29 +198,6 @@ bool TestDiv1()
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * tMe = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
tMe->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call Div function */
_Div(s1, s2, t, 0, 0);
_DivMe(tMe, s2, 0, 0);
tUser = Div(*s1, *s2, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum, 1e-4F) &&
tMe->CheckData(answer, tUnitNum, 1e-4F) &&
tUser.CheckData(answer, tUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
...@@ -104,27 +209,41 @@ bool TestDiv1()
XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensor */
XTensor sHalfGPU1;
XTensor sHalfGPU2;
XTensor tHalfGPU;
XTensor tMeHalfGPU;
XTensor tUserHalfGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU1 = ConvertDataType(*sGPU1, X_FLOAT16);
sHalfGPU2 = ConvertDataType(*sGPU2, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
tMeHalfGPU = ConvertDataType(*tMeGPU, X_FLOAT16);
/* call div function */
_Div(&sHalfGPU1, &sHalfGPU2, &tHalfGPU, 0, 0);
_DivMe(&tMeHalfGPU, &sHalfGPU2, 0, 0);
tUserHalfGPU = Div(sHalfGPU1, sHalfGPU2, 0);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
_ConvertDataType(&tMeHalfGPU, tMeGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F) &&
tMeGPU->CheckData(answer, tUnitNum, 1e-4F) &&
tUserGPU.CheckData(answer, tUnitNum, 1e-4F);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete tMe;
delete sGPU1;
delete sGPU2;
delete tGPU;
...@@ -136,10 +255,6 @@ bool TestDiv1()
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete tMe;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
...@@ -148,6 +263,7 @@ bool TestDiv1()
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -156,33 +272,43 @@ TODO!!
/* test for Div Function */
bool TestDiv()
{
XPRINT(0, stdout, "[TEST Div] element-wise division of two tensors \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestDiv1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDiv2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -17,11 +17,13 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-14
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
*/
#include "TDivDim.h"
#include "../core/arithmetic/DivDim.h"
#include "../XTensor.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -127,8 +129,8 @@ bool TestDivDim1()
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
...@@ -241,8 +243,8 @@ bool TestDivDim2()
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
...@@ -251,6 +253,207 @@ bool TestDivDim2()
#endif // USE_CUDA
}
/*
case 3: float16 tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting.
In this case, (2, 4) / (2) = (2, 4), n = 0, alpha = 0.0.
*/
bool TestDivDim3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
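/* with n = 0, b is broadcast over the rows of a: row 0 is divided by
b[0] = 1.0 and row 1 by b[1] = -1.0, which flips the sign of row 1 */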
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call DivDim function */
_DivDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_DivDim(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = DivDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: float16 tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting.
In this case, (2, 4) / (2, 2) = (2, 4), n = 1.
*/
bool TestDivDim4()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {0.0F, -1.0F, -2.0F, 3.0F},
{4.0F, -5.0F, -6.0F, 7.0F} };
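/* the 4 units of the (2, 2) tensor b match the size of dimension 1 of a,
so every row of a is divided element-wise by {1, -1, -1, 1} */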
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call DivDim function */
_DivDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_DivDim(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = DivDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -280,6 +483,24 @@ bool TestDivDim()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestDivDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestDivDim4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
...
...@@ -20,6 +20,8 @@
*/
#include "TGather.h"
#include "../core/movement/Spread.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -140,6 +142,425 @@ bool TestGather1()
#endif // USE_CUDA
}
/*
case 2: gather indexed sub-tensors
In this case, (3, 3) -> (2, 3), dim = 0,
srcIndex = [0, 2]
*/
bool TestGather2()
{
/* an input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{1.0F, 2.0F, 4.0F} };
DTYPE answer[2][3] = { {0.0F, -1.0F, 2.0F},
{1.0F, 2.0F, 4.0F} };
int dim = 0;
int indexSize = 2;
int srcIndex[2] = {0, 2};
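/* gathering along dimension 0 with srcIndex = {0, 2} simply copies rows 0
and 2 of the source into the (2, 3) target */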
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
index->SetData(srcIndex, indexSize);
/* call Gather function */
_Gather(s, t, index);
tUser = Gather(*s, *index);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) &&
tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
indexGPU->SetData(srcIndex, indexSize);
/* call Gather function */
_Gather(sGPU, tGPU, indexGPU);
tUserGPU = Gather(*sGPU, *indexGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete index;
delete sGPU;
delete tGPU;
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete index;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: float16 gather indexed sub-tensors
In this case, (3, 3) -> (2, 3), dim = 0,
srcIndex = [0, 2]
*/
bool TestGather3()
{
/* an input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (2, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{1.0F, 2.0F, 4.0F} };
DTYPE answer[2][3] = { {0.0F, -1.0F, 2.0F},
{1.0F, 2.0F, 4.0F} };
int dim = 0;
int indexSize = 2;
int srcIndex[2] = { 0, 2 };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor tHalfGPU;
XTensor tUserHalfGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
indexGPU->SetData(srcIndex, indexSize);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call gather function */
_Gather(&sHalfGPU, &tHalfGPU, indexGPU);
tUserHalfGPU = Gather(sHalfGPU, *indexGPU);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: gather indexed sub-tensors backward
In this case, (3, 3) -> (3, 3), dim = 0,
srcIndex = [0, 1, 2]
*/
bool TestGather4()
{
/* an input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (3) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 3;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{1.0F, 2.0F, 4.0F} };
DTYPE tData[3][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE answer[3][3] = { {1.0F, 0.0F, 3.0F},
{3.0F, 2.0F, 4.0F},
{2.0F, 3.0F, 5.0F} };
int dim = 0;
int indexSize = 3;
int srcIndex[3] = {0, 1, 2};
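/* _SpreadForGather is the backward of gather: it accumulates the rows of t
into s at the indexed positions. Since every row is indexed exactly once
and t is all ones, the expected result is simply sData + 1 in every cell */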
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
indexGPU->SetData(srcIndex, indexSize);
/* call gather function */
_SpreadForGather(sGPU, tGPU, indexGPU);
/* check results */
gpuTest = sGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: float16 gather indexed sub-tensors backward
In this case, (3, 3) -> (3, 3), dim = 0,
srcIndex = [0, 1, 2]
*/
bool TestGather5()
{
/* an input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 3) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (3) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 3;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{1.0F, 2.0F, 4.0F} };
DTYPE tData[3][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE answer[3][3] = { {1.0F, 0.0F, 3.0F},
{3.0F, 2.0F, 4.0F},
{2.0F, 3.0F, 5.0F} };
int dim = 0;
int indexSize = 3;
int srcIndex[3] = {0, 1, 2};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* create float16 tensors */
XTensor sHalfGPU;
XTensor tHalfGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
indexGPU->SetData(srcIndex, indexSize);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call SpreadForGather function */
_SpreadForGather(&sHalfGPU, &tHalfGPU, indexGPU);
/* convert data type from float16 to float */
_ConvertDataType(&sHalfGPU, sGPU);
/* check results */
gpuTest = sGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -159,6 +580,42 @@ bool TestGather() ...@@ -159,6 +580,42 @@ bool TestGather()
} }
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestGather2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestGather3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestGather4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestGather5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/*
...
...@@ -23,6 +23,7 @@
#define __TEST_GATHER_H__
#include "../core/movement/Gather.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...@@ -17,10 +17,12 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
*/
#include "../XTensor.h"
#include "THardTanH.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -32,6 +34,192 @@ y = 1 if x > 1
*/
bool TestHardTanH1()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE answer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
y->SetZeroAll();
/* call hardtanh function */
_HardTanH(x, y);
yUser = HardTanH(*x);
/* check results */
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* Initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
/* call hardtanh function */
_HardTanH(xGPU, yGPU);
yUserGPU = HardTanH(*xGPU);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) &&
yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: test backward computation of HardTanH function.
dE/dx = dE/dy * dy/dx
hard tanh: y = 1 if x > 1
x if -1 <= x <= 1
-1 if x< -1
and dy/dx = 1 if -1 <= x <= 1
0 otherwise
In this case, lossName=SQUAREDERROR.
*/
bool TestHardTanH2()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE yAnswer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
DTYPE dedxAnswer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, 0.0F, -0.0F} };
DTYPE dedyData[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, -2.0F, 0.0F} };
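/* sanity check: dy/dx is 1 only where -1 <= x <= 1, so dedx keeps dedy at
positions (0,0), (0,1) and (1,2) and zeroes the saturated positions */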
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
y->SetZeroAll();
dedx->SetZeroAll();
dedy->SetData(dedyData, unitNum);
/* call HardTanH function */
_HardTanH(x, y);
/* call HardTanHBackward function */
_HardTanHBackward(y, x, dedy, dedx);
/* check results */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) &&
dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetData(dedyData, unitNum);
/* call HardTanH function */
_HardTanH(xGPU, yGPU);
/* call hardtanhbackward function */
_HardTanHBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check results */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: float16 test HardTanH function.
y = 1 if x > 1
x if -1 <= x <= 1
-1 if x < -1
*/
bool TestHardTanH3()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
...@@ -50,22 +238,6 @@ bool TestHardTanH1()
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
y->SetZeroAll();
/* call hardtanh function */
_HardTanH(x, y);
yUser = HardTanH(*x);
/* check results */
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
...@@ -75,20 +247,32 @@ bool TestHardTanH1()
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* create float16 tensor */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor yUserHalfGPU;
/* Initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
/* call hardtanh function */
_HardTanH(&xHalfGPU, &yHalfGPU);
yUserHalfGPU = HardTanH(xHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
yUserGPU = ConvertDataType(yUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) &&
yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete xGPU;
delete yGPU;
delete[] dimSize;
...@@ -96,8 +280,6 @@ bool TestHardTanH1()
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete[] dimSize;
return cpuTest;
...@@ -105,7 +287,7 @@ bool TestHardTanH1()
}
/*
case 4: float16 test backward computation of HardTanH function.
dE/dx = dE/dy * dy/dx
hard tanh: y = 1 if x > 1
x if -1 <= x <= 1
...@@ -115,7 +297,7 @@ hard tanh: y = 1 if x > 1
0 otherwise
In this case, lossName=SQUAREDERROR.
*/
bool TestHardTanH4()
{
/* a tensor of size (2, 3) */
int order = 2;
...@@ -127,71 +309,63 @@ bool TestHardTanH2()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE yAnswer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
DTYPE dedxAnswer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, 0.0F, -0.0F} };
DTYPE dedyData[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, -2.0F, 0.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
y->SetZeroAll();
dedx->SetZeroAll();
dedy->SetData(dedyData, unitNum);
/* call HardTanH function */
_HardTanH(x, y);
/* call HardTanHBackward function */
_HardTanHBackward(y, x, dedy, dedx);
/* check results */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) &&
dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor goldHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetData(dedyData, unitNum);
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
/* call hardtanh function */
_HardTanH(&xHalfGPU, &yHalfGPU);
/* call hardtanhbackward function */
_HardTanHBackward(&yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedyHalfGPU, dedyGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check results */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete dedxGPU;
...@@ -201,16 +375,13 @@ bool TestHardTanH2()
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -219,43 +390,63 @@ TODO!!
/* test for HardTanH Function */
bool TestHardTanH()
{
XPRINT(0, stdout, "[TEST HARDTANH] test hardtanh and its backward computation \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestHardTanH1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestHardTanH2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestHardTanH3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestHardTanH4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -17,10 +17,12 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16 added
*/
#include "../XUtility.h"
#include "TLogSoftmax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -62,7 +64,7 @@ bool TestLogSoftmax1()
yUser = LogSoftmax(*x, 1);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
...@@ -82,7 +84,7 @@ bool TestLogSoftmax1()
yUserGPU = LogSoftmax(*xGPU, 1);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
...@@ -206,7 +208,7 @@ bool TestLogSoftmax2()
#endif // USE_CUDA
}
/*
case 3: test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
...@@ -248,12 +250,12 @@ bool TestLogSoftmax3()
/* call LogSoftmax function */
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
...@@ -279,10 +281,10 @@ bool TestLogSoftmax3()
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-3F);
/* destroy variables */
delete x;
...@@ -311,6 +313,256 @@ bool TestLogSoftmax3()
#endif // USE_CUDA
}
/*
case 4: float16 test LogSoftmax function.
LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
*/
bool TestLogSoftmax4()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5F, 0.7F, 1.4F} };
DTYPE answer[2][3] = { {-2.4076F, -1.4076F, -0.4076F},
{-1.5435F, -1.3435F, -0.6435F} };
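/* sanity check: for the first row, log(e^0 + e^1 + e^2) = 2.4076, so
y = x - 2.4076; the second row is shifted by its own log-sum-exp, 2.0435 */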
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor yUserHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
/* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
yUserHalfGPU = LogSoftmax(xHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
yUserGPU = ConvertDataType(yUserHalfGPU, X_FLOAT);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-2F) &&
yUserGPU.CheckData(answer, unitNum, 1e-2F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: float16 test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
In this case, LossName=CROSSENTROPY.
*/
bool TestLogSoftmax5()
{
/* a tensor of size (1, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[1][3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[1][3] = {0.5F, 0.8F, 1.5F};
DTYPE yAnswer[1][3] = {-2.4076F, -1.4076F, -0.4076F};
DTYPE dedxAnswer[1][3] = {-0.4100F, -0.5553F, -0.8348F};
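/* numeric check (assuming the cross-entropy backward reduces to
dE/dx = exp(y) - gold): exp(yAnswer) = {0.0900, 0.2447, 0.6652}, and
subtracting gData gives exactly dedxAnswer */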
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor gHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
gHalfGPU = ConvertDataType(*gGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
/* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
/* call logsoftmaxbackward function */
_LogSoftmaxBackward(&gHalfGPU, &yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU, NULL, 1, CROSSENTROPY);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-2F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-2F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
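/*
Where dedxAnswer comes from: with y = log softmax(x), the backward pass for
the cross-entropy case uses the familiar softmax-minus-gold form
dE/dx_i = e^{y_i} - g_i. A minimal numeric check of that identity against the
arrays above (a sketch assuming nothing beyond the standard library):
*/
#include <cmath>
#include <cstdio>

int main()
{
    const double y[3] = {-2.4076, -1.4076, -0.4076};  /* log softmax of xData */
    const double g[3] = {0.5, 0.8, 1.5};              /* gold values */
    for (int i = 0; i < 3; i++)
        std::printf("%.4f ", std::exp(y[i]) - g[i]);
    std::printf("\n");  /* prints -0.4100 -0.5553 -0.8348, matching dedxAnswer */
    return 0;
}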
/*
case 6: float16 test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
In this case, LossName=SQUAREDERROR
*/
bool TestLogSoftmax6()
{
/* a tensor of size (1, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
DTYPE gData[1][3] = { {0.5F, 0.8F, 1.5F} };
DTYPE yAnswer[1][3] = { {-2.4076F, -1.4076F, -0.4076F} };
DTYPE dedxAnswer[1][3] = { {-0.4100F, -0.5553F, -0.8348F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor gHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
gHalfGPU = ConvertDataType(*gGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
/* call logsoftmax function */
_LogSoftmax(&xHalfGPU, &yHalfGPU, 1);
/* call logsoftmaxbackward function */
_LogSoftmaxBackward(&gHalfGPU, &yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU, NULL, 1, SQUAREDERROR);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-2F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-2F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -352,6 +604,36 @@ bool TestLogSoftmax()
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestLogSoftmax4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestLogSoftmax5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestLogSoftmax6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -17,6 +17,7 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-14
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-07 float16/int8 added
*/
#include "TMatrixMul.h"
...@@ -507,6 +508,304 @@ bool TestMatrixMul4()
#endif // USE_CUDA
}
/*
case 5: float16 matrix multiplication.
In this case, float16 a=(2, 3), float16 b=(3, 2) -> float16 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul5()
{
/* a source tensor of size (2, 3) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensors */
XTensor halfSGPU1;
XTensor halfSGPU2;
XTensor halfTGPU;
XTensor halfTUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to float16 */
halfSGPU1 = ConvertDataType(*sGPU1, X_FLOAT16);
halfSGPU2 = ConvertDataType(*sGPU2, X_FLOAT16);
halfTGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call MatrixMul function */
_MatrixMul(&halfSGPU1, X_NOTRANS, &halfSGPU2, X_NOTRANS, &halfTGPU);
halfTUserGPU = MatrixMul(halfSGPU1, X_NOTRANS, halfSGPU2, X_NOTRANS);
/* convert data type from float16 to float */
_ConvertDataType(&halfTGPU, tGPU);
tUserGPU = ConvertDataType(halfTUserGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
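/*
Reference arithmetic for `answer` above, as a plain triple loop (a standalone
sketch, no NiuTrans types): each entry is the dot product of a row of a with
a column of b, e.g. c[0][0] = 1*0 + 2*1 + 3*2 = 8.
*/
#include <cstdio>

int main()
{
    const float a[2][3] = { {1.0F, 2.0F, 3.0F}, {-4.0F, 5.0F, 6.0F} };
    const float b[3][2] = { {0.0F, -1.0F}, {1.0F, 2.0F}, {2.0F, 1.0F} };
    float c[2][2] = { {0.0F} };
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
            for (int k = 0; k < 3; k++)
                c[i][j] += a[i][k] * b[k][j];   /* c = a * b */
    std::printf("%g %g\n%g %g\n", c[0][0], c[0][1], c[1][0], c[1][1]);
    return 0;                                   /* prints 8 6 / 17 20 */
}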
/*
case 6: float16 matrix multiplication.
In this case, float16 a=(2, 3), float16 b=(3, 2) -> float32 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul6()
{
/* a source tensor of size (2, 3) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensors */
XTensor halfSGPU1;
XTensor halfSGPU2;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to float16 */
halfSGPU1 = ConvertDataType(*sGPU1, X_FLOAT16);
halfSGPU2 = ConvertDataType(*sGPU2, X_FLOAT16);
/* call MatrixMul function */
_MatrixMul(&halfSGPU1, X_NOTRANS, &halfSGPU2, X_NOTRANS, tGPU);
tUserGPU = MatrixMul(halfSGPU1, X_NOTRANS, halfSGPU2, X_NOTRANS, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 7: int8 matrix multiplication.
In this case, int8 a=(2, 3), int8 b=(3, 2) -> float32 c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul7()
{
/* a source tensor of size (2, 3) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (3, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 3;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1, 2, 3},
{-4, 5, 6} };
DTYPE sData2[3][2] = { {0, -1},
{1, 2},
{2, 1} };
DTYPE answer[2][2] = { {8, 6},
{17, 20} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create int8 tensors */
XTensor int8SGPU1;
XTensor int8SGPU2;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to int8 */
int8SGPU1 = ConvertDataType(*sGPU1, X_INT8);
int8SGPU2 = ConvertDataType(*sGPU2, X_INT8);
/* call MatrixMul function */
_MatrixMul(&int8SGPU1, X_NOTRANS, &int8SGPU2, X_NOTRANS, tGPU);
tUserGPU = MatrixMul(int8SGPU1, X_NOTRANS, int8SGPU2, X_NOTRANS, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU1;
delete sGPU2;
delete tGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
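/*
Why the int8 case can be checked against exact float answers: every input fits
in a signed 8-bit value and each dot product is accumulated in a wider integer
before being written to the float32 target, so nothing is rounded. A sketch of
that accumulation pattern (this is an assumption about the numeric behaviour,
not the cuBLAS code path NiuTrans actually dispatches to on GPU):
*/
#include <cstdint>
#include <cstdio>

int main()
{
    const int8_t a[2][3] = { {1, 2, 3}, {-4, 5, 6} };
    const int8_t b[3][2] = { {0, -1}, {1, 2}, {2, 1} };
    float c[2][2];
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++) {
            int32_t acc = 0;                      /* wide accumulator */
            for (int k = 0; k < 3; k++)
                acc += (int32_t)a[i][k] * (int32_t)b[k][j];
            c[i][j] = (float)acc;                 /* float32 output tensor */
        }
    std::printf("%g %g %g %g\n", c[0][0], c[0][1], c[1][0], c[1][1]);
    return 0;
}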
/* other cases */
/*
...@@ -556,6 +855,33 @@ bool TestMatrixMul()
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestMatrixMul5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestMatrixMul6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestMatrixMul7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -23,6 +23,7 @@
#define __TEST_MATRIXMUL_H__
#include "../core/arithmetic/MatrixMul.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -20,6 +20,7 @@
*/
#include "TMultiply.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -148,6 +149,118 @@ bool TestMultiply1()
#endif // USE_CUDA
}
/*
case 2: float16 element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/
bool TestMultiply2()
{
/* a source tensor of size (2, 2) */
int sOrder1 = 2;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 2;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 2) */
int sOrder2 = 2;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
sDimSize2[1] = 2;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][2] = { {0.0F, 1.0F},
{2.0F, 3.0F} };
DTYPE sData2[2][2] = { {0.0F, 1.0F},
{2.0F, 3.0F} };
DTYPE answer[2][2] = { {0.0F, 1.0F},
{4.0F, 9.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensor */
XTensor sHalfGPU1;
XTensor sHalfGPU2;
XTensor tHalfGPU;
XTensor tMeHalfGPU;
XTensor tUserHalfGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU1 = ConvertDataType(*sGPU1, X_FLOAT16);
sHalfGPU2 = ConvertDataType(*sGPU2, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
tMeHalfGPU = ConvertDataType(*tMeGPU, X_FLOAT16);
/* call multiply function */
_Multiply(&sHalfGPU1, &sHalfGPU2, &tHalfGPU, 0, 0);
_MultiplyMe(&tMeHalfGPU, &sHalfGPU2, 0, 0);
tUserHalfGPU = Multiply(sHalfGPU1, sHalfGPU2, 0);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
_ConvertDataType(&tMeHalfGPU, tMeGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) &&
tMeGPU->CheckData(answer, tUnitNum) &&
tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sGPU1;
delete sGPU2;
delete tGPU;
delete tMeGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
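/*
The semantics under test, written out: with alpha = 0 the call is a pure
element-wise (Hadamard) product, and the "Me" variant overwrites its first
argument in place. A standalone sketch of c(i) = a(i)*b(i) + \alpha * c(i):
*/
#include <cstdio>

int main()
{
    const float a[4] = {0.0F, 1.0F, 2.0F, 3.0F};
    const float b[4] = {0.0F, 1.0F, 2.0F, 3.0F};
    const float alpha = 0.0F;
    float c[4] = {0.0F};
    for (int i = 0; i < 4; i++)
        c[i] = a[i] * b[i] + alpha * c[i];
    std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);  /* 0 1 4 9 */
    return 0;
}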
/* other cases */
/*
TODO!!
...@@ -169,6 +282,16 @@ bool TestMultiply()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMultiply2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -17,11 +17,13 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16/int/int8 added
*/
#include "TMultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../XTensor.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
...@@ -248,6 +250,205 @@ bool TestMultiplyDim2()
#endif // USE_CUDA
}
/*
case 3: float16 tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
In this case, (2, 4) * (2) = (2, 4), n = 0.
*/
bool TestMultiplyDim3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call multiplydim function */
_MultiplyDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_MultiplyDimMe(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = MultiplyDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: float16 tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting.
In this case, (2, 4) * (4) = (2, 4), n = 1.
*/
bool TestMultiplyDim4()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[4] = {1.0F, -1.0F , 1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, -1.0F, 2.0F, -3.0F},
{4.0F, -5.0F, 6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call multiplydim function */
_MultiplyDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_MultiplyDimMe(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = MultiplyDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
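/*
What MultiplyDim broadcasts, in plain loops: for n = 0 the vector b indexes
rows of a, for n = 1 it indexes columns. A standalone sketch reproducing the
n = 1 answer above:
*/
#include <cstdio>

int main()
{
    const float a[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                            {4.0F, 5.0F, 6.0F, 7.0F} };
    const float b[4] = {1.0F, -1.0F, 1.0F, -1.0F};
    float c[2][4];
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 4; j++)
            c[i][j] = a[i][j] * b[j];   /* b broadcast along dimension 1 */
    for (int i = 0; i < 2; i++)
        std::printf("%g %g %g %g\n", c[i][0], c[i][1], c[i][2], c[i][3]);
    return 0;
}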
/* test for MultiplyDim Function */
bool TestMultiplyDim()
{
...@@ -272,6 +473,24 @@ bool TestMultiplyDim()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestMultiplyDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestMultiplyDim4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -17,98 +17,184 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16/int/int8 added
*/
#include "TNegate.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set every entry to its minus value */
bool TestNegate1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{-3.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {-1.0F, 2.0F},
{3.0F, -4.0F},
{-5.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(a, b);
_NegateMe(aMe);
bUser = Negate(*a);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(aGPU, bGPU);
_NegateMe(aMeGPU);
bUserGPU = Negate(*aGPU);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: set every entry to its minus value */
bool TestNegate2()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0F, 0.0F},
{0.0F, 0.0F},
{0.0F, 0.0F} };
DTYPE answer[3][2] = { {-0.0F, -0.0F},
{-0.0F, -0.0F},
{-0.0F, -0.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(a, b);
_NegateMe(aMe);
bUser = Negate(*a);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(aGPU, bGPU);
_NegateMe(aMeGPU);
bUserGPU = Negate(*aGPU);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 3: float16 set every entry to its minus value */
bool TestNegate3()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{-3.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {-1.0F, 2.0F},
{3.0F, -4.0F},
{-5.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor aMeHalfGPU;
XTensor bUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
aMeHalfGPU = ConvertDataType(*aMeGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
/* call negate function */
_Negate(&aHalfGPU, &bHalfGPU);
_NegateMe(&aMeHalfGPU);
bUserHalfGPU = Negate(aHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&bHalfGPU, bGPU);
_ConvertDataType(&aMeHalfGPU, aMeGPU);
bUserGPU = ConvertDataType(bUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Negate Function */
bool TestNegate()
{
XPRINT(0, stdout, "[TEST NEGATE] set every entry to its minus value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestNegate1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestNegate2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestNegate3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -86,8 +88,8 @@ bool TestReduceMax1()
tUser2 = ReduceMax(*s, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) &&
t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
...@@ -112,8 +114,8 @@ bool TestReduceMax1()
tUserGPU2 = ReduceMax(*sGPU, 1);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
...@@ -140,6 +142,113 @@ bool TestReduceMax1()
#endif // USE_CUDA
}
/*
case 2: float16 get the max value of the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
bool TestReduceMax2()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (4) */
int tOrder1 = 1;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 4;
int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
tUnitNum1 *= tDimSize1[i];
/* an output tensor of size (2) */
int tOrder2 = 1;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
{4.0F, 1.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0F, 5.0F, 6.0F, 7.0F};
DTYPE answer2[2] = {5.0F, 7.0F};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor tHalfGPU1;
XTensor tHalfGPU2;
XTensor tUserHalfGPU1;
XTensor tUserHalfGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tHalfGPU1 = ConvertDataType(*tGPU1, X_FLOAT16);
tHalfGPU2 = ConvertDataType(*tGPU2, X_FLOAT16);
/* call reducemax function */
_ReduceMax(&sHalfGPU, &tHalfGPU1, 0);
_ReduceMax(&sHalfGPU, &tHalfGPU2, 1);
tUserHalfGPU1 = ReduceMax(sHalfGPU, 0);
tUserHalfGPU2 = ReduceMax(sHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU1, tGPU1);
_ConvertDataType(&tHalfGPU2, tGPU2);
tUserGPU1 = ConvertDataType(tUserHalfGPU1, X_FLOAT);
tUserGPU2 = ConvertDataType(tUserHalfGPU2, X_FLOAT);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA
}
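/*
The two reductions above as plain loops (dim = 0 collapses rows and keeps
columns, dim = 1 collapses columns and keeps rows). The inputs are small
integers, which float16 represents exactly, so the checks need no tolerance.
A standalone sketch:
*/
#include <algorithm>
#include <cstdio>

int main()
{
    const float s[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
                            {4.0F, 1.0F, 6.0F, 7.0F} };
    float t1[4], t2[2];
    for (int j = 0; j < 4; j++)                 /* dim = 0: (2, 4) -> (4) */
        t1[j] = std::max(s[0][j], s[1][j]);
    for (int i = 0; i < 2; i++) {               /* dim = 1: (2, 4) -> (2) */
        t2[i] = s[i][0];
        for (int j = 1; j < 4; j++)
            t2[i] = std::max(t2[i], s[i][j]);
    }
    std::printf("%g %g %g %g | %g %g\n",
                t1[0], t1[1], t1[2], t1[3], t2[0], t2[1]);
    return 0;
}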
/* other cases */
/*
TODO!!
...@@ -160,6 +269,15 @@ bool TestReduceMax()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMax2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -17,10 +17,12 @@
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16 added
*/
#include "TReduceSum.h"
#include "../core/getandset/SetData.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -514,7 +516,6 @@ bool TestReduceSum5()
#endif // USE_CUDA
}
/*
case 6: test ReduceSum function.
Sum the items along a dimension of the tensor.
...@@ -607,6 +608,126 @@ bool TestReduceSum6()
}
/*
case 7: float16 test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
bool TestReduceSum7()
{
/* a tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (4) */
int tOrder1 = 1;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 4;
int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
tUnitNum1 *= tDimSize1[i];
/* a tensor of size (2) */
int tOrder2 = 1;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0F, 6.0F, 8.0F, 10.0F};
DTYPE answer2[2] = {6.0F, 22.0F};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* create float16 tensors */
XTensor sHalfGPU;
XTensor shiftHalfGPU1;
XTensor shiftHalfGPU2;
XTensor tHalfGPU1;
XTensor tHalfGPU2;
XTensor tUserHalfGPU1;
XTensor tUserHalfGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU1->SetZeroAll();
shiftGPU2->SetZeroAll();
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
shiftHalfGPU1 = ConvertDataType(*shiftGPU1, X_FLOAT16);
shiftHalfGPU2 = ConvertDataType(*shiftGPU2, X_FLOAT16);
tHalfGPU1 = ConvertDataType(*tGPU1, X_FLOAT16);
tHalfGPU2 = ConvertDataType(*tGPU2, X_FLOAT16);
/* call reducesum function */
_ReduceSum(&sHalfGPU, &tHalfGPU1, 0);
_ReduceSum(&sHalfGPU, &tHalfGPU2, 1);
tUserHalfGPU1 = ReduceSum(sHalfGPU, 0, shiftHalfGPU1);
tUserHalfGPU2 = ReduceSum(sHalfGPU, 1, shiftHalfGPU2);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU1, tGPU1);
_ConvertDataType(&tHalfGPU2, tGPU2);
tUserGPU1 = ConvertDataType(tUserHalfGPU1, X_FLOAT);
tUserGPU2 = ConvertDataType(tUserHalfGPU2, X_FLOAT);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete sGPU;
delete shiftGPU1;
delete shiftGPU2;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA
}
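/*
Same idea for ReduceSum, sketched with plain loops. The extra `shift` tensor
passed to the user-level call is read here as being subtracted from each item
before summing; it is all zeros in this test, so the calls reduce to plain
sums either way (the subtraction semantics is an assumption stated for the
sketch, not something the test itself pins down):
*/
#include <cstdio>

int main()
{
    const float s[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                            {4.0F, 5.0F, 6.0F, 7.0F} };
    const float shift = 0.0F;
    float t1[4] = {0.0F}, t2[2] = {0.0F};
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 2; i++)
            t1[j] += s[i][j] - shift;   /* dim = 0: {4, 6, 8, 10} */
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 4; j++)
            t2[i] += s[i][j] - shift;   /* dim = 1: {6, 22} */
    std::printf("%g %g %g %g | %g %g\n",
                t1[0], t1[1], t1[2], t1[3], t2[0], t2[1]);
    return 0;
}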
/* other cases */
/*
TODO!!
...@@ -672,6 +793,15 @@ bool TestReduceSum()
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestReduceSum7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16/int/int8 added
*/
#include "TScaleAndShift.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -113,6 +115,254 @@ bool TestScaleAndShift1()
#endif // USE_CUDA
}
/*
case 2: float16 scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift2()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
{8.5F, 10.5F, 12.5F, 14.5F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 0.5F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create float16 tensor */
XTensor sHalfGPU;
XTensor tHalfGPU;
XTensor tMeHalfGPU;
XTensor tUserHalfGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tMeHalfGPU = ConvertDataType(*tMeGPU, X_FLOAT16);
tHalfGPU = ConvertDataType(*tGPU, X_FLOAT16);
/* call scaleandshift function */
_ScaleAndShift(&sHalfGPU, &tHalfGPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeHalfGPU, scaleFactor, shiftFactor);
tUserHalfGPU = ScaleAndShift(sHalfGPU, scaleFactor, shiftFactor);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU, tGPU);
_ConvertDataType(&tMeHalfGPU, tMeGPU);
tUserGPU = ConvertDataType(tUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: int32 scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift3()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {1.0F, 3.0F, 5.0F, 7.0F},
{9.0F, 11.0F, 13.0F, 15.0F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 1.8F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create int32 tensor */
XTensor sInt32GPU;
XTensor tInt32GPU;
XTensor tMeInt32GPU;
XTensor tUserInt32GPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to int32 */
sInt32GPU = ConvertDataType(*sGPU, X_INT);
tMeInt32GPU = ConvertDataType(*tMeGPU, X_INT);
tInt32GPU = ConvertDataType(*tGPU, X_INT);
/* call scaleandshift function */
_ScaleAndShift(&sInt32GPU, &tInt32GPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeInt32GPU, scaleFactor, shiftFactor);
tUserInt32GPU = ScaleAndShift(sInt32GPU, scaleFactor, shiftFactor);
/* convert data type from int32 to float */
_ConvertDataType(&tInt32GPU, tGPU);
_ConvertDataType(&tMeInt32GPU, tMeGPU);
tUserGPU = ConvertDataType(tUserInt32GPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: int8 scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift4()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {1.0F, 3.0F, 5.0F, 7.0F},
{9.0F, 11.0F, 13.0F, 15.0F} };
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 1.8F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* create int8 tensor */
XTensor sInt8GPU;
XTensor tInt8GPU;
XTensor tMeInt8GPU;
XTensor tUserInt8GPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* convert data type from float to int8 */
sInt8GPU = ConvertDataType(*sGPU, X_INT8);
tMeInt8GPU = ConvertDataType(*tMeGPU, X_INT8);
tInt8GPU = ConvertDataType(*tGPU, X_INT8);
/* call scaleandshift function */
_ScaleAndShift(&sInt8GPU, &tInt8GPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(&tMeInt8GPU, scaleFactor, shiftFactor);
tUserInt8GPU = ScaleAndShift(sInt8GPU, scaleFactor, shiftFactor);
/* convert data type from int8 to float */
_ConvertDataType(&tInt8GPU, tGPU);
_ConvertDataType(&tMeInt8GPU, tMeGPU);
tUserGPU = ConvertDataType(tUserInt8GPU, X_FLOAT);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) &&
tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
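/*
Why cases 3 and 4 expect {1, 3, 5, ...} rather than {1.8, 3.8, ...}: once the
tensor holds an integer type, the fractional part of shiftFactor = 1.8F is
lost in integer arithmetic. Whether the factor is truncated before the
multiply-add or the final result is truncated afterwards, the expected values
come out the same for this data; the sketch below assumes the former:
*/
#include <cstdio>

int main()
{
    const int s[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    const float scale = 2.0F, shift = 1.8F;
    for (int i = 0; i < 8; i++) {
        int v = s[i] * (int)scale + (int)shift;   /* (int)1.8F == 1 */
        std::printf("%d ", v);                    /* 1 3 5 7 9 11 13 15 */
    }
    std::printf("\n");
    return 0;
}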
/* other cases */
/*
TODO!!
...@@ -133,6 +383,33 @@ bool TestScaleAndShift()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestScaleAndShift2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestScaleAndShift3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestScaleAndShift4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
...@@ -17,10 +17,12 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-22 float16 added
*/
#include "TSetData.h"
#include "../core/getandset/SetData.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -406,6 +408,427 @@ bool TestSetData5()
#endif // USE_CUDA
}
/*
case 6: float16 test SetDataRand function.
set the tensor items by a uniform distribution in range [lower, upper].
*/
bool TestSetData6()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE answer[2][4] = {0};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor sHalfGPU;
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
/* call setdatarand function */
_SetDataRand(&sHalfGPU, 0.0, 1.0);
/* convert data type from float16 to float */
_ConvertDataType(&sHalfGPU, sGPU);
/* check results */
gpuTest = sGPU->CheckData(answer, sUnitNum, 1.0F);
/* destroy variables */
delete sGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 7: float16 test SetDataRandP function.
first set the tensor items by a uniform distribution in range [lower, upper].
then set each item to a pre-defined value if the item >= p, and to 0 otherwise
*/
bool TestSetData7()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE answer[2][4] = {0};
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor sHalfGPU;
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
/* call setdatarandp function */
_SetDataRandP(&sHalfGPU, 0.0, 1.0, 0.5, 1.0);
/* convert data type from float16 to float */
_ConvertDataType(&sHalfGPU, sGPU);
/* check results */
gpuTest = sGPU->CheckData(answer, sUnitNum, 1.1F);
/* destroy variables */
delete sGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
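/*
What the loose tolerance in case 7 buys: after _SetDataRandP every item is
either 0 or the pre-defined value 1.0, so comparing against an all-zero
`answer` with tolerance 1.1F accepts any legal outcome while still rejecting
NaNs and garbage. A host-side sketch of the rand-then-threshold rule, with
the standard-library RNG standing in for the tensor kernel:
*/
#include <cstdio>
#include <cstdlib>

int main()
{
    const float lower = 0.0F, upper = 1.0F, p = 0.5F, value = 1.0F;
    for (int i = 0; i < 8; i++) {
        float r = lower + (upper - lower) * ((float)std::rand() / RAND_MAX);
        float item = (r >= p) ? value : 0.0F;   /* keep `value` when r >= p */
        std::printf("%g ", item);
    }
    std::printf("\n");
    return 0;
}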
/*
case 8: float16 test SetDataIndexed function.
modify data items along a given dimension.
*/
bool TestSetData8()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a data tensor of size (4) for GPU test */
int dataOrder = 1;
int * dataDimSize = new int[dataOrder];
dataDimSize[0] = 4;
int dataUnitNum = 1;
for (int i = 0; i < dataOrder; i++)
dataUnitNum *= dataDimSize[i];
DTYPE data[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE answer[2][4] = { {1.0F, 1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F, 3.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor sHalfGPU;
XTensor modifyHalfGPU;
/* Initialize modifyGPU */
modifyGPU->SetData(data, dataUnitNum);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
modifyHalfGPU = ConvertDataType(*modifyGPU, X_FLOAT16);
/* Initialize sHalfGPU */
_SetDataFixed(&sHalfGPU, 1.0F);
/* call setdataindexed function */
_SetDataIndexed(&sHalfGPU, &modifyHalfGPU, 0, 1);
/* convert data type from float16 to float */
_ConvertDataType(&sHalfGPU, sGPU);
/* check results */
gpuTest = sGPU->CheckData(answer, sUnitNum, 1e-5F);
/* destroy variables */
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 9: float16 test SetDataIndexed function.
modify data items along a given dimension.
*/
bool TestSetData9()
{
/* a input tensor of size (2, 4, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a data tensor of size (2, 3) for GPU test */
int dataOrder = 2;
int * dataDimSize = new int[dataOrder];
dataDimSize[0] = 2;
dataDimSize[1] = 3;
int dataUnitNum = 1;
for (int i = 0; i < dataOrder; i++)
dataUnitNum *= dataDimSize[i];
DTYPE data[2][3] = { { 0.0F, 1.0F, 2.0F },
{ 3.0F, 4.0F, 5.0F } };
DTYPE answer[2][4][3] = { { {1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} },
{ {1.0F, 1.0F, 1.0F},
{3.0F, 4.0F, 5.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} } };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor sHalfGPU;
XTensor modifyHalfGPU;
/* Initialize modifyGPU */
modifyGPU->SetData(data, dataUnitNum);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
modifyHalfGPU = ConvertDataType(*modifyGPU, X_FLOAT16);
/* Initialize sHalfGPU */
_SetDataFixed(&sHalfGPU, 1.0F);
/* call setdataindexed function */
_SetDataIndexed(&sHalfGPU, &modifyHalfGPU, 1, 1);
/* convert data type from float16 to float */
_ConvertDataType(&sHalfGPU, sGPU);
/* check results */
gpuTest = sGPU->CheckData(answer, sUnitNum, 1e-5F);
/* destroy variables */
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 10: float16 test of the SetDataDim function.
set data items along a given dimension (and keep the remaining items unchanged)
*/
bool TestSetData10()
{
/* an input tensor of size (3, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 3;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE sData[3][3] = { {1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F},
{7.0F, 8.0F, 9.0F} };
DTYPE answer[3][3] = { {1.0F, 2.0F, 3.0F},
{0.0F, 0.0F, 0.0F},
{7.0F, 8.0F, 9.0F} };
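/* reading the call below against this answer, the arguments of
_SetDataDim(tensor, beg, len, dim, p) appear to be the start index, the
number of indices to set, the dimension, and the value: row 1 (beg = 1,
len = 1) along dimension 0 is set to 0 */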
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor sHalfGPU;
/* initialize variables */
sGPU->SetData(sData, unitNum);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
/* call _setdatadim function */
_SetDataDim(&sHalfGPU, 1, 1, 0, 0);
/* convert data type from float16 to float */
_ConvertDataType(&sHalfGPU, sGPU);
/* check results */
gpuTest = sGPU->CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete sGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 11: float16 test of the SetDataDim function.
set data items along a given dimension (and keep the remaining items unchanged)
*/
bool TestSetData11()
{
/* an input tensor of size (2, 4, 3) */
int order = 3;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
dimSize[2] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE data[2][4][3] = { { {1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} },
{ {1.0F, 1.0F, 1.0F},
{3.0F, 4.0F, 5.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} } };
DTYPE answer[2][4][3] = { { {1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F},
{5.0F, 5.0F, 5.0F},
{1.0F, 1.0F, 1.0F} },
{ {1.0F, 1.0F, 1.0F},
{3.0F, 4.0F, 5.0F},
{5.0F, 5.0F, 5.0F},
{1.0F, 1.0F, 1.0F} } };
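/* by the same reading, the call below sets index 2 (beg = 2, len = 1)
along dimension 1 to 5.0, turning the third (3)-sized row of each block
into {5, 5, 5} */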
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor sHalfGPU;
/* initialize variables */
sGPU->SetData(data, unitNum);
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
/* call _setdatadim function */
_SetDataDim(&sHalfGPU, 2, 1, 1, 5.0F);
/* convert data type from float16 to float */
_ConvertDataType(&sHalfGPU, sGPU);
/* check results */
gpuTest = sGPU->CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete sGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -462,6 +885,60 @@ bool TestSetData()
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestSetData6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestSetData7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* case 8 test */
caseFlag = TestSetData8();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 8 failed!\n");
}
else
XPRINT(0, stdout, ">> case 8 passed!\n");
/* case 9 test */
caseFlag = TestSetData9();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 9 failed!\n");
}
else
XPRINT(0, stdout, ">> case 9 passed!\n");
/* case 10 test */
caseFlag = TestSetData10();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 10 failed!\n");
}
else
XPRINT(0, stdout, ">> case 10 passed!\n");
/* case 11 test */
caseFlag = TestSetData11();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 11 failed!\n");
}
else
XPRINT(0, stdout, ">> case 11 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "TSign.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -29,6 +31,93 @@ Set every entry to its sign value.
*/
bool TestSign1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Sign function */
_Sign(a, b);
_SignMe(aMe);
bUser = Sign(*a);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Sign function */
_Sign(aGPU, bGPU);
_SignMe(aMeGPU);
bUserGPU = Sign(*aGPU);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: float16 test of the Sign function.
Set every entry to its sign value.
*/
bool TestSign2()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
...@@ -39,7 +128,7 @@ bool TestSign1()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
...@@ -49,24 +138,6 @@ bool TestSign1()
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
...@@ -77,22 +148,37 @@ bool TestSign1()
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor aMeHalfGPU;
XTensor bUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
aMeHalfGPU = ConvertDataType(*aMeGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
/* call Sign function */
_Sign(&aHalfGPU, &bHalfGPU);
_SignMe(&aMeHalfGPU);
bUserHalfGPU = Sign(aHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&bHalfGPU, bGPU);
_ConvertDataType(&aMeHalfGPU, aMeGPU);
bUserGPU = ConvertDataType(bUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete aMeGPU;
...@@ -101,15 +187,13 @@ bool TestSign1()
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -118,33 +202,43 @@ TODO!!
/* test for Sign Function */
bool TestSign()
{
XPRINT(0, stdout, "[TEST Sign] set every entry to its sign value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSign1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSign2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -22,6 +22,7 @@
#include "../XTensor.h"
#include "../XUtility.h"
#include "TSoftmax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -207,6 +208,167 @@ bool TestSoftmax2()
#endif // USE_CUDA
}
/*
case 3: float16 test of the Softmax function.
softmax function: y = e^x / \sum_{i} e^{x_i}
*/
bool TestSoftmax3()
{
/* a tensor of size (2, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5F, 0.7F, 1.4F} };
DTYPE answer[2][3] = { {0.0900F, 0.2447F, 0.6652F},
{0.2136F, 0.2609F, 0.5254F} };
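/* the expected values can be checked by hand for row 0:
e^0 = 1.000, e^1 = 2.718, e^2 = 7.389, and their sum is 11.107, so
y = {1.000, 2.718, 7.389} / 11.107 = {0.0900, 0.2447, 0.6652}; the looser
1e-3 tolerance below reflects the ~3 decimal digits float16 can hold */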
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor yUserHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
yGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
/* call softmax function */
_Softmax(&xHalfGPU, &yHalfGPU, 1);
yUserHalfGPU = Softmax(xHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
yUserGPU = ConvertDataType(yUserHalfGPU, X_FLOAT);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-3F) &&
yUserGPU.CheckData(answer, unitNum, 1e-3F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: float16 test of the SoftmaxBackward function.
SoftmaxBackward function: dE/dx_j = -gold_j + y_j
In this case, LossName=CROSSENTROPY.
*/
bool TestSoftmax4()
{
/* an input tensor of size (1, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
DTYPE yAnswer[1][3] = { {0.0900F, 0.2447F, 0.6652F} };
DTYPE dedxAnswer[1][3] = { {0.0900F, 0.2447F, -0.3347F} };
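/* with LossName = CROSSENTROPY, dE/dx = y - gold, so
dedx = {0.0900 - 0, 0.2447 - 0, 0.6652 - 1} = {0.0900, 0.2447, -0.3348},
which matches dedxAnswer up to rounding */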
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* create float16 tensors */
XTensor xHalfGPU;
XTensor yHalfGPU;
XTensor gHalfGPU;
XTensor dedyHalfGPU;
XTensor dedxHalfGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
/* convert data type from float to float16 */
xHalfGPU = ConvertDataType(*xGPU, X_FLOAT16);
yHalfGPU = ConvertDataType(*yGPU, X_FLOAT16);
gHalfGPU = ConvertDataType(*gGPU, X_FLOAT16);
dedxHalfGPU = ConvertDataType(*dedxGPU, X_FLOAT16);
dedyHalfGPU = ConvertDataType(*dedyGPU, X_FLOAT16);
/* call softmax function */
_Softmax(&xHalfGPU, &yHalfGPU, 1);
/* call SoftmaxBackward function */
_SoftmaxBackward(&gHalfGPU, &yHalfGPU, &xHalfGPU, &dedyHalfGPU, &dedxHalfGPU, NULL, 1, CROSSENTROPY);
/* convert data type from float16 to float */
_ConvertDataType(&yHalfGPU, yGPU);
_ConvertDataType(&dedxHalfGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-3F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-3F);
/* destroy variables */
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -238,6 +400,26 @@ bool TestSoftmax()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSoftmax3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSoftmax4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -17,9 +17,11 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "TSub.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -109,8 +111,8 @@ bool TestSub1()
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] dimSize;
...@@ -214,6 +216,177 @@ bool TestSub2()
#endif // USE_CUDA
}
/* case 3: float16 tensor subtraction c = a - b * \beta */
bool TestSub3()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 5.0F, 8.0F},
{11.0F, 14.0F, 17.0F, 20.0F} };
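/* with the default beta = 1.0 this is plain element-wise subtraction,
e.g. c[0][0] = 0.0 - 1.0 = -1.0 and c[1][3] = 7.0 - (-13.0) = 20.0 */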
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call Sub function */
_Sub(&aHalfGPU, &bHalfGPU, &cHalfGPU);
_SubMe(&cMeHalfGPU, &bHalfGPU);
cUserHalfGPU = Sub(aHalfGPU, bHalfGPU);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum, 1e-4F) &&
cMeGPU->CheckData(answer, unitNum, 1e-4F) &&
cUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: float16 tensor subtraction c = a - b * \beta */
bool TestSub4()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++) {
unitNum *= dimSize[i];
}
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {-0.5F, 1.5F, 3.5F, 5.5F},
{7.5F, 9.5F, 11.5F, 13.5F} };
float beta = 0.5F;
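/* one element worked through:
c[0][0] = a[0][0] - beta * b[0][0] = 0.0 - 0.5 * 1.0 = -0.5 */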
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call Sub function */
_Sub(&aHalfGPU, &bHalfGPU, &cHalfGPU, beta);
_SubMe(&cMeHalfGPU, &bHalfGPU, beta);
cUserHalfGPU = Sub(aHalfGPU, bHalfGPU, beta);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum, 1e-4F) &&
cMeGPU->CheckData(answer, unitNum, 1e-4F) &&
cUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -243,6 +416,24 @@ bool TestSub()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSub3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSub4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -17,11 +17,13 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "TSubDim.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -32,6 +34,231 @@ i.e., a is subtracted with b by broadcasting
*/
bool TestSubDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
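/* b is broadcast along dimension 0: b[0] = 1 is subtracted from row 0
and b[1] = -1 from row 1, e.g. c[1][0] = 4.0 - (-1.0) = 5.0 */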
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 0);
_SubDim(cMe, b, 0);
cUser = SubDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call sub function */
_SubDim(aGPU, bGPU, cGPU, 0);
_SubDim(cMeGPU, bGPU, 0);
cUserGPU = SubDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
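/* here bUnitNum (4) matches dimension 1 of a, so b is applied flattened
as {1, -1, -1, 1} across each row,
e.g. c[0] = {0-1, 1+1, 2+1, 3-1} = {-1, 2, 3, 2} */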
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 1);
_SubDim(cMe, b, 1);
cUser = SubDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call sub function */
_SubDim(aGPU, bGPU, cGPU, 1);
_SubDim(cMeGPU, bGPU, 1);
cUserGPU = SubDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: float16 tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
...@@ -60,29 +287,6 @@ bool TestSubDim1()
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
...@@ -94,27 +298,41 @@ bool TestSubDim1()
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sub function */
_SubDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_SubDim(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = SubDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
...@@ -125,10 +343,6 @@ bool TestSubDim1()
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
...@@ -137,11 +351,11 @@ bool TestSubDim1()
}
/*
case 4: float16 tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim4()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
...@@ -173,29 +387,6 @@ bool TestSubDim2()
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
...@@ -207,27 +398,41 @@ bool TestSubDim2()
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sub function */
_SubDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_SubDim(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = SubDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
...@@ -238,10 +443,6 @@ bool TestSubDim2()
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
...@@ -249,6 +450,7 @@ bool TestSubDim2()
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -257,41 +459,59 @@ TODO!!
/* test for SubDim Function */
bool TestSubDim()
{
XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSubDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSubDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSubDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSubDim4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -17,6 +17,7 @@
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int/int8 added
*/
#include "TSum.h"
...@@ -109,8 +110,8 @@ bool TestSum1()
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] dimSize;
...@@ -214,6 +215,514 @@ bool TestSum2()
#endif // USE_CUDA
}
/* case 3: float16 tensor summation c = a + b * \beta */
bool TestSum3()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {1.0F, 0.0F, -1.0F, -2.0F},
{-3.0F, -4.0F, -5.0F, -6.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensors */
XTensor halfAGPU;
XTensor halfBGPU;
XTensor halfCGPU;
XTensor halfCMeGPU;
XTensor halfCUserGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
halfAGPU = ConvertDataType(*aGPU, X_FLOAT16);
halfBGPU = ConvertDataType(*bGPU, X_FLOAT16);
halfCGPU = ConvertDataType(*cGPU, X_FLOAT16);
halfCMeGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call Sum function */
_Sum(&halfAGPU, &halfBGPU, &halfCGPU);
_SumMe(&halfCMeGPU, &halfBGPU);
halfCUserGPU = Sum(halfAGPU, halfBGPU);
/* convert data type from float16 to float */
_ConvertDataType(&halfCGPU, cGPU);
_ConvertDataType(&halfCMeGPU, cMeGPU);
cUserGPU = ConvertDataType(halfCUserGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum) &&
cMeGPU->CheckData(answer, unitNum) &&
cUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: float16 tensor summation c = a + b * \beta */
bool TestSum4()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++) {
unitNum *= dimSize[i];
}
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {0.5F, 0.5F, 0.5F, 0.5F},
{0.5F, 0.5F, 0.5F, 0.5F} };
float beta = 0.5F;
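/* every entry of a + 0.5 * b works out to the same value,
e.g. c[0][3] = 3.0 + 0.5 * (-5.0) = 0.5 */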
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensors */
XTensor halfAGPU;
XTensor halfBGPU;
XTensor halfCGPU;
XTensor halfCMeGPU;
XTensor halfCUserGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
halfAGPU = ConvertDataType(*aGPU, X_FLOAT16);
halfBGPU = ConvertDataType(*bGPU, X_FLOAT16);
halfCGPU = ConvertDataType(*cGPU, X_FLOAT16);
halfCMeGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call Sum function */
_Sum(&halfAGPU, &halfBGPU, &halfCGPU, beta);
_SumMe(&halfCMeGPU, &halfBGPU, beta);
halfCUserGPU = Sum(halfAGPU, halfBGPU, beta);
/* convert data type from float16 to float */
_ConvertDataType(&halfCGPU, cGPU);
_ConvertDataType(&halfCMeGPU, cMeGPU);
cUserGPU = ConvertDataType(halfCUserGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum) &&
cMeGPU->CheckData(answer, unitNum) &&
cUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 5: int tensor summation c = a + b * \beta */
bool TestSum5()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {1.0F, 0.0F, -1.0F, -2.0F},
{-3.0F, -4.0F, -5.0F, -6.0F} };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create int tensors */
XTensor intAGPU;
XTensor intBGPU;
XTensor intCGPU;
XTensor intCMeGPU;
XTensor intCUserGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to int */
intAGPU = ConvertDataType(*aGPU, X_INT);
intBGPU = ConvertDataType(*bGPU, X_INT);
intCGPU = ConvertDataType(*cGPU, X_INT);
intCMeGPU = ConvertDataType(*cMeGPU, X_INT);
/* call Sum function */
_Sum(&intAGPU, &intBGPU, &intCGPU);
_SumMe(&intCMeGPU, &intBGPU);
intCUserGPU = Sum(intAGPU, intBGPU);
/* convert data type from int to float */
_ConvertDataType(&intCGPU, cGPU);
_ConvertDataType(&intCMeGPU, cMeGPU);
cUserGPU = ConvertDataType(intCUserGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum) &&
cMeGPU->CheckData(answer, unitNum) &&
cUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 6: int tensor summation c = a + b * \beta */
bool TestSum6()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++) {
unitNum *= dimSize[i];
}
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {2.0F, -1.0F, -4.0F, -7.0F},
{-10.0F, -13.0F, -16.0F, -19.0F} };
float beta = 2.0F;
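/* with integer tensors and beta = 2.0 the arithmetic stays exact,
e.g. c[1][3] = 7 + 2 * (-13) = -19 */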
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create int tensors */
XTensor intAGPU;
XTensor intBGPU;
XTensor intCGPU;
XTensor intCMeGPU;
XTensor intCUserGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to int */
intAGPU = ConvertDataType(*aGPU, X_INT);
intBGPU = ConvertDataType(*bGPU, X_INT);
intCGPU = ConvertDataType(*cGPU, X_INT);
intCMeGPU = ConvertDataType(*cMeGPU, X_INT);
/* call Sum function */
_Sum(&intAGPU, &intBGPU, &intCGPU, beta);
_SumMe(&intCMeGPU, &intBGPU, beta);
intCUserGPU = Sum(intAGPU, intBGPU, beta);
/* convert data type from int to float */
_ConvertDataType(&intCGPU, cGPU);
_ConvertDataType(&intCMeGPU, cMeGPU);
cUserGPU = ConvertDataType(intCUserGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum) &&
cMeGPU->CheckData(answer, unitNum) &&
cUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 7: int8 tensor summation c = a + b * \beta */
bool TestSum7()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {1.0F, 0.0F, -1.0F, -2.0F},
{-3.0F, -4.0F, -5.0F, -6.0F} };
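/* all inputs and results here lie within [-13, 7], well inside the int8
range [-128, 127], so the float-to-int8 round trip loses nothing */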
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create int8 tensors */
XTensor int8AGPU;
XTensor int8BGPU;
XTensor int8CGPU;
XTensor int8CMeGPU;
XTensor int8CUserGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to int8 */
int8AGPU = ConvertDataType(*aGPU, X_INT8);
int8BGPU = ConvertDataType(*bGPU, X_INT8);
int8CGPU = ConvertDataType(*cGPU, X_INT8);
int8CMeGPU = ConvertDataType(*cMeGPU, X_INT8);
/* call Sum function */
_Sum(&int8AGPU, &int8BGPU, &int8CGPU);
_SumMe(&int8CMeGPU, &int8BGPU);
int8CUserGPU = Sum(int8AGPU, int8BGPU);
/* convert data type from int8 to float */
_ConvertDataType(&int8CGPU, cGPU);
_ConvertDataType(&int8CMeGPU, cMeGPU);
cUserGPU = ConvertDataType(int8CUserGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum) &&
cMeGPU->CheckData(answer, unitNum) &&
cUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 8: int8 tensor summation c = a + b * \beta */
bool TestSum8()
{
/* a tensor of size (2, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++) {
unitNum *= dimSize[i];
}
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {2.0F, -1.0F, -4.0F, -7.0F},
{-10.0F, -13.0F, -16.0F, -19.0F} };
float beta = 2.0F;
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create int8 tensors */
XTensor int8AGPU;
XTensor int8BGPU;
XTensor int8CGPU;
XTensor int8CMeGPU;
XTensor int8CUserGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* convert data type from float to int8 */
int8AGPU = ConvertDataType(*aGPU, X_INT8);
int8BGPU = ConvertDataType(*bGPU, X_INT8);
int8CGPU = ConvertDataType(*cGPU, X_INT8);
int8CMeGPU = ConvertDataType(*cMeGPU, X_INT8);
/* call Sum function */
_Sum(&int8AGPU, &int8BGPU, &int8CGPU, beta);
_SumMe(&int8CMeGPU, &int8BGPU, beta);
int8CUserGPU = Sum(int8AGPU, int8BGPU, beta);
/* convert data type from int8 to float */
_ConvertDataType(&int8CGPU, cGPU);
_ConvertDataType(&int8CMeGPU, cMeGPU);
cUserGPU = ConvertDataType(int8CUserGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum) &&
cMeGPU->CheckData(answer, unitNum) &&
cUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -243,6 +752,60 @@ bool TestSum()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSum3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSum4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSum5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestSum6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestSum7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* case 8 test */
caseFlag = TestSum8();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 8 failed!\n");
}
else
XPRINT(0, stdout, ">> case 8 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -23,6 +23,7 @@
#define __TEST_SUM_H__
#include "../core/arithmetic/Sum.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -17,12 +17,14 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
*/
#include "TSumDim.h"
#include "../XTensor.h"
#include "../core/arithmetic/SumDim.h"
#include "../core/getandset/SetData.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -128,8 +130,8 @@ bool TestSumDim1()
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
...@@ -242,8 +244,8 @@ bool TestSumDim2()
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
...@@ -351,8 +353,8 @@ bool TestSumDim3()
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
...@@ -460,8 +462,8 @@ bool TestSumDim4()
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
...@@ -471,6 +473,310 @@ bool TestSumDim4()
#endif // USE_CUDA
}
/*
case 5: float16 tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is added to a by broadcasting.
In this case, (2, 4) + (2) = (2, 4), n = 0.
*/
bool TestSumDim5()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{3.0F, 4.0F, 5.0F, 6.0F} };
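/* b = {1, -1} is broadcast along dimension 0: row 0 gains 1 and row 1
loses 1, e.g. c[1][0] = 4.0 + (-1.0) = 3.0 */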
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sum function */
_SumDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 0);
_SumDim(&cMeHalfGPU, &bHalfGPU, 0);
cUserHalfGPU = SumDim(aHalfGPU, bHalfGPU, 0);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: float16 tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is added to a by broadcasting.
In this case, (2, 4) + (2, 2) = (2, 4), n = 1.
*/
bool TestSumDim6()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {1.0F, 0.0F, 1.0F, 4.0F},
{5.0F, 4.0F, 5.0F, 8.0F} };
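/* as in the SubDim tests, the (2, 2) tensor b matches dimension 1 of a
once flattened to {1, -1, -1, 1},
e.g. c[0] = {0+1, 1-1, 2-1, 3+1} = {1, 0, 1, 4} */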
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sum function */
_SumDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_SumDim(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = SumDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 7: float16 tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
*/
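/* The call pattern exercised below, in minimal form (a sketch drawn from the
   calls in this file; the trailing beta argument is left at its default here):
   _SumDim(&a, &b, &c, n);   // c = a + b * \beta, b broadcast along dimension n
   _SumDim(&a, &b, n);       // in-place form: a = a + b * \beta
   c = SumDim(a, b, n);      // form returning a new XTensor */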
bool TestSumDim7()
{
/* a tensor of size (20, 40, 4000) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 20;
aDimSize[1] = 40;
aDimSize[2] = 4000;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (40) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 40;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * answer = NewTensor(aOrder, aDimSize);
/* initialize variables */
_SetDataFixed(answer, 1.0F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* create float16 tensor */
XTensor aHalfGPU;
XTensor bHalfGPU;
XTensor cHalfGPU;
XTensor cMeHalfGPU;
XTensor cUserHalfGPU;
/* Initialize variables */
aGPU->SetZeroAll();
cMeGPU->SetZeroAll();
_SetDataFixed(bGPU, 1.0F);
cGPU->SetZeroAll();    /* zero c before conversion, as in cases 5 and 6 */
/* convert data type from float to float16 */
aHalfGPU = ConvertDataType(*aGPU, X_FLOAT16);
bHalfGPU = ConvertDataType(*bGPU, X_FLOAT16);
cHalfGPU = ConvertDataType(*cGPU, X_FLOAT16);
cMeHalfGPU = ConvertDataType(*cMeGPU, X_FLOAT16);
/* call sum function */
_SumDim(&aHalfGPU, &bHalfGPU, &cHalfGPU, 1);
_SumDim(&cMeHalfGPU, &bHalfGPU, 1);
cUserHalfGPU = SumDim(aHalfGPU, bHalfGPU, 1);
/* convert data type from float16 to float */
_ConvertDataType(&cHalfGPU, cGPU);
_ConvertDataType(&cMeHalfGPU, cMeGPU);
cUserGPU = ConvertDataType(cUserHalfGPU, X_FLOAT);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -518,6 +824,33 @@ bool TestSumDim()
//else
// XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSumDim5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestSumDim6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestSumDim7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */
/*
TODO!!
...
...@@ -21,6 +21,7 @@
#include "../XList.h"
#include "TUnsqueeze.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -153,6 +154,128 @@ bool TestUnsqueeze1()
#endif // USE_CUDA
}
/*
case 2: float16 insert a dimension by copying the blocks x times (where x is the size of the inserted dimension)
In this case,
(2, 3) -> (2, 2, 3), dim=1, dSize=2
(2, 3) -> (2, 3, 2), dim=2, dSize=2
*/
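/* The float16 round trip tested below, in minimal form (a sketch using only
   calls that appear in this test):
   sHalf = ConvertDataType(s, X_FLOAT16);    // float -> float16
   tHalf = Unsqueeze(sHalf, 1, 2);           // (2, 3) -> (2, 2, 3)
   t = ConvertDataType(tHalf, X_FLOAT);      // float16 -> float */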
bool TestUnsqueeze2()
{
/* a source tensor of size (2, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a target tensor of size (2, 2, 3) */
int tOrder1 = 3;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 2;
tDimSize1[1] = 2;
tDimSize1[2] = 3;
int tUnitNum1 = 1;
for (int i = 0; i < tOrder1; i++)
tUnitNum1 *= tDimSize1[i];
/* a target tensor of size (2, 3, 2) */
int tOrder2 = 3;
int * tDimSize2 = new int[tOrder2];
tDimSize2[0] = 2;
tDimSize2[1] = 3;
tDimSize2[2] = 2;
int tUnitNum2 = 1;
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
{3.0F, 4.0F, 5.0F} };
DTYPE answer1[2][2][3] = { { {0.0F, 1.0F, 2.0F},
{0.0F, 1.0F, 2.0F} },
{ {3.0F, 4.0F, 5.0F},
{3.0F, 4.0F, 5.0F} } };
DTYPE answer2[2][3][2] = { { {0.0F, 0.0F},
{1.0F, 1.0F},
{2.0F, 2.0F} },
{ {3.0F, 3.0F},
{4.0F, 4.0F},
{5.0F, 5.0F} } };
/* CPU test */
bool cpuTest = true;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* create float16 tensor */
XTensor sHalfGPU;
XTensor tHalfGPU1;
XTensor tHalfGPU2;
XTensor tUserHalfGPU1;
XTensor tUserHalfGPU2;
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* convert data type from float to float16 */
sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
tHalfGPU1 = ConvertDataType(*tGPU1, X_FLOAT16);
tHalfGPU2 = ConvertDataType(*tGPU2, X_FLOAT16);
/* call unsqueeze function */
_Unsqueeze(&sHalfGPU, &tHalfGPU1, 1, 2);
_Unsqueeze(&sHalfGPU, &tHalfGPU2, 2, 2);
tUserHalfGPU1 = Unsqueeze(sHalfGPU, 1, 2);
tUserHalfGPU2 = Unsqueeze(sHalfGPU, 2, 2);
/* convert data type from float16 to float */
_ConvertDataType(&tHalfGPU1, tGPU1);
_ConvertDataType(&tHalfGPU2, tGPU2);
tUserGPU1 = ConvertDataType(tUserHalfGPU1, X_FLOAT);
tUserGPU2 = ConvertDataType(tUserHalfGPU2, X_FLOAT);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) &&
tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) &&
tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete sGPU;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
...@@ -174,6 +297,16 @@ bool TestUnsqueeze()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestUnsqueeze2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
...
...@@ -29,63 +29,64 @@ bool Test()
bool wrong = false;
XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n"); XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");
wrong = !TestAbsolute() || wrong; //wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong; //wrong = !TestClip() || wrong;
wrong = !TestCompare() || wrong; //wrong = !TestCompare() || wrong;
wrong = !TestConcatenate() || wrong; //wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong; //wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong; //wrong = !TestCos() || wrong;
//wrong = !TestConvertDataType() || wrong;
wrong = !TestCopyIndexed() || wrong; //wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong; //wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong; //wrong = !TestDiv() || wrong;
wrong = !TestDivDim() || wrong; //wrong = !TestDivDim() || wrong;
wrong = !TestExp() || wrong; //wrong = !TestExp() || wrong;
wrong = !TestGather() || wrong;
wrong = !TestLog() || wrong; //wrong = !TestLog() || wrong;
wrong = !TestMatrixMul() || wrong; //wrong = !TestMatrixMul() || wrong;
wrong = !TestMatrixMul2D() || wrong; //wrong = !TestMatrixMul2D() || wrong;
wrong = !TestMatrixMul2DParallel() || wrong; //wrong = !TestMatrixMul2DParallel() || wrong;
wrong = !TestMatrixMulBatched() || wrong; //wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMerge() || wrong; //wrong = !TestMerge() || wrong;
wrong = !TestMultiply() || wrong;
wrong = !TestMultiplyDim() || wrong; //wrong = !TestMultiplyDim() || wrong;
wrong = !TestNegate() || wrong; //wrong = !TestNegate() || wrong;
wrong = !TestNormalize() || wrong; //wrong = !TestNormalize() || wrong;
wrong = !TestPower() || wrong; //wrong = !TestPower() || wrong;
wrong = !TestReduceMax() || wrong; //wrong = !TestReduceMax() || wrong;
wrong = !TestReduceMean() || wrong; //wrong = !TestReduceMean() || wrong;
wrong = !TestReduceSum() || wrong; //wrong = !TestReduceSum() || wrong;
wrong = !TestReduceSumAll() || wrong; //wrong = !TestReduceSumAll() || wrong;
wrong = !TestReduceSumSquared() || wrong; //wrong = !TestReduceSumSquared() || wrong;
wrong = !TestReduceVariance() || wrong; //wrong = !TestReduceVariance() || wrong;
wrong = !TestRound() || wrong; //wrong = !TestRound() || wrong;
wrong = !TestScaleAndShift() || wrong; //wrong = !TestScaleAndShift() || wrong;
wrong = !TestSelect() || wrong; //wrong = !TestSelect() || wrong;
wrong = !TestSetAscendingOrder() || wrong; //wrong = !TestSetAscendingOrder() || wrong;
wrong = !TestSetData() || wrong; //wrong = !TestSetData() || wrong;
wrong = !TestSign() || wrong; //wrong = !TestSign() || wrong;
wrong = !TestSin() || wrong; //wrong = !TestSin() || wrong;
wrong = !TestSort() || wrong; //wrong = !TestSort() || wrong;
wrong = !TestSplit() || wrong; //wrong = !TestSplit() || wrong;
wrong = !TestSpread() || wrong; //wrong = !TestSpread() || wrong;
wrong = !TestSub() || wrong; //wrong = !TestSub() || wrong;
wrong = !TestSum() || wrong; //wrong = !TestSubDim() || wrong;
wrong = !TestSumDim() || wrong; //wrong = !TestSum() || wrong;
wrong = !TestTan() || wrong; //wrong = !TestSumDim() || wrong;
wrong = !TestTranspose() || wrong; //wrong = !TestTan() || wrong;
//wrong = !TestTranspose() || wrong;
//wrong = !TestTopK() || wrong;
wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong; //wrong = !TestXMem() || wrong;
//
wrong = !TestCrossEntropy() || wrong;
wrong = !TestDropout() || wrong; //wrong = !TestDropout() || wrong;
wrong = !TestHardTanH() || wrong; //wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong; //wrong = !TestIdentity() || wrong;
wrong = !TestLogSoftmax() || wrong; //wrong = !TestLogSoftmax() || wrong;
wrong = !TestLoss() || wrong; //wrong = !TestLoss() || wrong;
wrong = !TestRectify() || wrong; //wrong = !TestRectify() || wrong;
wrong = !TestSigmoid() || wrong; //wrong = !TestSigmoid() || wrong;
wrong = !TestSoftmax() || wrong;
/* other test */
...
...@@ -63,6 +63,7 @@
#include "TSplit.h" #include "TSplit.h"
#include "TSpread.h" #include "TSpread.h"
#include "TSub.h" #include "TSub.h"
#include "TSubDim.h"
#include "TSum.h" #include "TSum.h"
#include "TSumDim.h" #include "TSumDim.h"
#include "TTan.h" #include "TTan.h"
......