Commit 1da50ae2 by ltb

Use CPU float16 and measure FNN-LM and Transformer (t2t) training times

parent 29d2352b
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
#include <stdio.h>
#include "XNet.h"
......@@ -28,190 +28,996 @@
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
using namespace transformer;
void BackwardTest();
void TransposeTest();
void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
void BackwardTest();
void TransposeTest();
void SumDimTest();
//void SplitBackwardTest();
void MemTest();
//void xcTest();
void ConvertDataTypeTest();
void ConvertDataTypeBackwardTest();
void SumFP16Test();
void GatherFP16Test();
void HardTanHFP16Test();
void ReduceMaxFP16Test();
void ReduceSumFP16Test();
void LogSoftmaxFP16Test();
void ClipFP16Test();
void ScaleAndShiftFP16Test();
void InitTensorFP16Test();
void MultiplyDimTime();
void TimeTestGemm();
void TimeTest();
void TimeInt8AndFloat32();
void TestCPUhalf();
int main(int argc, const char ** argv)
{
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if (argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if (argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
else {
fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
}
//xcTest();
//return 0;
//MemTest();
//return 0;
//SplitBackwardTest();
//return 0;
//_CrtSetBreakAlloc(896);
//BackwardTest();
//return 0;
//Test();
//return 0;
//ConvertDataTypeTest();
//return 0;
//ConvertDataTypeBackwardTest();
//return 0;
//SumFP16Test();
//return 0;
//GatherFP16Test();
//return 0;
//HardTanHFP16Test();
//return 0;
//ReduceMaxFP16Test();
//return 0;
//ReduceSumFP16Test();
//return 0;
//LogSoftmaxFP16Test();
//return 0;
//ClipFP16Test();
//return 0;
//ScaleAndShiftFP16Test();
//return 0;
//InitTensorFP16Test();
//return 0;
//_CrtDumpMemoryLeaks();
return 0;
}
void TestCPUhalf() {
int memSize = 1024;
int devId = 0;
int dim1 = 1024;
int dim2 = 32;
XMem * mem;
mem = new XMem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);
XTensor a;
XTensor b;
XTensor c;
//XMem *mem = new XMem(devId, FREE_ON_THE_FLY, 128 * MILLION, 1024, 128 * MILLION);
//mem->SetDesiredSize(0,0,memSize*MILLION);
InitTensor2D(&a, dim1, dim1, X_FLOAT, devId);
InitTensor2D(&b, dim2, dim2, X_FLOAT, devId);
InitTensor2D(&c, dim1, dim1, X_FLOAT, devId);
}
void TimeInt8AndFloat32() {
XMem * mem;
int memSize = 1024;
int devId = 2;
int dim = 512;
mem = new XMem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, dim, dim, X_FLOAT, devId, mem);
InitTensor2D(&b, dim, dim, X_FLOAT, devId, mem);
InitTensor2D(&c, dim, dim, X_FLOAT, devId, mem);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
XTensor inta;
XTensor intb;
XTensor intc;
InitTensor2D(&inta, dim, dim, X_INT, devId, mem);
InitTensor2D(&intb, dim, dim, X_INT, devId, mem);
InitTensor2D(&intc, dim, dim, X_FLOAT, devId, mem);
XTensor tmp;
InitTensor2D(&tmp, dim, dim, X_FLOAT, devId, mem);
tmp.SetDataRand(-100000.0F, 100000.0F);
inta = ConvertDataType(tmp, X_INT8);
intb = ConvertDataType(tmp, X_INT8);
int repeat = 10000;
printf("test on matrixmul\n");
double start_matrixmul32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);
}
double elapsed_matrixmul32 = GetClockSec() - start_matrixmul32;
printf("elapsed_matrixmul32=%.2fs\n", elapsed_matrixmul32);
double start_int8 = GetClockSec();
for (int i = 0; i < repeat; i++) {
_MatrixMul(&inta, X_NOTRANS, &intb, X_NOTRANS, &intc);
}
double elapsed_int8 = GetClockSec() - start_int8;
printf("elapsed_int8=%.2fs\n", elapsed_int8);
}
/*
benchmark FP32 vs FP16 for a set of elementwise/reduce/softmax/GEMM ops.
Each section times `repeat` calls of the FP32 kernel, then the FP16
kernel, and prints both elapsed times. Sub/Div/Multiply sections are
kept but commented out.
NOTE(review): `c`/`halfc` are reassigned with differently-shaped results
(e.g. ReduceSum output) between sections — relies on XTensor assignment
reshaping the target; confirm this is intended.
*/
void TimeTest() {
    XMem * mem;
    int memSize = 1024;
    int devId = 0;
    int dim = 512;
    /* 256MB memory pool with free-on-the-fly recycling */
    mem = new XMem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
    mem->SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);

    XTensor a;
    XTensor b;
    XTensor c;
    XTensor halfa;
    XTensor halfb;
    XTensor halfc;
    InitTensor2D(&a, dim, dim, X_FLOAT, devId, mem);
    InitTensor2D(&b, dim, dim, X_FLOAT, devId, mem);
    InitTensor2D(&c, dim, dim, X_FLOAT, devId, mem);
    InitTensor2D(&halfc, dim, dim, X_FLOAT16, devId, mem);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);
    /* half inputs are converted copies of the same random data */
    halfa = ConvertDataType(a, X_FLOAT16);
    halfb = ConvertDataType(b, X_FLOAT16);

    int repeat = 100000;

    /* ---- elementwise sum ---- */
    printf("=========================================\n");
    printf("test on sum\n");
    double start_sum32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        c = Sum(&a, &b);
    }
    double elapsed_sum32 = GetClockSec() - start_sum32;
    printf("elapsed_sum32=%.2fs\n", elapsed_sum32);
    double start_sum16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = Sum(&halfa, &halfb);
    }
    double elapsed_sum16 = GetClockSec() - start_sum16;
    printf("elapsed_sum16=%.2fs\n", elapsed_sum16);
    printf("=========================================\n");
    /*printf("test on sub\n");
    double start_sub32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
    c = Sub(&a, &b);
    }
    double elapsed_sub32 = GetClockSec() - start_sub32;
    printf("elapsed_sub32=%.2fs\n", elapsed_sub32);
    double start_sub16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
    halfc = Sub(&halfa, &halfb);
    }
    double elapsed_sub16 = GetClockSec() - start_sub16;
    printf("elapsed_sub16=%.2fs\n", elapsed_sub16);
    printf("=========================================\n");*/
    /*printf("test on div\n");
    double start_div32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
    c = Div(&a, &b);
    }
    double elapsed_div32 = GetClockSec() - start_div32;
    printf("elapsed_div32=%.2fs\n", elapsed_div32);
    double start_div16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
    halfc = Div(&halfa, &halfb);
    }
    double elapsed_div16 = GetClockSec() - start_div16;
    printf("elapsed_div16=%.2fs\n", elapsed_div16);
    printf("=========================================\n");*/
    /*printf("test on multiply\n");
    double start_multiply32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
    c = Multiply(&a, &b);
    }
    double elapsed_multiply32 = GetClockSec() - start_multiply32;
    printf("elapsed_multiply32=%.2fs\n", elapsed_multiply32);
    double start_multiply16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
    halfc = Multiply(&halfa, &halfb);
    }
    double elapsed_multiply16 = GetClockSec() - start_multiply16;
    printf("elapsed_multiply16=%.2fs\n", elapsed_multiply16);
    printf("=========================================\n");*/

    /* ---- scale-and-shift (identity: scale=1, shift=0) ---- */
    printf("test on scaleandshift\n");
    double start_scaleandshift32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        c = ScaleAndShift(&a, 1, 0);
    }
    double elapsed_scaleandshift32 = GetClockSec() - start_scaleandshift32;
    printf("elapsed_scaleandshift32=%.2fs\n", elapsed_scaleandshift32);
    double start_scaleandshift16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = ScaleAndShift(&halfa, 1, 0);
    }
    double elapsed_scaleandshift16 = GetClockSec() - start_scaleandshift16;
    printf("elapsed_scaleandshift16=%.2fs\n", elapsed_scaleandshift16);
    printf("=========================================\n");

    /* ---- reduce-sum along dim 1 ---- */
    printf("test on reducesum\n");
    double start_reducesum32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        c = ReduceSum(&a, 1);
    }
    double elapsed_reducesum32 = GetClockSec() - start_reducesum32;
    printf("elapsed_reducesum32=%.2fs\n", elapsed_reducesum32);
    double start_reducesum16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = ReduceSum(&halfa, 1);
    }
    double elapsed_reducesum16 = GetClockSec() - start_reducesum16;
    printf("elapsed_reducesum16=%.2fs\n", elapsed_reducesum16);
    printf("=========================================\n");

    /* ---- reduce-max along dim 1 ---- */
    printf("test on reducemax\n");
    double start_reducemax32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        c = ReduceMax(&a, 1);
    }
    double elapsed_reducemax32 = GetClockSec() - start_reducemax32;
    printf("elapsed_reducemax32=%.2fs\n", elapsed_reducemax32);
    double start_reducemax16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = ReduceMax(&halfa, 1);
    }
    double elapsed_reducemax16 = GetClockSec() - start_reducemax16;
    printf("elapsed_reducemax16=%.2fs\n", elapsed_reducemax16);
    printf("=========================================\n");

    /* ---- log-softmax along dim 1 ---- */
    printf("test on logsoftmax\n");
    double start_logsoftmax32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        c = LogSoftmax(&a, 1);
    }
    double elapsed_logsoftmax32 = GetClockSec() - start_logsoftmax32;
    printf("elapsed_logsoftmax32=%.2fs\n", elapsed_logsoftmax32);
    double start_logsoftmax16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = LogSoftmax(&halfa, 1);
    }
    double elapsed_logsoftmax16 = GetClockSec() - start_logsoftmax16;
    printf("elapsed_logsoftmax16=%.2fs\n", elapsed_logsoftmax16);
    printf("=========================================\n");

    /* ---- dense GEMM ---- */
    printf("test on matrixmul\n");
    double start_matrixmul32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        c = MatrixMul(&a, &b);
    }
    double elapsed_matrixmul32 = GetClockSec() - start_matrixmul32;
    printf("elapsed_matrixmul32=%.2fs\n", elapsed_matrixmul32);
    double start_matrixmul16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = MatrixMul(&halfa, &halfb);
    }
    double elapsed_matrixmul16 = GetClockSec() - start_matrixmul16;
    printf("elapsed_matrixmul16=%.2fs\n", elapsed_matrixmul16);
    printf("=========================================\n");

    /* ---- dtype conversion cost in both directions ---- */
    printf("test on convert\n");
    double start_convert32to16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfa = ConvertDataType(a, X_FLOAT16);
    }
    double elapsed_convert32to16 = GetClockSec() - start_convert32to16;
    printf("elapsed_convert32to16=%.2fs\n", elapsed_convert32to16);
    double start_convert16to32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        a = ConvertDataType(halfa, X_FLOAT);
    }
    double elapsed_convert16to32 = GetClockSec() - start_convert16to32;
    printf("elapsed_convert16to32=%.2fs\n", elapsed_convert16to32);
    printf("=========================================\n");

    delete mem;
}
void MultiplyDimTime() {
int memSize = 1024;
int devId = 0;
int dim1 = 1024;
int dim2 = 32;
XMem * mem;
mem = new XMem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);
XTensor a;
XTensor b;
XTensor c;
//XMem *mem = new XMem(devId, FREE_ON_THE_FLY, 128 * MILLION, 1024, 128 * MILLION);
//mem->SetDesiredSize(0,0,memSize*MILLION);
InitTensor2D(&a, dim1, dim1, X_FLOAT, devId);
InitTensor2D(&b, dim2, dim2, X_FLOAT, devId);
InitTensor2D(&c, dim1, dim1, X_FLOAT, devId);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRandn(-1.0F, 1.0F);
int repeat = 2000;
printf("test on MultiplyDim\n");
double start = GetClockSec();
for (int j = 0; j <= repeat; j++) {
c = MultiplyDim(&a, &b, 0);
}
double elapsed = GetClockSec() - start;
printf("elapsed_MultiplyDim32=%.4fs \n", elapsed);
XTensor halfa;
XTensor halfb;
XTensor halfc;
InitTensor2D(&halfc, dim1, dim1, X_FLOAT16, devId, mem);
halfa = ConvertDataType(a, X_FLOAT16);
halfb = ConvertDataType(b, X_FLOAT16);
double starthalf = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = MultiplyDim(&halfa, &halfb, 0);
}
double elapsedhalf = GetClockSec() - starthalf;
printf("elapsed_MultiplyDim16=%.4fs\n", elapsedhalf);
}
/*
benchmark batched GEMM (BMMul, A * B^T) in FP16 vs FP32 and dump both
results for a quick numeric comparison.
fix: the original called `delete mem;` on an UNINITIALIZED pointer
before the first assignment — undefined behavior; removed.
*/
void TimeTestGemm() {
    XMem * mem;
    int memSize = 1024;
    mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
    mem->SetDesiredSize(0, 0, (MTYPE)memSize * MILLION);

    XTensor a;
    XTensor b;
    XTensor c;
    XTensor halfa;
    XTensor halfb;
    XTensor halfc;
    int dim1 = 512;
    int dim2 = 1024;
    //InitTensor3D(&a, 86, 48, 256, X_FLOAT, 0, mem);
    //InitTensor2D(&b, 256, 256, X_FLOAT, 0, mem);
    //InitTensor4D(&a, 8, 86, 48, 48, X_FLOAT, 0, mem);
    //InitTensor4D(&b, 8, 86, 48, 32, X_FLOAT, 0, mem);
    InitTensor2D(&a, dim1, dim2, X_FLOAT, 0, mem);
    InitTensor2D(&b, dim1, dim2, X_FLOAT, 0, mem);
    //InitTensor4D(&a, 8, 86, 48, 32, X_FLOAT, 0);
    //InitTensor4D(&b, 8, 86, 48, 32, X_FLOAT, 0);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);
    halfa = ConvertDataType(a, X_FLOAT16);
    halfb = ConvertDataType(b, X_FLOAT16);
    //a.Dump(&a, stderr, "a:", 10);
    //b.Dump(&b, stderr, "b:", 10);
    //halfa.Dump(&a, stderr, "halfa:", 10);
    //halfb.Dump(&b, stderr, "halfb:", 10);

    int repeat = 10000;
    printf("=========================================\n");

    double start_matrixmul16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = BMMul(halfa, X_NOTRANS, halfb, X_TRANS);
    }
    double elapsed_matrixmul16 = GetClockSec() - start_matrixmul16;
    printf("elapsed_matrixmul16=%.4fs\n", elapsed_matrixmul16);
    printf("------------------------------------------\n");

    double start_matrixmul32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        c = BMMul(a, X_NOTRANS, b, X_TRANS);
    }
    double elapsed_matrixmul32 = GetClockSec() - start_matrixmul32;
    printf("elapsed_matrixmul32=%.4fs\n", elapsed_matrixmul32);
    printf("=========================================\n");

    /* print the first few values of both results for eyeball comparison */
    c.Dump(&c, stderr, "c:", 10);
    halfc.Dump(&halfc, stderr, "halfc:", 10);
}
void InitTensorFP16Test() {
XTensor a;
InitTensor2D(&a, 1, 10, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
XTensor halfA;
halfA = ConvertDataType(a, X_FLOAT16);
halfA.Dump(&halfA, stderr, "halfA:");
XTensor b;
InitTensor2D(&b, 1, 10, X_FLOAT16, 0);
_SetDataRand(&b, -10.0F, 10.0F);
b.Dump(&b, stderr, "b:");
}
/*
check ScaleAndShift on an integer tensor: float -> int, scale by 2,
shift 0, convert back to float, dump before/after.
fix: a stray `int main( int argc, const char ** argv )` line (diff
residue) sat in the middle of the body and broke compilation; removed.
NOTE(review): despite the name this exercises X_INT, not FP16 — confirm
whether an X_FLOAT16 path was intended.
*/
void ScaleAndShiftFP16Test() {
    XTensor a;
    XTensor intA;
    XTensor b;
    XTensor intB;
    InitTensor2D(&a, 1, 10, X_FLOAT, 0);
    a.SetDataRand(-10.0F, 10.0F);
    a.Dump(stderr, "a:");
    intA = ConvertDataType(a, X_INT);
    intB = ScaleAndShift(intA, 2, 0);
    b = ConvertDataType(intB, X_FLOAT);
    b.Dump(stderr, "b:");
}
void ClipFP16Test() {
XTensor a;
XTensor intA;
XTensor b;
XTensor intB;
InitTensor2D(&a, 1, 10, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
a.Dump(stderr, "a:");
intA = ConvertDataType(a, X_INT);
intB = Clip(intA, -1, 1);
b = ConvertDataType(intB, X_FLOAT);
b.Dump(stderr, "b:");
}
/*
compare LogSoftmax over dim 1 on the same random data in FP32 and FP16
and dump both results.
fix: the dump labels said "sum:"/"halfSum:" (copied from the ReduceSum
test); they now name the operation actually performed.
*/
void LogSoftmaxFP16Test() {
    XTensor a;
    XTensor halfA;
    XTensor b;
    XTensor halfB;
    InitTensor3D(&a, 2, 2, 2, X_FLOAT, 0);
    a.SetDataRand(-1.0F, 1.0F);
    halfA = ConvertDataType(a, X_FLOAT16);
    b = LogSoftmax(a, 1);
    halfB = LogSoftmax(halfA, 1);
    b.Dump(stderr, "logsoftmax:");
    halfB.Dump(&halfB, stderr, "halfLogsoftmax:");
}
/*
compare ReduceSum over dim 1 on the same random data in FP32 and FP16
and dump both results.
fix: the body contained an interleaved copy of the OLD main() (argc/
argv dispatch, usage text, `return 0`) — diff residue that referenced
undeclared argc/argv and broke compilation; removed.
*/
void ReduceSumFP16Test()
{
    XTensor a;
    XTensor sum;
    XTensor halfA;
    XTensor halfSum;
    InitTensor2D(&a, 10, 10, X_FLOAT, 0);
    a.SetDataRand(-5.0F, 5.0F);
    halfA = ConvertDataType(a, X_FLOAT16);
    sum = ReduceSum(a, 1);
    halfSum = ReduceSum(halfA, 1);
    sum.Dump(stderr, "sum:");
    halfSum.Dump(&halfSum, stderr, "halfSum:");
}
void ReduceMaxFP16Test()
{
XTensor a;
XTensor max;
XTensor halfA;
XTensor halfMax;
InitTensor2D(&a, 10, 10, X_FLOAT, 0);
a.SetDataRand(-5.0F, 5.0F);
halfA = ConvertDataType(a, X_FLOAT16);
max = ReduceMax(a, 1);
halfMax = ReduceMax(halfA, 1);
max.Dump(stderr, "max:");
halfMax.Dump(&halfMax, stderr, "halfMax:");
}
void HardTanHFP16Test()
{
XTensor a;
XTensor b;
XTensor halfA;
XTensor halfB;
InitTensor2D(&a, 5, 5, X_FLOAT, 0);
InitTensor2D(&b, 5, 5, X_FLOAT, 0);
a.SetDataRand(-1.0F, 4.0F);
b.SetDataRand(-1.0F, 4.0F);
halfA = ConvertDataType(a, X_FLOAT16);
halfB = ConvertDataType(b, X_FLOAT16);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
b = HardTanH(a);
halfB = HardTanH(halfA);
b.Dump(stderr, "b:");
halfB.Dump(&halfB, stderr, "halfB:");
}
void GatherFP16Test() {
XTensor a;
XTensor b;
XTensor srcIndex;
XTensor halfA;
XTensor halfB;
XTensor c;
InitTensor1D(&srcIndex, 2, X_INT, 0);
int m = 0;
int n = 1;
srcIndex.Set1DInt(m, 0);
srcIndex.Set1DInt(n, 1);
InitTensor2D(&a, 3, 2, X_FLOAT, 0);
InitTensor2D(&b, 2, 2, X_FLOAT, 0);
InitTensor2D(&halfB, 2, 2, X_FLOAT16, 0);
a.SetDataRand(-5.0F, 5.0F);
halfA = ConvertDataType(a, X_FLOAT16);
a.Dump(stderr, "a:");
_Gather(&a, &b, &srcIndex);
b.Dump(stderr, "b:");
_Gather(&halfA, &halfB, &srcIndex);
c = ConvertDataType(halfB, X_FLOAT);
c.Dump(stderr, "c:");
}
void SumFP16Test()
{
XTensor a;
XTensor b;
XTensor halfA;
XTensor halfB;
InitTensor2D(&a, 5, 5, X_FLOAT, 0);
InitTensor2D(&b, 5, 5, X_FLOAT, 0);
a.SetDataRand(-1.0F, 4.0F);
b.SetDataRand(-1.0F, 4.0F);
halfA = ConvertDataType(a, X_FLOAT16);
halfB = ConvertDataType(b, X_FLOAT16);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
b = Sum(a, b, -0.4F);
halfB = Sum(halfA, halfB, -0.4F);
b.Dump(stderr, "b:");
halfB.Dump(&halfB, stderr, "halfB:");
}
void ConvertDataTypeTest()
{
int rnum = 0;
for (int i = 0; i <= rnum; i++)
{
XTensor a;
InitTensor2D(&a, 2, 2, X_FLOAT, 0);
XTensor halfa;
InitTensor2D(&halfa, 2, 2, X_FLOAT16, 0);
XTensor a1;
InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
a.Dump(stderr, "a:");
halfa = ConvertDataType(a, X_FLOAT16);
a1 = ConvertDataType(halfa, X_FLOAT);
a1.Dump(stderr, "halfa:");
}
}
/*
check that gradients flow backward through a FP32 -> FP16 -> FP32
conversion chain: seed a1.grad with 3.0 and run autodiff, then dump
the gradients at each stage.
NOTE(review): halfA.grad / a.grad are dereferenced without a null check
— assumes Backward() allocates grads for every node in the chain;
confirm, otherwise this crashes.
*/
void ConvertDataTypeBackwardTest()
{
    int rnum = 0;
    for (int i = 0; i <= rnum; i++)
    {
        XTensor a;
        InitTensor2D(&a, 2, 2, X_FLOAT, 0);
        /* SetDataRand with equal bounds fills the tensor with 2.0 */
        a.SetDataRand(2.0F, 2.0F);
        a.Dump(stderr, "a:");
        XTensor halfA;
        XTensor a1;
        halfA = ConvertDataType(a, X_FLOAT16);
        a1 = ConvertDataType(halfA, X_FLOAT);
        /* seed the output gradient with a constant 3.0 */
        a1.grad = NewTensor(&a1);
        a1.grad->SetDataRand(3.0F, 3.0F);
        a1.grad->Dump(stderr, "a1.grad:");
        XNet testBackward;
        /* "1"/"2" are progress markers around the backward pass */
        printf("1");
        testBackward.Backward(a1);
        printf("2");
        halfA.grad->Dump(stderr, "halfA.grad:");
        a.grad->Dump(stderr, "a.grad:");
    }
}
//XTensor * stack(XList& list, int leadingDim)
//{
// size_t size = list.count;
// if (list.count == 0)
// return NULL;
// XTensor * sample = (XTensor*)list.Get(0);
//
// XTensor merge_tensor;
// int order = sample->order;
// int * dim = new int[order];
// for (int i = 0; i < order; i++)
// dim[i] = sample->GetDim(i);
// dim[leadingDim] *= size;
//
// InitTensor(&merge_tensor, order, dim, DEFAULT_DTYPE, sample->denseRatio, sample->devID, sample->mem);
//
// _Merge(&list, &merge_tensor, leadingDim);
// delete[] dim;
//
// order += 1;
// dim = new int[order];
// dim[0] = size;
// for (size_t i = 1; i < order; i++) {
// if (i != leadingDim)
// dim[i] = sample->GetDim(i - 1);
// else
// dim[i] = sample->GetDim(i - 1) / size;
// }
//
// XTensor * split_tensor = new XTensor(order, dim, DEFAULT_DTYPE, sample->denseRatio, sample->devID, sample->mem);
// _Split(&merge_tensor, split_tensor, leadingDim, size);
// delete[] dim;
//
// return split_tensor;
//}
//void xcTest()
//{
// int * dimSize = new int[2];
// dimSize[0] = 2;
// dimSize[1] = 4;
//
// XTensor t1;
// InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
// XTensor t2;
// InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
// XTensor tensor;
//
// _SetDataFixed(&t1, 1.0F);
// _SetDataFixed(&t2, 2.0F);
//
// tensor = t1 + t2;
//
// XList smalls;
//
// XTensor first;
// XTensor second;
// InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
// InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
// smalls.Add(&t1);
// smalls.Add(&t2);
//
// XTensor* result = stack(smalls, 0);
// result->Dump(stderr, "", 100);
//}
/*
smoke test for autodiff: divide a (2x3) by b (2) broadcast along dim 0,
take cross entropy against a, run backward, and dump a's gradient.
fix: the page dump had the OLD and NEW versions of this function merged
line-by-line (duplicate declarations of net/a/b/c, two DivDim calls,
two Backward calls) — reconstructed as the newer variant that uses
enableGrad + CrossEntropy. Confirm against the repository history.
*/
void BackwardTest()
{
    XNet net;

    XTensor a;
    XTensor b;
    XTensor c;
    a.enableGrad = true;
    b.enableGrad = false;
    c.enableGrad = false;
    XTensor mean;
    XTensor origin;
    InitTensor2D(&a, 2, 3);
    InitTensor1D(&b, 2);

    a.SetZeroAll();
    b.SetZeroAll();
    a.Set2D(1.0F, 0, 0);
    a.Set2D(2.0F, 0, 1);
    a.Set2D(3.0F, 0, 2);
    a.Set2D(4.0F, 1, 0);
    a.Set2D(5.0F, 1, 1);
    a.Set2D(6.0F, 1, 2);

    b.Set1D(2.0F, 0);
    b.Set1D(1.0F, 1);

    DivDim(a, b, c, 0);
    c.Dump(stderr, "c:");

    auto loss = CrossEntropy(c, a);
    //XLink::ShowNetwork(stderr, &c);

    net.Backward(loss);

    a.grad->Dump(stderr);
}
/*
CUDA-only timing of _Split / _Merge / _Sum on a (B, N, H) tensor, timed
both with host clocks (GetClock) and CUDA events; prints both timings.
fix: the page dump contained the ENTIRE body twice inside one
#ifdef USE_CUDA (duplicate declarations of x/y/z, the events, etc.),
which cannot compile — deduplicated to a single pass.
*/
void TransposeTest()
{
#ifdef USE_CUDA
    XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
    //XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
    XTensor x;
    XTensor y;
    XTensor z;

    int loops = 2000;

    int B = 3 * 2 * 4;
    int K = 8 * 1;
    int N = 50;
    int H = 512 * 4;

    int nnn = GDevs.nGPU;

    InitTensor3D(&x, B, N, H, X_FLOAT, 0);
    InitTensor4D(&y, K, B, N, H / K, X_FLOAT, 0);
    InitTensor3D(&z, B, N, H, X_FLOAT, 0);

    cudaEvent_t ctime0;
    cudaEvent_t ctime1;
    cudaEvent_t ctime2;
    cudaEvent_t ctime3;
    cudaEvent_t ctime4;
    cudaEvent_t ctime5;

    float elapsedSplit = 0.0;
    float elapsedMerge = 0.0;
    float elapsedSum = 0.0;

    cudaEventCreate(&ctime0);
    cudaEventCreate(&ctime1);
    cudaEventCreate(&ctime2);
    cudaEventCreate(&ctime3);
    cudaEventCreate(&ctime4);
    cudaEventCreate(&ctime5);

    /* time _Split: (B, N, H) -> (K, B, N, H/K) */
    cudaEventRecord(ctime0, 0);
    double time0 = GetClock();
    for (int i = 0; i < loops; i++)
        _Split(&x, &y, 2, K);
    double time1 = GetClock();
    cudaEventRecord(ctime1, 0);
    cudaEventSynchronize(ctime1);
    cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);

    /* time _Merge: inverse of the split above */
    cudaEventRecord(ctime2, 0);
    double time2 = GetClock();
    for (int i = 0; i < loops; i++)
        _Merge(&y, &x, 3);
    double time3 = GetClock();
    cudaEventRecord(ctime3, 0);
    cudaEventSynchronize(ctime3);
    cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);

    /* time _Sum: x = x + z */
    cudaEventRecord(ctime4, 0);
    double time4 = GetClock();
    for (int i = 0; i < loops; i++)
        _Sum(&x, &z, &x);
    double time5 = GetClock();
    cudaEventRecord(ctime5, 0);
    cudaEventSynchronize(ctime5);
    cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);

    fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
    fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);
#endif
}
/*
check _SumDim: add a 1-D tensor y (length c) to every slice of a 3-D
tensor x (a x b x c) along dim 2 and dump the result.
fix: the page dump had every statement duplicated (old/new diff lines
merged), producing duplicate declarations — deduplicated to one pass.
*/
void SumDimTest()
{
    XTensor x;
    XTensor y;
    XTensor z;

    int a = 5;
    int b = 7;
    int c = 3;

    InitTensor3D(&x, a, b, c, X_FLOAT, -1);
    InitTensor1D(&y, c, X_FLOAT, -1);
    InitTensor3D(&z, a, b, c, X_FLOAT, -1);

    x.SetZeroAll();
    y.SetZeroAll();
    z.SetZeroAll();

    /* x = [0, 1, 2, ...], y = [0, -1, -2, ...] (first y.unitNum slots reused) */
    DTYPE * data = new DTYPE[x.unitNum];
    for (int i = 0; i < x.unitNum; i++)
        data[i] = (DTYPE)i;
    x.SetData(data, x.unitNum);

    for (int i = 0; i < y.unitNum; i++)
        data[i] = -(DTYPE)i;
    y.SetData(data, y.unitNum);

    _SumDim(&x, &y, &z, 2);

    z.Dump(stderr, "z:");

    delete[] data;
}
//void SplitBackwardTest()
//{
// int * dimSize = new int[2];
// dimSize[0] = 2;
// dimSize[1] = 4;
//
// XTensor t1;
// InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
// XTensor t2;
// InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
// XTensor tensor;
//
// //_SetDataFixedFloat(&t1, 1.0F);
// //_SetDataFixedFloat(&t2, 2.0F);
// t1.SetDataRand();
// t2.SetDataRand();
//
// tensor = t1 + t2;
//
// XList smalls;
//
// XTensor first;
// XTensor second;
// InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
// InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
// smalls.Add(&first);
// smalls.Add(&second);
//
// Split(tensor, smalls, 1, 2);
//
// XTensor mul;
// mul = Sum(first, second);
//
// XNet net;
// net.Backward(mul);
// net.Dump(stderr);
//
// printf("Done!");
//}
/*
probe XMem / XTensor lifetime coupling: build a tensor inside a pool,
delete the pool, then inspect what the tensor still reports.
NOTE(review): tensor.mem is read AFTER `delete mem` — if XMem's
destructor does not null out back-references, `tensor.mem->signature`
dereferences a dangling pointer (undefined behavior). Confirm the
intended contract before relying on this output.
*/
void MemTest()
{
    XMem * mem;
    mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
    XTensor tensor;
    InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
    tensor.SetZeroAll();
    tensor.Dump(stderr);
    delete mem;
    /* does the tensor still think it belongs to a pool? */
    if (tensor.mem != NULL) {
        printf("It isn't null!\n");
        printf("%d\n", (int)tensor.mem->signature);
    }
    else {
        printf("It's null\n");
    }
    /* second dump after the pool is gone */
    tensor.Dump(stderr);
}
\ No newline at end of file
......@@ -415,7 +415,19 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
XNet autoDiffer;
double startT = GetClockSec();
double mkinput = 0.0;
double mkgold = 0.0;
double train_time = 0.0;
double clearModel = 0.0;
double forward=0.0;
double backward = 0.0;
double update = 0.0;
double end = 0.0;
double start = 0.0;
double time;
/* iterate for a number of epochs */
for(epoch = 0; epoch < nEpoch; epoch++){
......@@ -426,7 +438,6 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
wordCount = 0;
loss = 0;
ngramNum = 1;
while(ngramNum > 0){
/* load a minibatch of ngrams */
......@@ -447,20 +458,25 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* the loss tensor */
XTensor lossTensor;
start = GetClockSec();
/* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID);
mkinput += GetClockSec() - start;
start = GetClockSec();
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID);
mkgold += GetClockSec() - start;
time = GetClockSec();
if(!autoDiff){
/* prepare an empty network for building the fnn */
FNNNet net;
/* gradident = 0 */
Clear(grad, false);
/* forward computation */
Forward(inputs, output, model, net);
......@@ -475,40 +491,60 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
loss -= prob;
}
else{
start = GetClockSec();
/* gradient = 0 */
Clear(model, true);
clearModel += GetClockSec() - start;
start = GetClockSec();
/* forward + backward process */
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
forward += GetClockSec() - start;
start = GetClockSec();
/* this is implemented by multiply function */
lossTensor = CrossEntropy(output, gold);
/* automatic differentiation */
autoDiffer.Backward(lossTensor);
backward += GetClockSec() - start;
start = GetClockSec();
/* update model parameters */
Update(model, grad, learningRate, true);
update += GetClockSec() - start;
start = GetClockSec();
/* get probabilities */
float prob = ReduceSumAll(lossTensor);
loss += prob;
end += GetClockSec() - start;
}
train_time += GetClockSec() - time;
wordCount += ngramNum;
wordCountTotal += ngramNum;
if(++step >= nStep){
isEnd = true;
break;
}
if (step % 100 == 0) {
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
startT = GetClockSec();
XPRINT8(0, stderr, "[Time] mkinput=%.5lfs,mkgold=%.5lfs,train_time=%.5lfs,clearModel=%.5lfs,forward=%.5lfs, backward=%.5lf, update=%.5lf, end=%.5lf\n",
mkinput, mkgold, train_time, clearModel, forward, backward, update,end);
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
mkinput = 0.0;
mkgold = 0.0;
train_time = 0.0;
clearModel = 0.0;
forward = 0.0;
backward = 0.0;
update = 0.0;
end = 0.0;
}
}
......
......@@ -148,6 +148,14 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
double startT = GetClockSec();
double mkinput = 0.0;
double train_time = 0.0;
double forward = 0.0;
double backward = 0.0;
double update = 0.0;
double start = 0.0;
double time = 0.0;
for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
if(isShuffled)
......@@ -176,18 +184,31 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
{
//while (batchLoader.LoadBatch(file, model->isLM,
// &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
// NULL, vSize, vSizeTgt,
// sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
while (true)
{
start = GetClockSec();
int batch = batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true);
mkinput += GetClockSec() - start;
if (!batch) {
break;
}
time = GetClockSec();
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
start = GetClockSec();
/* make the network */
if(model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, true);
......@@ -196,11 +217,12 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
else{
ShowNTErrors("Illegal model type!");
}
forward += GetClockSec() - start;
/* back-propagation for obtaining gradients */
//if (labelSmoothingP > 0)
// LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
start = GetClockSec();
XTensor labelOnehot;
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
......@@ -229,7 +251,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
net.Backward(lossTensor);
//net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
//net.Backward(output, label, labelSmoothingP, CROSSENTROPY);
backward += GetClockSec() - start;
start = GetClockSec();
gradStep += 1;
loss += prob;
wordCount += wc;
......@@ -248,11 +272,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
gradStep = 0;
validStep++;
update += GetClockSec() - start;
}
}
else
nSkipped++;
train_time += GetClockSec() - time;
if(++step >= nstep){
isEnd = true;
break;
......@@ -260,11 +286,19 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
startT = GetClockSec();
XPRINT6(0, stderr, "[Time] elapsed=%.5lfs,mkinput=%.5lfs,train_time=%.5lfs,forward=%.5lfs, backward=%.5lf, update=%.5lf\n",
elapsed, mkinput,train_time, forward, backward, update);
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
mkinput = 0.0;
train_time = 0.0;
forward = 0.0;
backward = 0.0;
update = 0.0;
}
if(nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint){
......
......@@ -25,6 +25,7 @@
* $Update by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2017-11-18 bug fixes
*
*/
#include "halfLib/half/half.hpp"
#include <stdio.h>
#include <stdlib.h>
......@@ -50,6 +51,11 @@
#include "function/Identity.h"
#include "core/CHeader.h"
//#include "halfLib/HalfFloat/umHalf.h"
#ifdef USE_CUDA
// the CUDA stuff
......@@ -376,6 +382,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
XMemCopy(data, devID, tensor.data, tensor.devID, size);
if(dataHost != NULL && tensor.dataHost != NULL)
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
}
else{
DestroyData();
......@@ -1854,6 +1861,16 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
}
else if (dataType==X_FLOAT16) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) {
halfCPU f = ((halfCPU*)d)[i];
if (i == beg)
fprintf(file, "%hx", f);
else
fprintf(file, " %hx", f);
}
}
else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
......@@ -1900,9 +1917,22 @@ dump data to a file
*/
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, beg, verbose);
if (tensor->dataType == X_FLOAT)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else if (tensor->dataType == X_FLOAT16)
{
XTensor a(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
_ConvertDataType(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else
{
ShowNTErrors("TO DO!");
}
}
/*
......@@ -1980,6 +2010,14 @@ void XTensor::Read(FILE * file, const char * label)
}
}
}
else if (dataType==X_FLOAT16){
for (int i = 0; i < unitNum; i++) {
halfCPU * f = ((halfCPU*)data) + i;
if (fscanf(file, "%hx", f) < 1) {
ShowNTErrors("Incorrect tensor format!");
}
}
}
else {
ShowNTErrors("TODO!");
}
......@@ -2006,15 +2044,13 @@ void XTensor::Read(FILE * file, const char * label)
}
}
do {
c = fgetc(file);
} while (c != '\n' && c != EOF);
XMemCopy(dataBackup, devID, data, -1, GetDataSizeInChar());
data = dataBackup;
delete[](char*)dataBuf;
delete[](char *)dataBuf;
}
/*
......
......@@ -97,7 +97,7 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
/* copy the data from GPU memory to CPU memory */
void CudaGPUToCPUFlush(XTensor * tensor)
{
CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
//CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
if (tensor->dataHost != NULL)
delete[](char*)tensor->dataHost;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论