Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
T
Tensor.LowPrecision
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
linye
Tensor.LowPrecision
Commits
1da50ae2
Commit
1da50ae2
authored
Aug 05, 2019
by
ltb
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
using cpu float16 and test fnn and t2t times
parent
29d2352b
隐藏空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
1089 行增加
和
176 行删除
+1089
-176
source/network/Main.cpp
+958
-151
source/sample/fnnlm/FNNLM.cpp
+45
-9
source/sample/transformer/T2TTrainer.cpp
+43
-9
source/tensor/XTensor.cpp
+42
-6
source/tensor/core/utilities/FlushToMem.cu
+1
-1
没有找到文件。
source/network/Main.cpp
查看文件 @
1da50ae2
...
...
@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
#include <stdio.h>
#include "XNet.h"
...
...
@@ -28,190 +28,996 @@
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using
namespace
nts
;
using
namespace
fnnlm
;
using
namespace
transformer
;
void
BackwardTest
();
void
TransposeTest
();
void
SumDimTest
();
using
namespace
nts
;
using
namespace
fnnlm
;
using
namespace
transformer
;
void
BackwardTest
();
void
TransposeTest
();
void
SumDimTest
();
//void SplitBackwardTest();
void
MemTest
();
//void xcTest();
void
ConvertDataTypeTest
();
void
ConvertDataTypeBackwardTest
();
void
SumFP16Test
();
void
GatherFP16Test
();
void
HardTanHFP16Test
();
void
ReduceMaxFP16Test
();
void
ReduceSumFP16Test
();
void
LogSoftmaxFP16Test
();
void
ClipFP16Test
();
void
ScaleAndShiftFP16Test
();
void
InitTensorFP16Test
();
void
MultiplyDimTime
();
void
TimeTestGemm
();
void
TimeTest
();
void
TimeInt8AndFloat32
();
void
TestCPUhalf
();
/*
Entry point of the program.

The first command-line argument selects what to run:
  "-test"  : call Test()
  "-fnnlm" : call FNNLMMain() with the remaining arguments
  "-t2t"   : call TransformerMain() with the remaining arguments
Anything else (or no argument) prints a short usage message to stderr.

argc - number of command-line arguments
argv - the command-line arguments
Returns 0 in every case.
*/
int main(int argc, const char ** argv)
{
    if (argc > 1 && !strcmp(argv[1], "-test"))
        Test();
    else if (argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if (argc > 1 && !strcmp(argv[1], "-t2t"))
        TransformerMain(argc - 1, argv + 1);
    else {
        fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
        /* "-t2t" is handled above, so it is listed in the usage text as well */
        fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
    }

    /* ad-hoc experiments: enable one call (and its return) to run it */

    //xcTest();
    //return 0;
    //MemTest();
    //return 0;
    //SplitBackwardTest();
    //return 0;
    //_CrtSetBreakAlloc(896);
    //BackwardTest();
    //return 0;
    //Test();
    //return 0;
    //ConvertDataTypeTest();
    //return 0;
    //ConvertDataTypeBackwardTest();
    //return 0;
    //SumFP16Test();
    //return 0;
    //GatherFP16Test();
    //return 0;
    //HardTanHFP16Test();
    //return 0;
    //ReduceMaxFP16Test();
    //return 0;
    //ReduceSumFP16Test();
    //return 0;
    //LogSoftmaxFP16Test();
    //return 0;
    //ClipFP16Test();
    //return 0;
    //ScaleAndShiftFP16Test();
    //return 0;
    //InitTensorFP16Test();
    //return 0;

    //_CrtDumpMemoryLeaks();

    return 0;
}
/*
Create a memory pool and three float32 matrices
(two of size dim1 x dim1, one of size dim2 x dim2).
Only the set-up is done here; no operation is run on the tensors.
*/
void TestCPUhalf()
{
    int memSize = 1024;
    int devId = 0;
    int dim1 = 1024;
    int dim2 = 32;

    /* the pool is an automatic object so that it is released when the
       function returns (it used to be created with new and never deleted) */
    XMem mem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
    mem.SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);

    XTensor a;
    XTensor b;
    XTensor c;

    /* the tensors are initialized without passing the pool */
    InitTensor2D(&a, dim1, dim1, X_FLOAT, devId);
    InitTensor2D(&b, dim2, dim2, X_FLOAT, devId);
    InitTensor2D(&c, dim1, dim1, X_FLOAT, devId);
}
/*
Time _MatrixMul on float32 operands against _MatrixMul on operands
obtained by converting a float32 tensor to X_INT8.
Each variant is run "repeat" times and the elapsed seconds are printed.
*/
void TimeInt8AndFloat32()
{
    int memSize = 1024;
    int devId = 2;
    int dim = 512;

    /* the pool is declared before every tensor that uses it, so it is
       destroyed after them and released when the function returns
       (it used to be created with new and never deleted) */
    XMem mem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
    mem.SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);

    /* float32 operands and result */
    XTensor a;
    XTensor b;
    XTensor c;
    InitTensor2D(&a, dim, dim, X_FLOAT, devId, &mem);
    InitTensor2D(&b, dim, dim, X_FLOAT, devId, &mem);
    InitTensor2D(&c, dim, dim, X_FLOAT, devId, &mem);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);

    /* integer operands; the result tensor is float32 */
    XTensor inta;
    XTensor intb;
    XTensor intc;
    InitTensor2D(&inta, dim, dim, X_INT, devId, &mem);
    InitTensor2D(&intb, dim, dim, X_INT, devId, &mem);
    InitTensor2D(&intc, dim, dim, X_FLOAT, devId, &mem);

    /* both integer operands are converted from the same random tensor */
    XTensor tmp;
    InitTensor2D(&tmp, dim, dim, X_FLOAT, devId, &mem);
    tmp.SetDataRand(-100000.0F, 100000.0F);
    inta = ConvertDataType(tmp, X_INT8);
    intb = ConvertDataType(tmp, X_INT8);

    int repeat = 10000;

    printf("test on matrixmul\n");

    double start_matrixmul32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        _MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);
    }
    double elapsed_matrixmul32 = GetClockSec() - start_matrixmul32;
    printf("elapsed_matrixmul32=%.2fs\n", elapsed_matrixmul32);

    double start_int8 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        _MatrixMul(&inta, X_NOTRANS, &intb, X_NOTRANS, &intc);
    }
    double elapsed_int8 = GetClockSec() - start_int8;
    printf("elapsed_int8=%.2fs\n", elapsed_int8);
}
void
TimeTest
()
{
XMem
*
mem
;
int
memSize
=
1024
;
int
devId
=
0
;
int
dim
=
512
;
mem
=
new
XMem
(
devId
,
FREE_ON_THE_FLY
,
(
MTYPE
)
MILLION
*
256
,
1024
,
MILLION
*
128
);
mem
->
SetDesiredSize
(
devId
,
0
,
(
MTYPE
)
memSize
*
MILLION
);
XTensor
a
;
XTensor
b
;
XTensor
c
;
XTensor
halfa
;
XTensor
halfb
;
XTensor
halfc
;
InitTensor2D
(
&
a
,
dim
,
dim
,
X_FLOAT
,
devId
,
mem
);
InitTensor2D
(
&
b
,
dim
,
dim
,
X_FLOAT
,
devId
,
mem
);
InitTensor2D
(
&
c
,
dim
,
dim
,
X_FLOAT
,
devId
,
mem
);
InitTensor2D
(
&
halfc
,
dim
,
dim
,
X_FLOAT16
,
devId
,
mem
);
a
.
SetDataRand
(
-
1.0
F
,
1.0
F
);
b
.
SetDataRand
(
-
1.0
F
,
1.0
F
);
halfa
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfb
=
ConvertDataType
(
b
,
X_FLOAT16
);
int
repeat
=
100000
;
printf
(
"=========================================
\n
"
);
printf
(
"test on sum
\n
"
);
double
start_sum32
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
c
=
Sum
(
&
a
,
&
b
);
}
double
elapsed_sum32
=
GetClockSec
()
-
start_sum32
;
printf
(
"elapsed_sum32=%.2fs
\n
"
,
elapsed_sum32
);
double
start_sum16
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
halfc
=
Sum
(
&
halfa
,
&
halfb
);
}
double
elapsed_sum16
=
GetClockSec
()
-
start_sum16
;
printf
(
"elapsed_sum16=%.2fs
\n
"
,
elapsed_sum16
);
printf
(
"=========================================
\n
"
);
/*printf("test on sub\n");
double start_sub32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = Sub(&a, &b);
}
double elapsed_sub32 = GetClockSec() - start_sub32;
printf("elapsed_sub32=%.2fs\n", elapsed_sub32);
double start_sub16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = Sub(&halfa, &halfb);
}
double elapsed_sub16 = GetClockSec() - start_sub16;
printf("elapsed_sub16=%.2fs\n", elapsed_sub16);
printf("=========================================\n");*/
/*printf("test on div\n");
double start_div32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = Div(&a, &b);
}
double elapsed_div32 = GetClockSec() - start_div32;
printf("elapsed_div32=%.2fs\n", elapsed_div32);
double start_div16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = Div(&halfa, &halfb);
}
double elapsed_div16 = GetClockSec() - start_div16;
printf("elapsed_div16=%.2fs\n", elapsed_div16);
printf("=========================================\n");*/
/*printf("test on multiply\n");
double start_multiply32 = GetClockSec();
for (int i = 0; i < repeat; i++) {
c = Multiply(&a, &b);
}
double elapsed_multiply32 = GetClockSec() - start_multiply32;
printf("elapsed_multiply32=%.2fs\n", elapsed_multiply32);
double start_multiply16 = GetClockSec();
for (int i = 0; i < repeat; i++) {
halfc = Multiply(&halfa, &halfb);
}
double elapsed_multiply16 = GetClockSec() - start_multiply16;
printf("elapsed_multiply16=%.2fs\n", elapsed_multiply16);
printf("=========================================\n");*/
printf
(
"test on scaleandshift
\n
"
);
double
start_scaleandshift32
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
c
=
ScaleAndShift
(
&
a
,
1
,
0
);
}
double
elapsed_scaleandshift32
=
GetClockSec
()
-
start_scaleandshift32
;
printf
(
"elapsed_scaleandshift32=%.2fs
\n
"
,
elapsed_scaleandshift32
);
double
start_scaleandshift16
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
halfc
=
ScaleAndShift
(
&
halfa
,
1
,
0
);
}
double
elapsed_scaleandshift16
=
GetClockSec
()
-
start_scaleandshift16
;
printf
(
"elapsed_scaleandshift16=%.2fs
\n
"
,
elapsed_scaleandshift16
);
printf
(
"=========================================
\n
"
);
printf
(
"test on reducesum
\n
"
);
double
start_reducesum32
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
c
=
ReduceSum
(
&
a
,
1
);
}
double
elapsed_reducesum32
=
GetClockSec
()
-
start_reducesum32
;
printf
(
"elapsed_reducesum32=%.2fs
\n
"
,
elapsed_reducesum32
);
double
start_reducesum16
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
halfc
=
ReduceSum
(
&
halfa
,
1
);
}
double
elapsed_reducesum16
=
GetClockSec
()
-
start_reducesum16
;
printf
(
"elapsed_reducesum16=%.2fs
\n
"
,
elapsed_reducesum16
);
printf
(
"=========================================
\n
"
);
printf
(
"test on reducemax
\n
"
);
double
start_reducemax32
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
c
=
ReduceMax
(
&
a
,
1
);
}
double
elapsed_reducemax32
=
GetClockSec
()
-
start_reducemax32
;
printf
(
"elapsed_reducemax32=%.2fs
\n
"
,
elapsed_reducemax32
);
double
start_reducemax16
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
halfc
=
ReduceMax
(
&
halfa
,
1
);
}
double
elapsed_reducemax16
=
GetClockSec
()
-
start_reducemax16
;
printf
(
"elapsed_reducemax16=%.2fs
\n
"
,
elapsed_reducemax16
);
printf
(
"=========================================
\n
"
);
printf
(
"test on logsoftmax
\n
"
);
double
start_logsoftmax32
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
c
=
LogSoftmax
(
&
a
,
1
);
}
double
elapsed_logsoftmax32
=
GetClockSec
()
-
start_logsoftmax32
;
printf
(
"elapsed_logsoftmax32=%.2fs
\n
"
,
elapsed_logsoftmax32
);
double
start_logsoftmax16
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
halfc
=
LogSoftmax
(
&
halfa
,
1
);
}
double
elapsed_logsoftmax16
=
GetClockSec
()
-
start_logsoftmax16
;
printf
(
"elapsed_logsoftmax16=%.2fs
\n
"
,
elapsed_logsoftmax16
);
printf
(
"=========================================
\n
"
);
printf
(
"test on matrixmul
\n
"
);
double
start_matrixmul32
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
c
=
MatrixMul
(
&
a
,
&
b
);
}
double
elapsed_matrixmul32
=
GetClockSec
()
-
start_matrixmul32
;
printf
(
"elapsed_matrixmul32=%.2fs
\n
"
,
elapsed_matrixmul32
);
double
start_matrixmul16
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
halfc
=
MatrixMul
(
&
halfa
,
&
halfb
);
}
double
elapsed_matrixmul16
=
GetClockSec
()
-
start_matrixmul16
;
printf
(
"elapsed_matrixmul16=%.2fs
\n
"
,
elapsed_matrixmul16
);
printf
(
"=========================================
\n
"
);
printf
(
"test on convert
\n
"
);
double
start_convert32to16
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
halfa
=
ConvertDataType
(
a
,
X_FLOAT16
);
}
double
elapsed_convert32to16
=
GetClockSec
()
-
start_convert32to16
;
printf
(
"elapsed_convert32to16=%.2fs
\n
"
,
elapsed_convert32to16
);
double
start_convert16to32
=
GetClockSec
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
a
=
ConvertDataType
(
halfa
,
X_FLOAT
);
}
double
elapsed_convert16to32
=
GetClockSec
()
-
start_convert16to32
;
printf
(
"elapsed_convert16to32=%.2fs
\n
"
,
elapsed_convert16to32
);
printf
(
"=========================================
\n
"
);
delete
mem
;
}
/*
Time MultiplyDim on float32 tensors against MultiplyDim on their
float16 conversions. Both variants are run "repeat" times and the
elapsed seconds are printed to stdout.
*/
void MultiplyDimTime()
{
    int memSize = 1024;
    int devId = 0;
    int dim1 = 1024;
    int dim2 = 32;

    /* the pool is declared before the tensor that uses it (halfc), so it is
       destroyed after it and released when the function returns
       (it used to be created with new and never deleted) */
    XMem mem(devId, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
    mem.SetDesiredSize(devId, 0, (MTYPE)memSize * MILLION);

    XTensor a;
    XTensor b;
    XTensor c;

    /* the float32 tensors are initialized without the pool */
    InitTensor2D(&a, dim1, dim1, X_FLOAT, devId);
    InitTensor2D(&b, dim2, dim2, X_FLOAT, devId);
    InitTensor2D(&c, dim1, dim1, X_FLOAT, devId);

    a.SetDataRand(-1.0F, 1.0F);
    /* NOTE(review): b is filled with SetDataRandn while a uses SetDataRand
       -- confirm that this is intended */
    b.SetDataRandn(-1.0F, 1.0F);

    int repeat = 2000;

    printf("test on MultiplyDim\n");

    /* the bound used to be "j <= repeat", i.e. one iteration more than the
       float16 loop below; both loops now run exactly "repeat" times */
    double start = GetClockSec();
    for (int j = 0; j < repeat; j++) {
        c = MultiplyDim(&a, &b, 0);
    }
    double elapsed = GetClockSec() - start;
    printf("elapsed_MultiplyDim32=%.4fs\n", elapsed);

    XTensor halfa;
    XTensor halfb;
    XTensor halfc;
    InitTensor2D(&halfc, dim1, dim1, X_FLOAT16, devId, &mem);
    halfa = ConvertDataType(a, X_FLOAT16);
    halfb = ConvertDataType(b, X_FLOAT16);

    double starthalf = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = MultiplyDim(&halfa, &halfb, 0);
    }
    double elapsedhalf = GetClockSec() - starthalf;
    printf("elapsed_MultiplyDim16=%.4fs\n", elapsedhalf);
}
/*
Time BMMul(x, X_NOTRANS, y, X_TRANS) on float16 operands against the
same call on float32 operands. Each variant is run "repeat" times, the
elapsed seconds are printed to stdout and the first entries of both
results are dumped to stderr.
*/
void TimeTestGemm()
{
    int memSize = 1024;

    /* the pool is an automatic object declared before the tensors that use
       it. The former code called "delete" on an uninitialized pointer
       before allocating the pool and never released the pool itself. */
    XMem mem(0, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
    mem.SetDesiredSize(0, 0, (MTYPE)memSize * MILLION);

    XTensor a;
    XTensor b;
    XTensor c;
    XTensor halfa;
    XTensor halfb;
    XTensor halfc;

    int dim1 = 512;
    int dim2 = 1024;

    InitTensor2D(&a, dim1, dim2, X_FLOAT, 0, &mem);
    InitTensor2D(&b, dim1, dim2, X_FLOAT, 0, &mem);

    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);

    halfa = ConvertDataType(a, X_FLOAT16);
    halfb = ConvertDataType(b, X_FLOAT16);

    int repeat = 10000;

    printf("=========================================\n");

    /* float16 */
    double start_matrixmul16 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        halfc = BMMul(halfa, X_NOTRANS, halfb, X_TRANS);
    }
    double elapsed_matrixmul16 = GetClockSec() - start_matrixmul16;
    printf("elapsed_matrixmul16=%.4fs\n", elapsed_matrixmul16);

    printf("------------------------------------------\n");

    /* float32 */
    double start_matrixmul32 = GetClockSec();
    for (int i = 0; i < repeat; i++) {
        c = BMMul(a, X_NOTRANS, b, X_TRANS);
    }
    double elapsed_matrixmul32 = GetClockSec() - start_matrixmul32;
    printf("elapsed_matrixmul32=%.4fs\n", elapsed_matrixmul32);

    printf("=========================================\n");

    c.Dump(&c, stderr, "c:", 10);
    halfc.Dump(&halfc, stderr, "halfc:", 10);
}
void
InitTensorFP16Test
()
{
XTensor
a
;
InitTensor2D
(
&
a
,
1
,
10
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
10.0
F
,
10.0
F
);
XTensor
halfA
;
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfA
.
Dump
(
&
halfA
,
stderr
,
"halfA:"
);
XTensor
b
;
InitTensor2D
(
&
b
,
1
,
10
,
X_FLOAT16
,
0
);
_SetDataRand
(
&
b
,
-
10.0
F
,
10.0
F
);
b
.
Dump
(
&
b
,
stderr
,
"b:"
);
}
/*
Run ScaleAndShift(x, 2, 0) on an integer tensor.
A random float32 tensor is converted to X_INT, scaled, converted back to
X_FLOAT, and both the input and the result are dumped to stderr.
*/
void ScaleAndShiftFP16Test()
{
    XTensor a;
    XTensor intA;
    XTensor b;
    XTensor intB;

    InitTensor2D(&a, 1, 10, X_FLOAT, 0);
    a.SetDataRand(-10.0F, 10.0F);

    /* a stray "int main(int argc, const char ** argv)" line that sat here
       (left over from the diff) has been removed */
    a.Dump(stderr, "a:");

    intA = ConvertDataType(a, X_INT);
    intB = ScaleAndShift(intA, 2, 0);
    b = ConvertDataType(intB, X_FLOAT);

    b.Dump(stderr, "b:");
}
void
ClipFP16Test
()
{
XTensor
a
;
XTensor
intA
;
XTensor
b
;
XTensor
intB
;
InitTensor2D
(
&
a
,
1
,
10
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
10.0
F
,
10.0
F
);
a
.
Dump
(
stderr
,
"a:"
);
intA
=
ConvertDataType
(
a
,
X_INT
);
intB
=
Clip
(
intA
,
-
1
,
1
);
b
=
ConvertDataType
(
intB
,
X_FLOAT
);
b
.
Dump
(
stderr
,
"b:"
);
}
void
LogSoftmaxFP16Test
()
{
XTensor
a
;
XTensor
halfA
;
XTensor
b
;
XTensor
halfB
;
InitTensor3D
(
&
a
,
2
,
2
,
2
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
1.0
F
,
1.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
b
=
LogSoftmax
(
a
,
1
);
halfB
=
LogSoftmax
(
halfA
,
1
);
b
.
Dump
(
stderr
,
"sum:"
);
halfB
.
Dump
(
&
halfB
,
stderr
,
"halfSum:"
);
}
/*
Apply ReduceSum(x, 1) to a random 10 x 10 float32 tensor and to its
float16 conversion, then dump both results to stderr.

The body used to start with a copy of the command-line dispatch of
main() (left over from the diff). It referred to argc/argv, which do not
exist here, and ended in "return 0;", so the test below was never
reached. That part has been removed.
*/
void ReduceSumFP16Test()
{
    XTensor a;
    XTensor sum;
    XTensor halfA;
    XTensor halfSum;

    InitTensor2D(&a, 10, 10, X_FLOAT, 0);
    a.SetDataRand(-5.0F, 5.0F);
    halfA = ConvertDataType(a, X_FLOAT16);

    /* same operation on both precisions */
    sum = ReduceSum(a, 1);
    halfSum = ReduceSum(halfA, 1);

    sum.Dump(stderr, "sum:");
    halfSum.Dump(&halfSum, stderr, "halfSum:");
}
void
ReduceMaxFP16Test
()
{
XTensor
a
;
XTensor
max
;
XTensor
halfA
;
XTensor
halfMax
;
InitTensor2D
(
&
a
,
10
,
10
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
5.0
F
,
5.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
max
=
ReduceMax
(
a
,
1
);
halfMax
=
ReduceMax
(
halfA
,
1
);
max
.
Dump
(
stderr
,
"max:"
);
halfMax
.
Dump
(
&
halfMax
,
stderr
,
"halfMax:"
);
}
void
HardTanHFP16Test
()
{
XTensor
a
;
XTensor
b
;
XTensor
halfA
;
XTensor
halfB
;
InitTensor2D
(
&
a
,
5
,
5
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
5
,
5
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
1.0
F
,
4.0
F
);
b
.
SetDataRand
(
-
1.0
F
,
4.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfB
=
ConvertDataType
(
b
,
X_FLOAT16
);
a
.
Dump
(
stderr
,
"a:"
);
b
.
Dump
(
stderr
,
"b:"
);
b
=
HardTanH
(
a
);
halfB
=
HardTanH
(
halfA
);
b
.
Dump
(
stderr
,
"b:"
);
halfB
.
Dump
(
&
halfB
,
stderr
,
"halfB:"
);
}
void
GatherFP16Test
()
{
XTensor
a
;
XTensor
b
;
XTensor
srcIndex
;
XTensor
halfA
;
XTensor
halfB
;
XTensor
c
;
InitTensor1D
(
&
srcIndex
,
2
,
X_INT
,
0
);
int
m
=
0
;
int
n
=
1
;
srcIndex
.
Set1DInt
(
m
,
0
);
srcIndex
.
Set1DInt
(
n
,
1
);
InitTensor2D
(
&
a
,
3
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
2
,
2
,
X_FLOAT
,
0
);
InitTensor2D
(
&
halfB
,
2
,
2
,
X_FLOAT16
,
0
);
a
.
SetDataRand
(
-
5.0
F
,
5.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
a
.
Dump
(
stderr
,
"a:"
);
_Gather
(
&
a
,
&
b
,
&
srcIndex
);
b
.
Dump
(
stderr
,
"b:"
);
_Gather
(
&
halfA
,
&
halfB
,
&
srcIndex
);
c
=
ConvertDataType
(
halfB
,
X_FLOAT
);
c
.
Dump
(
stderr
,
"c:"
);
}
void
SumFP16Test
()
{
XTensor
a
;
XTensor
b
;
XTensor
halfA
;
XTensor
halfB
;
InitTensor2D
(
&
a
,
5
,
5
,
X_FLOAT
,
0
);
InitTensor2D
(
&
b
,
5
,
5
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
1.0
F
,
4.0
F
);
b
.
SetDataRand
(
-
1.0
F
,
4.0
F
);
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
halfB
=
ConvertDataType
(
b
,
X_FLOAT16
);
a
.
Dump
(
stderr
,
"a:"
);
b
.
Dump
(
stderr
,
"b:"
);
b
=
Sum
(
a
,
b
,
-
0.4
F
);
halfB
=
Sum
(
halfA
,
halfB
,
-
0.4
F
);
b
.
Dump
(
stderr
,
"b:"
);
halfB
.
Dump
(
&
halfB
,
stderr
,
"halfB:"
);
}
void
ConvertDataTypeTest
()
{
int
rnum
=
0
;
for
(
int
i
=
0
;
i
<=
rnum
;
i
++
)
{
XTensor
a
;
InitTensor2D
(
&
a
,
2
,
2
,
X_FLOAT
,
0
);
XTensor
halfa
;
InitTensor2D
(
&
halfa
,
2
,
2
,
X_FLOAT16
,
0
);
XTensor
a1
;
InitTensor2D
(
&
a1
,
2
,
2
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
-
10.0
F
,
10.0
F
);
a
.
Dump
(
stderr
,
"a:"
);
halfa
=
ConvertDataType
(
a
,
X_FLOAT16
);
a1
=
ConvertDataType
(
halfa
,
X_FLOAT
);
a1
.
Dump
(
stderr
,
"halfa:"
);
}
}
void
ConvertDataTypeBackwardTest
()
{
int
rnum
=
0
;
for
(
int
i
=
0
;
i
<=
rnum
;
i
++
)
{
XTensor
a
;
InitTensor2D
(
&
a
,
2
,
2
,
X_FLOAT
,
0
);
a
.
SetDataRand
(
2.0
F
,
2.0
F
);
a
.
Dump
(
stderr
,
"a:"
);
XTensor
halfA
;
XTensor
a1
;
halfA
=
ConvertDataType
(
a
,
X_FLOAT16
);
a1
=
ConvertDataType
(
halfA
,
X_FLOAT
);
a1
.
grad
=
NewTensor
(
&
a1
);
a1
.
grad
->
SetDataRand
(
3.0
F
,
3.0
F
);
a1
.
grad
->
Dump
(
stderr
,
"a1.grad:"
);
XNet
testBackward
;
printf
(
"1"
);
testBackward
.
Backward
(
a1
);
printf
(
"2"
);
halfA
.
grad
->
Dump
(
stderr
,
"halfA.grad:"
);
a
.
grad
->
Dump
(
stderr
,
"a.grad:"
);
}
}
//XTensor * stack(XList& list, int leadingDim)
//{
// size_t size = list.count;
// if (list.count == 0)
// return NULL;
// XTensor * sample = (XTensor*)list.Get(0);
//
// XTensor merge_tensor;
// int order = sample->order;
// int * dim = new int[order];
// for (int i = 0; i < order; i++)
// dim[i] = sample->GetDim(i);
// dim[leadingDim] *= size;
//
// InitTensor(&merge_tensor, order, dim, DEFAULT_DTYPE, sample->denseRatio, sample->devID, sample->mem);
//
// _Merge(&list, &merge_tensor, leadingDim);
// delete[] dim;
//
// order += 1;
// dim = new int[order];
// dim[0] = size;
// for (size_t i = 1; i < order; i++) {
// if (i != leadingDim)
// dim[i] = sample->GetDim(i - 1);
// else
// dim[i] = sample->GetDim(i - 1) / size;
// }
//
// XTensor * split_tensor = new XTensor(order, dim, DEFAULT_DTYPE, sample->denseRatio, sample->devID, sample->mem);
// _Split(&merge_tensor, split_tensor, leadingDim, size);
// delete[] dim;
//
// return split_tensor;
//}
//void xcTest()
//{
// int * dimSize = new int[2];
// dimSize[0] = 2;
// dimSize[1] = 4;
//
// XTensor t1;
// InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
// XTensor t2;
// InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
// XTensor tensor;
//
// _SetDataFixed(&t1, 1.0F);
// _SetDataFixed(&t2, 2.0F);
//
// tensor = t1 + t2;
//
// XList smalls;
//
// XTensor first;
// XTensor second;
// InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
// InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
// smalls.Add(&t1);
// smalls.Add(&t2);
//
// XTensor* result = stack(smalls, 0);
// result->Dump(stderr, "", 100);
//}
/*
Build c = DivDim(a, b, 0) for a 2 x 3 tensor a and a 2-entry tensor b,
dump c, back-propagate from c with XNet::Backward and dump the network.

The body used to contain both sides of the diff interleaved: every local
was declared twice and both variants of the computation were present.
Only the added side is kept here.
NOTE(review): the removed side set enableGrad on a/b/c, back-propagated
from CrossEntropy(c, a) and dumped a.grad -- confirm that the variant
below is the one intended for this revision.
*/
void BackwardTest()
{
    XNet net;

    XTensor a;
    XTensor b;
    XTensor c;
    XTensor mean;
    XTensor origin;
    InitTensor2D(&a, 2, 3);
    InitTensor1D(&b, 2);

    a.SetZeroAll();
    b.SetZeroAll();

    /* a = [[1, 2, 3], [4, 5, 6]] */
    a.Set2D(1.0F, 0, 0);
    a.Set2D(2.0F, 0, 1);
    a.Set2D(3.0F, 0, 2);
    a.Set2D(4.0F, 1, 0);
    a.Set2D(5.0F, 1, 1);
    a.Set2D(6.0F, 1, 2);

    /* b = [2, 1] */
    b.Set1D(2.0F, 0);
    b.Set1D(1.0F, 1);

    c = DivDim(a, b, 0);
    c.Dump(stderr, "c:");

    //XLink::ShowNetwork(stderr, &c);

    net.Backward(c);

    net.Dump(stderr);
}
/*
Time _Split, _Merge and _Sum, each repeated "loops" times.
Every operation is measured twice: with GetClock() on the host and with
a pair of CUDA events. Both sets of numbers are printed to stderr.
The function does nothing when USE_CUDA is not defined.

The body used to appear twice in a row (both sides of the diff), which
redeclared every local; it is kept once. The CUDA events are now
destroyed before returning.
*/
void TransposeTest()
{
#ifdef USE_CUDA
    XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
    //XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);

    XTensor x;
    XTensor y;
    XTensor z;

    int loops = 2000;

    int B = 3 * 2 * 4;
    int K = 8 * 1;
    int N = 50;
    int H = 512 * 4;

    int nnn = GDevs.nGPU;

    InitTensor3D(&x, B, N, H, X_FLOAT, 0);
    InitTensor4D(&y, K, B, N, H / K, X_FLOAT, 0);
    InitTensor3D(&z, B, N, H, X_FLOAT, 0);

    /* one start/stop event pair per measured operation */
    cudaEvent_t ctime0;
    cudaEvent_t ctime1;
    cudaEvent_t ctime2;
    cudaEvent_t ctime3;
    cudaEvent_t ctime4;
    cudaEvent_t ctime5;

    float elapsedSplit = 0.0;
    float elapsedMerge = 0.0;
    float elapsedSum = 0.0;

    cudaEventCreate(&ctime0);
    cudaEventCreate(&ctime1);
    cudaEventCreate(&ctime2);
    cudaEventCreate(&ctime3);
    cudaEventCreate(&ctime4);
    cudaEventCreate(&ctime5);

    /* split */
    cudaEventRecord(ctime0, 0);

    double time0 = GetClock();
    for (int i = 0; i < loops; i++)
        _Split(&x, &y, 2, K);
    double time1 = GetClock();

    cudaEventRecord(ctime1, 0);
    cudaEventSynchronize(ctime1);
    cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);

    /* merge */
    cudaEventRecord(ctime2, 0);

    double time2 = GetClock();
    for (int i = 0; i < loops; i++)
        _Merge(&y, &x, 3);
    double time3 = GetClock();

    cudaEventRecord(ctime3, 0);
    cudaEventSynchronize(ctime3);
    cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);

    /* sum */
    cudaEventRecord(ctime4, 0);

    double time4 = GetClock();
    for (int i = 0; i < loops; i++)
        _Sum(&x, &z, &x);
    double time5 = GetClock();

    cudaEventRecord(ctime5, 0);
    cudaEventSynchronize(ctime5);
    cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);

    /* host-side timings, then CUDA event timings */
    fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
    fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);

    /* release the events */
    cudaEventDestroy(ctime0);
    cudaEventDestroy(ctime1);
    cudaEventDestroy(ctime2);
    cudaEventDestroy(ctime3);
    cudaEventDestroy(ctime4);
    cudaEventDestroy(ctime5);
#endif
}
/*
Run _SumDim(&x, &y, &z, 2) with a 5 x 7 x 3 tensor x filled with
0, 1, 2, ... and a 3-entry tensor y filled with 0, -1, -2, then dump
the result z to stderr.

Every statement of the body used to appear twice (both sides of the
diff), which redeclared all locals and released the buffer twice.
Each statement is kept once.
*/
void SumDimTest()
{
    XTensor x;
    XTensor y;
    XTensor z;

    int a = 5;
    int b = 7;
    int c = 3;

    InitTensor3D(&x, a, b, c, X_FLOAT, -1);
    InitTensor1D(&y, c, X_FLOAT, -1);
    InitTensor3D(&z, a, b, c, X_FLOAT, -1);

    x.SetZeroAll();
    y.SetZeroAll();
    z.SetZeroAll();

    /* one scratch buffer, sized for x, is reused to fill y as well
       (y has c entries, x has a * b * c) */
    DTYPE * data = new DTYPE[x.unitNum];

    for (int i = 0; i < x.unitNum; i++)
        data[i] = (DTYPE)i;
    x.SetData(data, x.unitNum);

    for (int i = 0; i < y.unitNum; i++)
        data[i] = -(DTYPE)i;
    y.SetData(data, y.unitNum);

    _SumDim(&x, &y, &z, 2);

    z.Dump(stderr, "z:");

    delete[] data;
}
//void SplitBackwardTest()
//{
// int * dimSize = new int[2];
// dimSize[0] = 2;
// dimSize[1] = 4;
//
// XTensor t1;
// InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
// XTensor t2;
// InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
// XTensor tensor;
//
// //_SetDataFixedFloat(&t1, 1.0F);
// //_SetDataFixedFloat(&t2, 2.0F);
// t1.SetDataRand();
// t2.SetDataRand();
//
// tensor = t1 + t2;
//
// XList smalls;
//
// XTensor first;
// XTensor second;
// InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
// InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
// smalls.Add(&first);
// smalls.Add(&second);
//
// Split(tensor, smalls, 1, 2);
//
// XTensor mul;
// mul = Sum(first, second);
//
// XNet net;
// net.Backward(mul);
// net.Dump(stderr);
//
// printf("Done!");
//}
void
MemTest
()
{
XMem
*
mem
;
mem
=
new
XMem
(
0
,
FREE_ON_THE_FLY
,
(
MTYPE
)
MILLION
,
1024
,
MILLION
);
XTensor
tensor
;
InitTensor2D
(
&
tensor
,
2
,
4
,
X_FLOAT
,
0
,
mem
);
tensor
.
SetZeroAll
();
tensor
.
Dump
(
stderr
);
delete
mem
;
if
(
tensor
.
mem
!=
NULL
)
{
printf
(
"It isn't null!
\n
"
);
printf
(
"%d
\n
"
,
(
int
)
tensor
.
mem
->
signature
);
}
else
{
printf
(
"It's null
\n
"
);
}
tensor
.
Dump
(
stderr
);
}
\ No newline at end of file
source/sample/fnnlm/FNNLM.cpp
查看文件 @
1da50ae2
...
...
@@ -415,7 +415,19 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
XNet
autoDiffer
;
double
startT
=
GetClockSec
();
double
mkinput
=
0.0
;
double
mkgold
=
0.0
;
double
train_time
=
0.0
;
double
clearModel
=
0.0
;
double
forward
=
0.0
;
double
backward
=
0.0
;
double
update
=
0.0
;
double
end
=
0.0
;
double
start
=
0.0
;
double
time
;
/* iterate for a number of epochs */
for
(
epoch
=
0
;
epoch
<
nEpoch
;
epoch
++
){
...
...
@@ -426,7 +438,6 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
wordCount
=
0
;
loss
=
0
;
ngramNum
=
1
;
while
(
ngramNum
>
0
){
/* load a minibatch of ngrams */
...
...
@@ -447,20 +458,25 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* the loss tensor */
XTensor
lossTensor
;
start
=
GetClockSec
();
/* make the input tensor for position i */
for
(
int
i
=
0
;
i
<
model
.
n
-
1
;
i
++
)
MakeWordBatch
(
inputs
[
i
],
ngrams
,
ngramNum
,
i
,
model
.
vSize
,
model
.
devID
);
mkinput
+=
GetClockSec
()
-
start
;
start
=
GetClockSec
();
/* make the gold tensor */
MakeWordBatch
(
gold
,
ngrams
,
ngramNum
,
model
.
n
-
1
,
model
.
vSize
,
model
.
devID
);
mkgold
+=
GetClockSec
()
-
start
;
time
=
GetClockSec
();
if
(
!
autoDiff
){
/* prepare an empty network for building the fnn */
FNNNet
net
;
/* gradident = 0 */
Clear
(
grad
,
false
);
/* forward computation */
Forward
(
inputs
,
output
,
model
,
net
);
...
...
@@ -475,40 +491,60 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
loss
-=
prob
;
}
else
{
start
=
GetClockSec
();
/* gradient = 0 */
Clear
(
model
,
true
);
clearModel
+=
GetClockSec
()
-
start
;
start
=
GetClockSec
();
/* forward + backward process */
/* this is implemented by gather function */
ForwardAutoDiff
(
ngrams
,
ngramNum
,
output
,
model
);
/* this is implemented by multiply function */
forward
+=
GetClockSec
()
-
start
;
start
=
GetClockSec
();
/* this is implemented by multiply function */
lossTensor
=
CrossEntropy
(
output
,
gold
);
/* automatic differentiation */
autoDiffer
.
Backward
(
lossTensor
);
backward
+=
GetClockSec
()
-
start
;
start
=
GetClockSec
();
/* update model parameters */
Update
(
model
,
grad
,
learningRate
,
true
);
update
+=
GetClockSec
()
-
start
;
start
=
GetClockSec
();
/* get probabilities */
float
prob
=
ReduceSumAll
(
lossTensor
);
loss
+=
prob
;
end
+=
GetClockSec
()
-
start
;
}
train_time
+=
GetClockSec
()
-
time
;
wordCount
+=
ngramNum
;
wordCountTotal
+=
ngramNum
;
if
(
++
step
>=
nStep
){
isEnd
=
true
;
break
;
}
if
(
step
%
100
==
0
)
{
if
(
step
%
100
==
0
)
{
double
elapsed
=
GetClockSec
()
-
startT
;
startT
=
GetClockSec
();
XPRINT8
(
0
,
stderr
,
"[Time] mkinput=%.5lfs,mkgold=%.5lfs,train_time=%.5lfs,clearModel=%.5lfs,forward=%.5lfs, backward=%.5lf, update=%.5lf, end=%.5lf
\n
"
,
mkinput
,
mkgold
,
train_time
,
clearModel
,
forward
,
backward
,
update
,
end
);
XPRINT5
(
0
,
stderr
,
"[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f
\n
"
,
elapsed
,
step
,
epoch
+
1
,
wordCountTotal
,
exp
(
loss
/
wordCount
));
mkinput
=
0.0
;
mkgold
=
0.0
;
train_time
=
0.0
;
clearModel
=
0.0
;
forward
=
0.0
;
backward
=
0.0
;
update
=
0.0
;
end
=
0.0
;
}
}
...
...
source/sample/transformer/T2TTrainer.cpp
查看文件 @
1da50ae2
...
...
@@ -148,6 +148,14 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
double
startT
=
GetClockSec
();
double
mkinput
=
0.0
;
double
train_time
=
0.0
;
double
forward
=
0.0
;
double
backward
=
0.0
;
double
update
=
0.0
;
double
start
=
0.0
;
double
time
=
0.0
;
for
(
epoch
=
1
;
epoch
<=
nepoch
;
epoch
++
){
#ifndef WIN32
if
(
isShuffled
)
...
...
@@ -176,18 +184,31 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* label smoothed gold standard (if needed) */
XTensor
goldSmoothed
;
while
(
batchLoader
.
LoadBatch
(
file
,
model
->
isLM
,
&
batchEnc
,
&
paddingEnc
,
&
batchDec
,
&
paddingDec
,
&
gold
,
&
label
,
NULL
,
vSize
,
vSizeTgt
,
sBatchSize
,
wBatchSize
,
isLenSorted
,
ws
,
wc
,
devID
,
true
))
{
//while (batchLoader.LoadBatch(file, model->isLM,
// &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
// NULL, vSize, vSizeTgt,
// sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
while
(
true
)
{
start
=
GetClockSec
();
int
batch
=
batchLoader
.
LoadBatch
(
file
,
model
->
isLM
,
&
batchEnc
,
&
paddingEnc
,
&
batchDec
,
&
paddingDec
,
&
gold
,
&
label
,
NULL
,
vSize
,
vSizeTgt
,
sBatchSize
,
wBatchSize
,
isLenSorted
,
ws
,
wc
,
devID
,
true
);
mkinput
+=
GetClockSec
()
-
start
;
if
(
!
batch
)
{
break
;
}
time
=
GetClockSec
();
CheckNTErrors
(
batchEnc
.
order
==
2
,
"wrong tensor order of the sequence batch"
);
/* output probabilities */
XTensor
output
;
start
=
GetClockSec
();
/* make the network */
if
(
model
->
isLM
)
model
->
MakeLM
(
batchEnc
,
output
,
paddingEnc
,
true
);
...
...
@@ -196,11 +217,12 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
else
{
ShowNTErrors
(
"Illegal model type!"
);
}
forward
+=
GetClockSec
()
-
start
;
/* back-propagation for obtaining gradients */
//if (labelSmoothingP > 0)
// LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
start
=
GetClockSec
();
XTensor
labelOnehot
;
labelOnehot
=
IndexToOnehot
(
label
,
vSizeTgt
,
labelSmoothingP
);
...
...
@@ -229,7 +251,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
net
.
Backward
(
lossTensor
);
//net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
//net.Backward(output, label, labelSmoothingP, CROSSENTROPY);
backward
+=
GetClockSec
()
-
start
;
start
=
GetClockSec
();
gradStep
+=
1
;
loss
+=
prob
;
wordCount
+=
wc
;
...
...
@@ -248,11 +272,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
gradStep
=
0
;
validStep
++
;
update
+=
GetClockSec
()
-
start
;
}
}
else
nSkipped
++
;
train_time
+=
GetClockSec
()
-
time
;
if
(
++
step
>=
nstep
){
isEnd
=
true
;
break
;
...
...
@@ -260,11 +286,19 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if
(
step
%
100
==
0
)
{
double
elapsed
=
GetClockSec
()
-
startT
;
startT
=
GetClockSec
();
XPRINT6
(
0
,
stderr
,
"[Time] elapsed=%.5lfs,mkinput=%.5lfs,train_time=%.5lfs,forward=%.5lfs, backward=%.5lf, update=%.5lf
\n
"
,
elapsed
,
mkinput
,
train_time
,
forward
,
backward
,
update
);
XPRINT8
(
0
,
stderr
,
"[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f"
,
elapsed
,
step
,
epoch
,
wordCountTotal
,
wordCountBatch
,
loss
/
wordCount
,
exp
(
loss
/
wordCount
),
exp
(
prob
/
wc
));
if
(
!
doUpdate
)
XPRINT
(
0
,
stderr
,
" (no update)"
);
XPRINT
(
0
,
stderr
,
"
\n
"
);
mkinput
=
0.0
;
train_time
=
0.0
;
forward
=
0.0
;
backward
=
0.0
;
update
=
0.0
;
}
if
(
nStepCheckpoint
>
0
&&
++
nStepCheck
>=
nStepCheckpoint
){
...
...
source/tensor/XTensor.cpp
查看文件 @
1da50ae2
...
...
@@ -25,6 +25,7 @@
* $Update by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2017-11-18 bug fixes
*
*/
#include "halfLib/half/half.hpp"
#include <stdio.h>
#include <stdlib.h>
...
...
@@ -50,6 +51,11 @@
#include "function/Identity.h"
#include "core/CHeader.h"
//#include "halfLib/HalfFloat/umHalf.h"
#ifdef USE_CUDA
// the CUDA stuff
...
...
@@ -376,6 +382,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
XMemCopy
(
data
,
devID
,
tensor
.
data
,
tensor
.
devID
,
size
);
if
(
dataHost
!=
NULL
&&
tensor
.
dataHost
!=
NULL
)
XMemCopy
(
dataHost
,
-
1
,
tensor
.
dataHost
,
tensor
.
devID
,
size
);
XMemCopy
(
dataHost
,
-
1
,
tensor
.
dataHost
,
tensor
.
devID
,
size
);
}
else
{
DestroyData
();
...
...
@@ -1854,6 +1861,16 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
}
else
if
(
dataType
==
X_FLOAT16
)
{
int
end
=
MIN
(
n
>
0
?
beg
+
n
:
beg
+
unitNum
,
unitNum
);
for
(
int
i
=
beg
;
i
<
end
;
i
++
)
{
halfCPU
f
=
((
halfCPU
*
)
d
)[
i
];
if
(
i
==
beg
)
fprintf
(
file
,
"%hx"
,
f
);
else
fprintf
(
file
,
" %hx"
,
f
);
}
}
else
if
(
dataType
==
X_INT
)
{
int
end
=
MIN
(
n
>
0
?
beg
+
n
:
beg
+
unitNum
,
unitNum
);
for
(
int
i
=
beg
;
i
<
end
;
i
++
){
...
...
@@ -1900,9 +1917,22 @@ dump data to a file
*/
void
XTensor
::
Dump
(
const
XTensor
*
tensor
,
FILE
*
file
,
const
char
*
label
,
const
int
n
,
const
int
beg
,
const
int
verbose
)
{
XTensor
a
(
tensor
->
order
,
tensor
->
dimSize
,
tensor
->
dataType
,
tensor
->
denseRatio
,
tensor
->
devID
,
tensor
->
mem
);
_CopyValues
(
tensor
,
&
a
);
a
.
Dump
(
file
,
label
,
n
,
beg
,
verbose
);
if
(
tensor
->
dataType
==
X_FLOAT
)
{
XTensor
a
(
tensor
->
order
,
tensor
->
dimSize
,
tensor
->
dataType
,
tensor
->
denseRatio
,
tensor
->
devID
,
tensor
->
mem
);
_CopyValues
(
tensor
,
&
a
);
a
.
Dump
(
file
,
label
,
n
,
beg
,
verbose
);
}
else
if
(
tensor
->
dataType
==
X_FLOAT16
)
{
XTensor
a
(
tensor
->
order
,
tensor
->
dimSize
,
X_FLOAT
,
tensor
->
denseRatio
,
tensor
->
devID
,
tensor
->
mem
);
_ConvertDataType
(
tensor
,
&
a
);
a
.
Dump
(
file
,
label
,
n
,
beg
,
verbose
);
}
else
{
ShowNTErrors
(
"TO DO!"
);
}
}
/*
...
...
@@ -1980,6 +2010,14 @@ void XTensor::Read(FILE * file, const char * label)
}
}
}
else
if
(
dataType
==
X_FLOAT16
){
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
{
halfCPU
*
f
=
((
halfCPU
*
)
data
)
+
i
;
if
(
fscanf
(
file
,
"%hx"
,
f
)
<
1
)
{
ShowNTErrors
(
"Incorrect tensor format!"
);
}
}
}
else
{
ShowNTErrors
(
"TODO!"
);
}
...
...
@@ -2006,15 +2044,13 @@ void XTensor::Read(FILE * file, const char * label)
}
}
do
{
c
=
fgetc
(
file
);
}
while
(
c
!=
'\n'
&&
c
!=
EOF
);
XMemCopy
(
dataBackup
,
devID
,
data
,
-
1
,
GetDataSizeInChar
());
data
=
dataBackup
;
delete
[](
char
*
)
dataBuf
;
delete
[](
char
*
)
dataBuf
;
}
/*
...
...
source/tensor/core/utilities/FlushToMem.cu
查看文件 @
1da50ae2
...
...
@@ -97,7 +97,7 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
/* copy the data from GPU memory to CPU memory */
void CudaGPUToCPUFlush(XTensor * tensor)
{
CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
//
CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
if (tensor->dataHost != NULL)
delete[](char*)tensor->dataHost;
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论