Tensor.LowPrecision - Commit 2c4061e9
authored Jul 30, 2019 by ltb
parent 3800528b

    fixed FNNLM of branch of xiao

Showing 4 changed files with 1412 additions and 1406 deletions:

    source/network/Main.cpp               +0     -1
    source/sample/fnnlm/FNNLM.cpp         +1182  -1179
    source/tensor/Main.cpp                +219   -215
    source/tensor/loss/CrossEntropy.cu    +11    -11
source/network/Main.cpp

@@ -24,7 +24,6 @@
...
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
...
source/sample/fnnlm/FNNLM.cpp

/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 ...
 */

@@ -15,15 +15,15 @@
 * limitations under the License.
 */

/*
 * This is a simple implementation of the feed-forward network-based language
 * model (FNNLM). See more details about FNNLM in
 * "A Neural Probabilistic Language Model" by Bengio et al.
 * Journal of Machine Learning Research 3 (2003) 1137-1155
 *
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
 */

#include <math.h>
#include "FNNLM.h"
...
@@ -32,6 +32,7 @@
#include "../../tensor/XDevice.h"
#include "../../tensor/function/FHeader.h"
#include "../../network/XNet.h"
#include "../../tensor/core/math/ScaleAndShift.h"

namespace fnnlm
{
@@ -39,1185 +40,1187 @@ namespace fnnlm

#define MAX_NAME_LENGTH 1024
#define MAX_LINE_LENGTH_HERE 1024 * 32
char trainFN[MAX_NAME_LENGTH] = "";    // file name of the training data
char modelFN[MAX_NAME_LENGTH] = "";    // file name of the FNN model
char testFN[MAX_NAME_LENGTH] = "";     // file name of the test data
char outputFN[MAX_NAME_LENGTH] = "";   // file name of the result data

float learningRate = 0.01F;    // learning rate
int nStep = 10000000;          // max learning steps (or model updates)
int nEpoch = 10;               // max training epochs
float minmax = 0.08F;          // range [-p,p] for parameter initialization
int sentBatch = 0;             // batch size at the sentence level
int wordBatch = 1;             // batch size at the word level
bool shuffled = false;         // shuffled the training data file or not
bool autoDiff = false;         // indicator of automatic differentiation

void LoadArgs(int argc, const char ** argv, FNNModel &model);
void Init(FNNModel &model);
void Check(FNNModel &model);
void Copy(FNNModel &tgt, FNNModel &src);
void Clear(FNNModel &model, bool isNodeGrad);
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model);
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model);
void Train(const char * train, bool isShuffled, FNNModel &model);
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad);
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs = NULL);
void Dump(const char * fn, FNNModel &model);
void Read(const char * fn, FNNModel &model);
void Test(const char * test, const char * result, FNNModel &model);
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum);
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum,
                         int * rows, int * cols, int itemNum, int devID, XMem * mem);
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize,
                   int devID, XMem * mem);
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
              FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/*
entry of the program
>> argc - number of the arguments
>> argv - pointers to the arguments
<< return - error code

arguments:
 -train S: specify training data file name
 -model S: specify model file name
 -test S: specify test data file name
 -output S: specify result data file name
 -n D: order of the language model
 -eSize D: embedding size
 -vSize D: vocabulary size
 -hdepth D: number of stacked hidden layers
 -hsize D: size of each hidden layer
 -lrate F: learning rate
 -nstep D: maximum number of model updates
 -nepoch D: maximum number of training epochs
 -batch D: batch size (how many sentences)
 -wbatch D: batch size at the word level (how many words)
 -shuffle: shuffle the training data
 -devid D: the id of the device used (-1: CPU, >=0: GPUs)
 -mempool: use memory pools for memory management
 -autodiff: use automatic differentiation for training

where S=string, D=integer and F=float.
All words in the training and test data files are encoded as their indices
in the vocabulary. E.g.,
0 29 2 11 1
might be a line of the file.
*/
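As a quick orientation for readers of the sample, here is a minimal sketch of driving this entry point programmatically with an argument vector equivalent to the command line described above. The wrapper name, file names and sizes are illustrative assumptions, not part of this commit; the flags are the ones LoadArgs parses below.

/* hypothetical helper, for illustration only */
void RunFNNLMSampleSketch()
{
    const char * argv[] = { "fnnlm",
                            "-train",  "train.id",
                            "-model",  "fnn.model",
                            "-test",   "test.id",
                            "-output", "result.txt",
                            "-n",      "3",
                            "-esize",  "100",
                            "-vsize",  "10000",
                            "-hdepth", "1",
                            "-hsize",  "256",
                            "-autodiff" };

    /* argc is the element count of argv */
    fnnlm::FNNLMMain((int)(sizeof(argv) / sizeof(argv[0])), argv);
}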
int FNNLMMain(int argc, const char ** argv)
{
    if(argc == 0)
        return 1;

    FNNModel model;

    /* load arguments */
    LoadArgs(argc, argv, model);

    /* check the setting */
    Check(model);

    /* initialize model parameters */
    Init(model);

    /* learn model parameters */
    if(strcmp(trainFN, ""))
        Train(trainFN, shuffled, model);

    /* save the final model */
    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
        Dump(modelFN, model);

    /* load the model if neccessary */
    if(strcmp(modelFN, ""))
        Read(modelFN, model);

    /* test the model on the new data */
    if(strcmp(testFN, "") && strcmp(outputFN, ""))
        Test(testFN, outputFN, model);

    return 0;
}
/*
load arguments
>> argc - number of the arguments
>> argv - pointers to the arguments
>> model - the fnn model
*/
void LoadArgs(int argc, const char ** argv, FNNModel &model)
{
    fprintf(stderr, "args: \n");
    for(int i = 0; i < argc; i++)
    {
        if(!strcmp(argv[i], "-train") && i + 1 < argc)
        {
            strcpy(trainFN, argv[i + 1]);
            fprintf(stderr, " -train=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-model") && i + 1 < argc)
        {
            strcpy(modelFN, argv[i + 1]);
            fprintf(stderr, " -model=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-test") && i + 1 < argc)
        {
            strcpy(testFN, argv[i + 1]);
            fprintf(stderr, " -test=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-output") && i + 1 < argc)
        {
            strcpy(outputFN, argv[i + 1]);
            fprintf(stderr, " -output=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-n") && i + 1 < argc)
        {
            model.n = atoi(argv[i + 1]);
            fprintf(stderr, " -n=%d\n", model.n);
        }
        if(!strcmp(argv[i], "-esize") && i + 1 < argc)
        {
            model.eSize = atoi(argv[i + 1]);
            fprintf(stderr, " -esize=%d\n", model.eSize);
        }
        if(!strcmp(argv[i], "-vsize") && i + 1 < argc)
        {
            model.vSize = atoi(argv[i + 1]);
            fprintf(stderr, " -vsize=%d\n", model.vSize);
        }
        if(!strcmp(argv[i], "-hdepth") && i + 1 < argc)
        {
            model.hDepth = atoi(argv[i + 1]);
            fprintf(stderr, " -hdepth=%d\n", model.hDepth);
        }
        if(!strcmp(argv[i], "-hsize") && i + 1 < argc)
        {
            model.hSize = atoi(argv[i + 1]);
            fprintf(stderr, " -hsize=%d\n", model.hSize);
        }
        if(!strcmp(argv[i], "-lrate") && i + 1 < argc)
        {
            learningRate = (float)atof(argv[i + 1]);
            fprintf(stderr, " -lrate=%f\n", learningRate);
        }
        if(!strcmp(argv[i], "-nstep") && i + 1 < argc)
        {
            nStep = atoi(argv[i + 1]);
            fprintf(stderr, " -nstep=%d\n", nStep);
        }
        if(!strcmp(argv[i], "-nepoch") && i + 1 < argc)
        {
            nEpoch = atoi(argv[i + 1]);
            fprintf(stderr, " -nepoch=%d\n", nEpoch);
        }
        if(!strcmp(argv[i], "-minmax") && i + 1 < argc)
        {
            minmax = (float)fabs(atof(argv[i + 1]));
            fprintf(stderr, " -minmax=%f\n", minmax);
        }
        if(!strcmp(argv[i], "-batch") && i + 1 < argc)
        {
            sentBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -batch=%d\n", sentBatch);
        }
        if(!strcmp(argv[i], "-wbatch") && i + 1 < argc)
        {
            wordBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -wbatch=%d\n", wordBatch);
        }
        if(!strcmp(argv[i], "-shuffle"))
        {
            shuffled = true;
            fprintf(stderr, " -shuffle=true\n");
        }
        if(!strcmp(argv[i], "-autodiff"))
        {
            autoDiff = true;
            fprintf(stderr, " -autodiff=true\n");
        }
        if(!strcmp(argv[i], "-dev") && i + 1 < argc)
        {
            model.devID = atoi(argv[i + 1]);
            fprintf(stderr, " -dev=%d\n", model.devID);
        }
    }

    for(int i = 0; i < argc; i++)
    {
        if(!strcmp(argv[i], "-mempool"))
            model.mem = new XMem(model.devID);
    }
}
/* check model settings */
void Check(FNNModel &model)
{
    CheckErrors(model.n > 0 && model.n <= MAX_N_GRAM, "The LM order is out of range (use -n)!");
    CheckErrors(model.vSize > 0, "no vocabulary size found (use -vsize)!");
    CheckErrors(model.eSize > 0, "no embedding size found (use -esize)!");
}
/* make a hard copy of the fnn model */
void Copy(FNNModel &tgt, FNNModel &src)
{
    InitTensorV2(&tgt.embeddingW, &src.embeddingW);
    for(int i = 0; i < MAX_HIDDEN_NUM; i++)
    {
        InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
        InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
    }
    InitTensorV2(&tgt.outputW, &src.outputW);
    InitTensorV2(&tgt.outputB, &src.outputB);

    tgt.n = src.n;
    tgt.eSize = src.eSize;
    tgt.hDepth = src.hDepth;
    tgt.hSize = src.hSize;
    tgt.vSize = src.vSize;
    tgt.devID = src.devID;
    tgt.useMemPool = src.useMemPool;

    if(src.mem != NULL)
    {
        tgt.mem = new XMem(src.mem->devID, src.mem->mode,
                           src.mem->maxBlockSize, src.mem->blockNum,
                           src.mem->bufSize);
    }
}
/*
reset model parameters
>> model - the model whose parameter (gradient) is set to 0
>> isNodeGrad - indicates whether the tensor node keeps the gradient information
*/
void Clear(FNNModel &model, bool isNodeGrad)
{
    if(isNodeGrad)
    {
        if(model.embeddingW.grad != NULL)
            model.embeddingW.grad->SetZeroAll();
        for(int i = 0; i < MAX_HIDDEN_NUM; i++)
        {
            if(model.hiddenW[i].grad != NULL)
                model.hiddenW[i].grad->SetZeroAll();
            if(model.hiddenB[i].grad != NULL)
                model.hiddenB[i].grad->SetZeroAll();
        }
        if(model.outputW.grad != NULL)
            model.outputW.grad->SetZeroAll();
        if(model.outputB.grad != NULL)
            model.outputB.grad->SetZeroAll();
    }
    else
    {
        model.embeddingW.SetZeroAll();
        for(int i = 0; i < MAX_HIDDEN_NUM; i++)
        {
            model.hiddenW[i].SetZeroAll();
            model.hiddenB[i].SetZeroAll();
        }
        model.outputW.SetZeroAll();
        model.outputB.SetZeroAll();
    }
}
/*
initialize a 1d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> num - number of items
>> model - the fnn model
*/
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
{
    InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
}

/*
initialize a 2d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> model - the fnn model
*/
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{
    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
}
/* initialize the model */
void Init(FNNModel &model)
{
    /* create embedding parameter matrix: vSize * eSize */
    InitModelTensor2D(model.embeddingW, model.vSize, model.eSize, model);
    model.embeddingW.SetVarFlag();

    /* create hidden layer parameter matrics */
    for(int i = 0; i < model.hDepth; i++)
    {
        /* hidden layer parameter matrix: (n-1)eSize * hsize if it is the first layer
           hsize * hsize otherwise */
        if(i == 0)
            InitModelTensor2D(model.hiddenW[i], (model.n - 1) * model.eSize, model.hSize, model);
        else
            InitModelTensor2D(model.hiddenW[i], model.hSize, model.hSize, model);
        model.hiddenW[i].SetVarFlag();

        /* bias term: a row vector of hSize entries */
        InitModelTensor1D(model.hiddenB[i], model.hSize, model);
        model.hiddenB[i].SetVarFlag();
    }

    /* create the output layer parameter matrix and bias term */
    int iSize = model.hDepth == 0 ? (model.n - 1) * model.eSize : model.hSize;
    InitModelTensor2D(model.outputW, iSize, model.vSize, model);
    InitModelTensor1D(model.outputB, model.vSize, model);
    model.outputW.SetVarFlag();
    model.outputB.SetVarFlag();

    /* then, we initialize model parameters using a uniform distribution in range
       of [-minmax, minmax] */
    model.embeddingW.SetDataRand(-minmax, minmax);
    model.outputW.SetDataRand(-minmax, minmax);
    for(int i = 0; i < model.hDepth; i++)
        model.hiddenW[i].SetDataRand(-minmax, minmax);

    /* all bias terms are set to zero */
    model.outputB.SetZeroAll();
    for(int i = 0; i < model.hDepth; i++)
        model.hiddenB[i].SetZeroAll();
}
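Spelled out, the initialization above draws every weight entry independently from a uniform distribution over the [-minmax, minmax] range set on the command line, while every bias vector starts at zero:

    $W^{emb}_{ij},\; W^{hid}_{ij},\; W^{out}_{ij} \sim \mathcal{U}(-p,\, p), \qquad p = \text{minmax} \;(0.08\text{F by default}), \qquad b^{hid} = b^{out} = 0$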
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void Shuffle(const char * srcFile, const char * tgtFile)
{
    char * line = new char[MAX_LINE_LENGTH_HERE];
#ifndef WIN32
    sprintf(line, "shuf %s > %s", srcFile, tgtFile);
    system(line);
#else
    ShowErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
    delete[] line;
}
char lineBuf[MAX_LINE_LENGTH_HERE];
int wordBuf[MAX_LINE_LENGTH_HERE];

/*
train the model with the standard SGD method
>> train - training data file
>> isShuffled - shuffle the data file or not
>> model - the fnn model
*/
void Train(const char * train, bool isShuffled, FNNModel &model)
{
    char name[MAX_NAME_LENGTH];

    /* shuffle the data */
    if(isShuffled)
    {
        sprintf(name, "%s-tmp", train);
        Shuffle(train, name);
    }
    else
        strcpy(name, train);

    int epoch = 0;
    int step = 0;
    int wordCount = 0;
    int wordCountTotal = 0;
    int ngramNum = 1;
    float loss = 0;
    bool isEnd = false;

    NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];

    /* make a model to keep gradients */
    FNNModel grad;
    Copy(grad, model);

    /* XNet for automatic differentiation */
    XNet autoDiffer;

    double startT = GetClockSec();

    /* iterate for a number of epochs */
    for(epoch = 0; epoch < nEpoch; epoch++)
    {
        /* data file */
        FILE * file = fopen(name, "rb");
        CheckErrors(file, "Cannot open the training file");

        wordCount = 0;
        loss = 0;
        ngramNum = 1;

        while(ngramNum > 0)
        {
            /* load a minibatch of ngrams */
            ngramNum = LoadNGrams(file, model.n, ngrams, sentBatch, wordBatch);

            if(ngramNum <= 0)
                break;

            /* previous n - 1 words */
            XTensor inputs[MAX_N_GRAM];

            /* the predicted word */
            XTensor output;

            /* the gold standard */
            XTensor gold;

            /* the loss tensor */
            XTensor lossTensor;

            /* make the input tensor for position i */
            for(int i = 0; i < model.n - 1; i++)
                MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);

            /* make the gold tensor */
            MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);

            if(!autoDiff)
            {
                /* prepare an empty network for building the fnn */
                FNNNet net;

                /* gradident = 0 */
                Clear(grad, false);

                /* forward computation */
                Forward(inputs, output, model, net);

                /* backward computation to obtain gradients */
                Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);

                /* update model parameters */
                Update(model, grad, learningRate, false);
            }
            else
            {
                /* gradient = 0 */
                Clear(model, true);

                /* forward + backward process */
                /* this is implemented by gather function */
                ForwardAutoDiff(ngrams, ngramNum, output, model);

                /* this is implemented by multiply function */
                //ForwardAutoDiff(inputs, output, model);

                lossTensor = CrossEntropy(output, gold);

                /* automatic differentiation */
                autoDiffer.Backward(lossTensor);
                //autoDiffer.Backward(output, gold, CROSSENTROPY);

                /* update model parameters */
                Update(model, grad, learningRate, true);
            }

            /* get probabilities */
            float prob = GetProb(output, gold);
            prob = ReduceSumAll(lossTensor);

            loss += prob;
            wordCount += ngramNum;
            wordCountTotal += ngramNum;

            if(++step >= nStep)
            {
                isEnd = true;
                break;
            }

            if(step % 100 == 0)
            {
                double elapsed = GetClockSec() - startT;
                XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
                        elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
            }
        }

        fclose(file);

        if(isEnd)
            break;

        Test(testFN, outputFN, model);
    }

    double elapsed = GetClockSec() - startT;

    XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
            elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
    XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
            elapsed, step, epoch);

    delete[] ngrams;
}
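The progress lines printed every 100 steps report perplexity from the accumulated cross-entropy; written out, exp(loss / wordCount) is

    $\mathrm{PPL} = \exp\!\Big(\tfrac{1}{N}\sum_{k=1}^{N} \mathrm{CE}(y_k, \hat{y}_k)\Big)$

where N is wordCount (the number of n-grams seen so far in the epoch) and the sum is what loss accumulates batch by batch.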
/*
update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
*/
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
{
    TensorList paraList(10);
    TensorList gradList(10);

    paraList.Add(&model.outputW);
    paraList.Add(&model.outputB);
    for(int i = 0; i < model.hDepth; i++)
    {
        paraList.Add(&model.hiddenW[i]);
        paraList.Add(&model.hiddenB[i]);
    }
    paraList.Add(&model.embeddingW);

    if(!isNodeGrad)
    {
        gradList.Add(&grad.outputW);
        gradList.Add(&grad.outputB);
        for(int i = 0; i < model.hDepth; i++)
        {
            gradList.Add(&grad.hiddenW[i]);
            gradList.Add(&grad.hiddenB[i]);
        }
        gradList.Add(&grad.embeddingW);
    }
    else
    {
        gradList.Add(model.outputW.grad);
        gradList.Add(model.outputB.grad);
        for(int i = 0; i < model.hDepth; i++)
        {
            gradList.Add(model.hiddenW[i].grad);
            gradList.Add(model.hiddenB[i].grad);
        }
        gradList.Add(model.embeddingW.grad);
    }

    for(int i = 0; i < paraList.count; i++)
    {
        XTensor * para = (XTensor*)paraList.GetItem(i);
        XTensor * paraGrad = (XTensor*)gradList.GetItem(i);

        //fprintf(stderr, "%d\n", i);
        //paraGrad->Dump(stderr, "grad:", 10);

        /* the delta rule */
        _Sum(para, paraGrad, para, -epsilon);
    }
}
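The _Sum(para, paraGrad, para, -epsilon) call is the "delta rule" named in the comment: an in-place plain-SGD step on each parameter tensor,

    $\theta \leftarrow \theta - \epsilon \, \nabla_{\theta} E$

with epsilon being the learning rate passed in (learningRate at the call sites in Train).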
/*
get prediction probabilites of the gold words
>> output - output probabilities
>> gold - gold standard
>> wordPobs - probability of each word
<< return - probability of the batch
*/
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{
    XTensor probs;
    InitTensorV2(&probs, &output);

    /* probs[i,j] = output[i,j] * gold[i,j] */
    _Multiply(&output, &gold, &probs);

    /* probability of each word */
    XTensor wprobs;
    InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
    _ReduceSum(&probs, &wprobs, 1);
    if(wordProbs != NULL)
        _CopyValues(&wprobs, wordProbs);

    /* reshape the tensor to fit it into the reduce procedure
       TODO: XTensor supports scalars */
    int dims[2];
    dims[0] = 1;
    dims[1] = probs.unitNum;
    probs.Reshape(2, dims);

    /* probability for the batch */
    XTensor result;
    InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
    _ReduceSum(&probs, &result, 1);

    return result.Get1D(0);
}
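When output comes from the _LogSoftmax call in Forward it holds log-probabilities, and gold is the zero-one batch built by MakeWordBatch, so the value returned here amounts to the summed log-probability of the gold words in the batch:

    $\mathrm{GetProb}(y, g) = \sum_{i,j} g_{ij}\, y_{ij} = \sum_{i} \log P(w_i \mid \mathrm{context}_i)$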
int pin = 0;
int wordBufCount = 0;

/*
load a minibatch of ngrams
>> file - data file
>> n - order of the language model
>> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch
*/
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
{
    int num = 0;
    int lineNum = 0;
    while(pin > 0 || fgets(lineBuf, MAX_LINE_LENGTH_HERE - 1, file))
    {
        if(pin <= 0)
        {
            int len = (int)strlen(lineBuf);

            while(lineBuf[len - 1] == '\r' || lineBuf[len - 1] == '\n')
            {
                lineBuf[len - 1] = 0;
                len--;
            }

            len = (int)strlen(lineBuf);
            if(len == 0)
                continue;

            /* how many characters are in a word */
            int wSize = 0;

            /* how many words are in the sentence */
            int wNum = 0;
            int i = 0;

            for(i = pin; i < len; i++)
            {
                /* load word (id) seperated by space or tab */
                if((lineBuf[i] == ' ' || lineBuf[i] == '\t') && wSize > 0)
                {
                    lineBuf[i] = 0;
                    wordBuf[wNum++] = atoi(lineBuf + i - wSize);
                    wSize = 0;
                }
                else
                    wSize++;
            }

            if(wSize > 0)
                wordBuf[wNum++] = atoi(lineBuf + i - wSize);

            wordBufCount = wNum;
            lineNum++;
        }
        else
            lineNum = 1;

        int i = -MAX_INT;

        /* create ngrams */
        for(i = MAX(pin, n - 1); i < wordBufCount - 1; i++)
        {
            memcpy(ngrams[num++].words, wordBuf + i - n + 1, sizeof(int) * n);
            if(num >= wordNum)
                break;
        }

        /* set a finished flag if we reach the end of the sentence */
        if(i >= wordBufCount - 1)
        {
            pin = 0;
            wordBufCount = 0;
        }
        /* record where to start next time if we break in the middle */
        else
        {
            pin = i + 1;
        }

        if((sentNum > 0 && lineNum >= sentNum) || num >= wordNum)
            break;
    }

    return num;
}
/*
make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> rows - row index
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
*/
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum,
                         int * rows, int * cols, int itemNum, int devID, XMem * mem)
{
    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);

    tensor.SetZeroAll();

    /* set none-zero cells */
    for(int i = 0; i < itemNum; i++)
        tensor.Set2D(1.0F, rows[i], cols[i]);
}
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch
>> ngramNum - batch size
>> n - indicate which word is encode for each ngram
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
*/
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize,
                   int devID, XMem * mem)
{
    int * rows = new int[ngramNum];
    int * cols = new int[ngramNum];

    for(int i = 0; i < ngramNum; i++)
    {
        rows[i] = i;
        cols[i] = ngrams[i].words[n];
    }

    InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID, mem);

    delete[] rows;
    delete[] cols;
}
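A small worked example of the encoding (values chosen purely for illustration): with ngramNum = 3, vSize = 5 and word ids 2, 0, 4 at position n, the batch tensor is a 3 x 5 matrix with a single 1 per row,

    $\begin{pmatrix} 0 & 0 & 1 & 0 & 0 \\ 1 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 1 \end{pmatrix}$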
/*
forward procedure
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process
*/
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
{
    int batchSize = -1;
    int n = model.n;
    int depth = model.hDepth;
    TensorList eList(n - 1);

    /* previoius n - 1 words */
    for(int i = 0; i < n - 1; i++)
    {
        XTensor &input = inputs[i];
        XTensor &w = model.embeddingW;
        XTensor &embedding = net.embeddings[i];

        if(batchSize == -1)
            batchSize = input.dimSize[0];
        else
        {
            CheckErrors(batchSize == input.dimSize[0], "Wrong input word representations!");
        }

        /* embedding output tensor of position i */
        InitModelTensor2D(embedding, batchSize, model.eSize, model);

        /* generate word embedding of position i:
           embedding = input * w */
        _MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);

        eList.Add(&net.embeddings[i]);
    }

    /* concatenate word embeddings
       embeddingcat = cat(embedding_0...embedding_{n-1}) */
    InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
    _Concatenate(&eList, &net.embeddingCat, 1);

    /* go over each hidden layer */
    for(int i = 0; i < depth; i++)
    {
        XTensor &h_pre = i == 0 ? net.embeddingCat : net.hiddens[i - 1];
        XTensor &w = model.hiddenW[i];
        XTensor &b = model.hiddenB[i];
        XTensor &h = net.hiddens[i];
        XTensor &s = net.hiddenStates[i];

        InitModelTensor2D(h, batchSize, model.hSize, model);
        InitModelTensor2D(s, batchSize, model.hSize, model);

        /* generate hidden states of layer i:
           s = h_pre * w */
        _MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s);

        /* make a 2d tensor for the bias term */
        XTensor b2D;
        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);

        /* introduce bias term:
           s = s + b
           NOTE: the trick here is to extend b to a 2d tensor
           to fit into the 2d representation in tensor summation */
        _Sum(&s, &b2D, &s);

        /* pass the state through the hard tanh function:
           h = tanh(s) */
        _HardTanH(&s, &h);
    }

    /* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
       y = softmax(h_last * w)
       Note that this is the implementation as that in Bengio et al.' paper.
       TODO: we add bias term here */
    {
        XTensor &h_last = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
        XTensor &w = model.outputW;
        XTensor &b = model.outputB;
        XTensor &s = net.stateLast;
        XTensor &y = output;

        InitModelTensor2D(s, batchSize, model.vSize, model);
        InitModelTensor2D(y, batchSize, model.vSize, model);

        /* s = h_last * w */
        _MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);

        XTensor b2D;
        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);
        _Sum(&s, &b2D, &s);

        /* y = softmax(s) */
        _LogSoftmax(&s, &y, 1);
    }
}
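In the notation of the Bengio et al. (2003) model that this function follows, one forward pass over a batch row (with a single hidden layer, for brevity) is

    $e_i = x_i W^{emb}, \qquad h = \mathrm{hardtanh}\big([e_1; \dots; e_{n-1}]\, W^{hid} + b^{hid}\big), \qquad y = \log \mathrm{softmax}\big(h\, W^{out} + b^{out}\big)$

where x_i is the zero-one row for the i-th context word and [;] denotes concatenation along the feature dimension.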
/*
backward procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
>> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process
*/
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
              FNNModel &model, FNNModel &grad, FNNNet &net)
{
    int batchSize = output.GetDim(0);
    int n = model.n;
    int depth = model.hDepth;

    /* back-propagation for the output layer */
    XTensor &y = output;
    XTensor &s = net.stateLast;
    XTensor &x = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
    XTensor &w = model.outputW;
    XTensor &dedw = grad.outputW;
    XTensor &dedb = grad.outputB;
    XTensor deds(&y);
    XTensor dedx(&x);

    /* for y = softmax(s), we get dE/ds
       where E is the error function (define by loss) */
    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);

    /* for s = x * w, we get
       dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
                  = dE/ds_j * x_{i}
       (where i and j are the row and column indices, and
        x is the top most hidden layer)
       so we know
       dE/dw = x^T * dE/ds */
    _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);

    /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
       specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
    _ReduceSum(&deds, &dedb, 0);

    /* then, we compute
       dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
                 = \sum_j' (dE/ds_{j'} * w_{j, j'})
       i.e.,
       dE/dx = dE/ds * w^T */
    _MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);

    XTensor &gradPassed = dedx;
    XTensor dedsHidden;
    XTensor dedxBottom;
    if(depth > 0)
        InitTensorV2(&dedsHidden, &dedx);
    InitTensorV2(&dedxBottom, &net.embeddingCat);

    /* back-propagation from top to bottom in the stack of hidden layers
       for each layer, h = f(s)
                       s = x * w + b */
    for(int i = depth - 1; i >= 0; i--)
    {
        XTensor &h = net.hiddens[i];
        XTensor &s = net.hiddenStates[i];
        XTensor &x = i == 0 ? net.embeddingCat : net.hiddenStates[i - 1];
        XTensor &w = model.hiddenW[i];
        XTensor &dedh = gradPassed;    // gradient passed though the previous layer
        XTensor &dedx = i == 0 ? dedxBottom : dedh;
        XTensor &deds = dedsHidden;
        XTensor &dedw = grad.hiddenW[i];
        XTensor &dedb = grad.hiddenB[i];

        /* backpropagation through the activation fucntion:
           dE/ds = dE/dh * dh/ds */
        _HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);

        /* gradient of the weight: dE/dw = x^T * dE/ds */
        _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);

        /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
           specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
        _ReduceSum(&deds, &dedb, 0);

        /* gradient of the input: dE/dx = dE/ds * w^T */
        _MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);

        if(i > 0)
            _CopyValues(&dedx, &gradPassed);
    }

    TensorList eList(n - 1);

    /* back-propagation for the embedding layer */
    for(int i = 0; i < n - 1; i++)
    {
        XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
        eList.Add(dedy);
    }

    /* gradient of the concatenation of the embedding layers */
    XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;

    /* split the concatenation of gradients of the embeddings */
    _Split(&dedyCat, &eList, 1, n - 1);

    /* go over for each word */
    for(int i = 0; i < n - 1; i++)
    {
        XTensor * dedy = (XTensor*)eList.GetItem(i);
        XTensor &x = inputs[i];
        XTensor &dedw = grad.embeddingW;

        /* gradient of the embedding weight: dE/dw += x^T * dE/dy
           NOTE that we accumulate dE/dw here because the matrix w
           is shared by several layers (or words) */
        _MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F);

        delete dedy;
    }
}
/*
forward process (with tensor connections) (this is implemented by gather function)
>> ngrams - the loaded ngrams
>> batch - the tensor encoding a batch of words
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
    int n = model.n;
    int depth = model.hDepth;

    XTensor words;
    XTensor embeddingBig;
    XTensor hidden;
    XTensor b;

    int size = batch * (n - 1);
    int * index = new int[size];

    for(int i = 0; i < batch; i++)
    {
        for(int j = 0; j < n - 1; j++)
        {
            int a = i * (n - 1) + j;
            index[a] = ngrams[i].words[j];
        }
    }

    InitTensor1DV2(&words, size, X_INT, model.devID);
    words.SetData(index, size);

    embeddingBig = Gather(model.embeddingW, words);

    delete[] index;

    int dimSize[2];
    dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
    dimSize[1] = embeddingBig.GetDim(1) * (n - 1);

    hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);

    /* hidden layers */
    for(int i = 0; i < depth; i++)
        hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);

    /* output layer */
    //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}

/*
forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
{
    int n = model.n;
    int depth = model.hDepth;

    XTensor words;
    XTensor embeddingBig;
    XTensor hidden;
    XTensor b;

    TensorList inputList(n - 1);
    for(int i = 0; i < n - 1; i++)
        inputList.Add(inputs + i);

    /* represent n - 1 words in one tensor */
    words = Merge(inputList, 0);

    /* word embedding */
    embeddingBig = MMul(words, model.embeddingW);

    /* input of the first hidden layer */
    hidden = Split(embeddingBig, 0, n - 1);
    hidden = Merge(hidden, 2, 0);

    /* hidden layers */
    for(int i = 0; i < depth; i++)
        hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];

    /* output layer */
    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
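The two ForwardAutoDiff variants build the same embedding lookup in different ways: the gather version indexes rows of embeddingW directly by word id, while the multiply version left-multiplies the zero-one batch matrix by embeddingW. For a one-hot row x whose 1 sits at position w, the two coincide:

    $x\, W^{emb} = W^{emb}_{w,:} = \mathrm{Gather}(W^{emb}, w)$

Note that, as written here, the gather path ends in Softmax while the multiply path ends in LogSoftmax.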
/*
dump the model to the disk space
>> fn - where to keep the model
>> model - the fnn model
*/
void Dump(const char * fn, FNNModel &model)
{
    FILE * file = fopen(fn, "wb");
    CheckErrors(file, "Cannot open the model file");

    model.embeddingW.Dump(file, "embedding w:");
    for(int i = 0; i < model.hDepth; i++)
    {
        char name[MAX_NAME_LENGTH];
        sprintf(name, "hidden %d w:", i);
        model.hiddenW[i].Dump(file, name);
        sprintf(name, "hidden %d b:", i);
        model.hiddenB[i].Dump(file, name);
    }
    model.outputW.Dump(file, "output w:");
    model.outputB.Dump(file, "output b:");

    fclose(file);

    XPRINT(0, stderr, "[INFO] model saved\n");
}
/*
read the model from the disk space
>> fn - where to keep the model
>> model - the fnn model
*/
void Read(const char * fn, FNNModel &model)
{
    FILE * file = fopen(fn, "rb");
    CheckErrors(file, "Cannot open the model file");

    model.embeddingW.Read(file, "embedding w:");
    for(int i = 0; i < model.hDepth; i++)
    {
        char name[MAX_NAME_LENGTH];
        sprintf(name, "hidden %d w:", i);
        model.hiddenW[i].Read(file, name);
        sprintf(name, "hidden %d b:", i);
        model.hiddenB[i].Read(file, name);
    }
    model.outputW.Read(file, "output w:");
    model.outputB.Read(file, "output b:");

    fclose(file);

    XPRINT(0, stderr, "[INFO] model loaded\n");
}
/*
test the model
>> test - test data file
>> result - where to keep the result
>> model - the fnn model
*/
void Test(const char * test, const char * result, FNNModel &model)
{
    int wordCount = 0;
    int sentCount = 0;
    float loss = 0;

    NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];

    double startT = GetClockSec();

    /* data files */
    FILE * file = fopen(test, "rb");
    CheckErrors(file, "Cannot read the test file");
    FILE * ofile = fopen(result, "wb");
    CheckErrors(ofile, "Cannot open the output file");

    int ngramNum = 1;
    while(ngramNum > 0)
    {
        /* load a minibatch of ngrams */
        ngramNum = LoadNGrams(file, model.n, ngrams, 1, MAX_INT);

        if(ngramNum <= 0)
            break;

        /* previous n - 1 words */
        XTensor inputs[MAX_N_GRAM];

        /* the predicted word */
        XTensor output;

        /* the gold standard */
        XTensor gold;

        /* make the input tensor for position i */
        for(int i = 0; i < model.n - 1; i++)
            MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);

        /* make the gold tensor */
        MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);

        if(!autoDiff)
        {
            /* prepare an empty network for building the fnn */
            FNNNet net;

            /* forward computation */
            Forward(inputs, output, model, net);
        }
        else
        {
            /* this is implemented by gather function */
            ForwardAutoDiff(ngrams, ngramNum, output, model);

            /* this is implemented by multiply function */
            //ForwardAutoDiff(inputs, output, model);
        }

        /* prediction probabilities */
        XTensor probs;
        InitTensor1DV2(&probs, ngramNum);

        /* get probabilities */
        float prob = GetProb(output, gold, &probs);

        /* dump the test result */
        for(int i = 0; i < model.n - 1; i++)
            fprintf(ofile, "%d ", ngrams[0].words[i]);
        for(int i = 0; i < ngramNum; i++)
            fprintf(ofile, "%d ", ngrams[i].words[model.n - 1]);
        fprintf(ofile, "||| ");
        for(int i = 0; i < model.n - 1; i++)
            fprintf(ofile, "<s> ");
        for(int i = 0; i < ngramNum; i++)
            fprintf(ofile, "%f ", probs.Get1D(i));
        fprintf(ofile, "||| %f\n", prob);

        loss += -prob;
        wordCount += ngramNum;
        sentCount += 1;
    }

    fclose(file);

    double elapsed = GetClockSec() - startT;

    XPRINT1(0, stderr, "[INFO] ppl=%.2f\n", exp(loss / wordCount));
    XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)\n",
            elapsed, sentCount, wordCount);

    delete[] ngrams;
}
/*
update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
*/
void
Update
(
FNNModel
&
model
,
FNNModel
&
grad
,
float
epsilon
,
bool
isNodeGrad
)
{
TensorList
paraList
(
10
);
TensorList
gradList
(
10
);
paraList
.
Add
(
&
model
.
outputW
);
paraList
.
Add
(
&
model
.
outputB
);
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
{
paraList
.
Add
(
&
model
.
hiddenW
[
i
]);
paraList
.
Add
(
&
model
.
hiddenB
[
i
]);
}
paraList
.
Add
(
&
model
.
embeddingW
);
if
(
!
isNodeGrad
)
{
gradList
.
Add
(
&
grad
.
outputW
);
gradList
.
Add
(
&
grad
.
outputB
);
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
{
gradList
.
Add
(
&
grad
.
hiddenW
[
i
]);
gradList
.
Add
(
&
grad
.
hiddenB
[
i
]);
}
;
gradList
.
Add
(
&
grad
.
embeddingW
);
}
else
{
gradList
.
Add
(
model
.
outputW
.
grad
);
gradList
.
Add
(
model
.
outputB
.
grad
);
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
{
gradList
.
Add
(
model
.
hiddenW
[
i
].
grad
);
gradList
.
Add
(
model
.
hiddenB
[
i
].
grad
);
}
gradList
.
Add
(
model
.
embeddingW
.
grad
);
}
for
(
int
i
=
0
;
i
<
paraList
.
count
;
i
++
)
{
XTensor
*
para
=
(
XTensor
*
)
paraList
.
GetItem
(
i
);
XTensor
*
paraGrad
=
(
XTensor
*
)
gradList
.
GetItem
(
i
);
//fprintf(stderr, "%d\n", i);
//paraGrad->Dump(stderr, "grad:", 10);
/* the delta rule */
_Sum
(
para
,
paraGrad
,
para
,
-
epsilon
);
}
}
/*
get prediction probabilites of the gold words
>> output - output probabilities
>> gold - gold standard
>> wordPobs - probability of each word
<< return - probability of the batch
*/
float
GetProb
(
XTensor
&
output
,
XTensor
&
gold
,
XTensor
*
wordProbs
)
{
XTensor
probs
;
InitTensorV2
(
&
probs
,
&
output
);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply
(
&
output
,
&
gold
,
&
probs
);
/* probability of each word */
XTensor
wprobs
;
InitTensor1DV2
(
&
wprobs
,
output
.
GetDim
(
0
),
output
.
dataType
,
output
.
devID
);
_ReduceSum
(
&
probs
,
&
wprobs
,
1
);
if
(
wordProbs
!=
NULL
)
_CopyValues
(
&
wprobs
,
wordProbs
);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
int
dims
[
2
];
dims
[
0
]
=
1
;
dims
[
1
]
=
probs
.
unitNum
;
probs
.
Reshape
(
2
,
dims
);
/* probability for the batch */
XTensor
result
;
InitTensor1DV2
(
&
result
,
1
,
X_FLOAT
,
output
.
devID
);
_ReduceSum
(
&
probs
,
&
result
,
1
);
return
result
.
Get1D
(
0
);
}
int
pin
=
0
;
int
wordBufCount
=
0
;
/*
load a minibatch of ngrams
>> file - data file
>> n - order of the language model
>> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch
*/
int
LoadNGrams
(
FILE
*
file
,
int
n
,
NGram
*
ngrams
,
int
sentNum
,
int
wordNum
)
{
int
num
=
0
;
int
lineNum
=
0
;
while
(
pin
>
0
||
fgets
(
lineBuf
,
MAX_LINE_LENGTH_HERE
-
1
,
file
))
{
if
(
pin
<=
0
)
{
int
len
=
(
int
)
strlen
(
lineBuf
);
while
(
lineBuf
[
len
-
1
]
==
'\r'
||
lineBuf
[
len
-
1
]
==
'\n'
)
{
lineBuf
[
len
-
1
]
=
0
;
len
--
;
}
len
=
(
int
)
strlen
(
lineBuf
);
if
(
len
==
0
)
continue
;
/* how many characters are in a word */
int
wSize
=
0
;
/* how many words are in the sentence */
int
wNum
=
0
;
int
i
=
0
;
for
(
i
=
pin
;
i
<
len
;
i
++
)
{
/* load word (id) seperated by space or tab */
if
((
lineBuf
[
i
]
==
' '
||
lineBuf
[
i
]
==
'\t'
)
&&
wSize
>
0
)
{
lineBuf
[
i
]
=
0
;
wordBuf
[
wNum
++
]
=
atoi
(
lineBuf
+
i
-
wSize
);
wSize
=
0
;
}
else
wSize
++
;
}
if
(
wSize
>
0
)
wordBuf
[
wNum
++
]
=
atoi
(
lineBuf
+
i
-
wSize
);
wordBufCount
=
wNum
;
lineNum
++
;
}
else
lineNum
=
1
;
int
i
=
-
MAX_INT
;
/* create ngrams */
for
(
i
=
MAX
(
pin
,
n
-
1
);
i
<
wordBufCount
-
1
;
i
++
)
{
memcpy
(
ngrams
[
num
++
].
words
,
wordBuf
+
i
-
n
+
1
,
sizeof
(
int
)
*
n
);
if
(
num
>=
wordNum
)
break
;
}
/* set a finished flag if we reach the end of the sentence*/
if
(
i
>=
wordBufCount
-
1
)
{
pin
=
0
;
wordBufCount
=
0
;
}
/* record where to start next time if we break in the middle */
else
{
pin
=
i
+
1
;
}
if
((
sentNum
>
0
&&
lineNum
>=
sentNum
)
||
num
>=
wordNum
)
break
;
}
return
num
;
}
/*
make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> rows - row index
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
*/
void
InitZeroOneTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
int
*
rows
,
int
*
cols
,
int
itemNum
,
int
devID
,
XMem
*
mem
)
{
InitTensor2DV2
(
&
tensor
,
rowNum
,
colNum
,
X_FLOAT
,
devID
);
tensor
.
SetZeroAll
();
/* set none-zero cells */
for
(
int
i
=
0
;
i
<
itemNum
;
i
++
)
tensor
.
Set2D
(
1.0
F
,
rows
[
i
],
cols
[
i
]);
}
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch
>> ngramNum - batch size
>> n - indicates which word of each ngram is encoded
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
*/
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem)
{
    int * rows = new int[ngramNum];
    int * cols = new int[ngramNum];

    for(int i = 0; i < ngramNum; i++){
        rows[i] = i;
        cols[i] = ngrams[i].words[n];
    }

    InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID, mem);

    delete[] rows;
    delete[] cols;
}
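/* A small worked example (illustrative numbers): with ngrams = {(3,7,2), (5,1,9)},
   ngramNum = 2, n = 0 and vSize = 10, MakeWordBatch produces a 2 x 10 zero-one matrix
   with batch[0][3] = 1 and batch[1][5] = 1, i.e. one one-hot row per ngram for the
   word at position n. */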
/*
forward procedure
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process
*/
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
{
    int batchSize = -1;
    int n = model.n;
    int depth = model.hDepth;
    TensorList eList(n - 1);

    /* previous n - 1 words */
    for(int i = 0; i < n - 1; i++){
        XTensor &input = inputs[i];
        XTensor &w = model.embeddingW;
        XTensor &embedding = net.embeddings[i];

        if(batchSize == -1)
            batchSize = input.dimSize[0];
        else{
            CheckErrors(batchSize == input.dimSize[0], "Wrong input word representations!");
        }

        /* embedding output tensor of position i */
        InitModelTensor2D(embedding, batchSize, model.eSize, model);

        /* generate the word embedding of position i:
           embedding = input * w */
        _MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);

        eList.Add(&net.embeddings[i]);
    }

    /* concatenate the word embeddings
       embeddingcat = cat(embedding_0...embedding_{n-1}) */
    InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
    _Concatenate(&eList, &net.embeddingCat, 1);

    /* go over each hidden layer */
    for(int i = 0; i < depth; i++){
        XTensor &h_pre = i == 0 ? net.embeddingCat : net.hiddens[i - 1];
        XTensor &w = model.hiddenW[i];
        XTensor &b = model.hiddenB[i];
        XTensor &h = net.hiddens[i];
        XTensor &s = net.hiddenStates[i];

        InitModelTensor2D(h, batchSize, model.hSize, model);
        InitModelTensor2D(s, batchSize, model.hSize, model);

        /* generate the hidden states of layer i:
           s = h_pre * w */
        _MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s);

        /* make a 2d tensor for the bias term */
        XTensor b2D;
        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);

        /* introduce the bias term:
           s = s + b
           NOTE: the trick here is to extend b to a 2d tensor
           to fit into the 2d representation in tensor summation */
        _Sum(&s, &b2D, &s);

        /* pass the state through the hard tanh function:
           h = tanh(s) */
        _HardTanH(&s, &h);
    }

    /* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
       y = softmax(h_last * w)
       Note that this is the same implementation as in Bengio et al.'s paper.
       TODO: we add a bias term here */
    {
        XTensor &h_last = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
        XTensor &w = model.outputW;
        XTensor &b = model.outputB;
        XTensor &s = net.stateLast;
        XTensor &y = output;

        InitModelTensor2D(s, batchSize, model.vSize, model);
        InitModelTensor2D(y, batchSize, model.vSize, model);

        /* s = h_last * w */
        _MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);

        XTensor b2D;
        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);

        _Sum(&s, &b2D, &s);

        /* y = log softmax(s) */
        _LogSoftmax(&s, &y, 1);
    }
}
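/* A compact summary of the computation above (a reading aid; e(w) denotes the embedding
   row of word w):
       h_0 = [e(w_0); e(w_1); ...; e(w_{n-2})]              concatenated embeddings
       h_i = hardtanh(h_{i-1} * W_i + b_i), i = 1..depth    hidden layers
       y   = log softmax(h_last * W_out + b_out)            per-word log-probabilities */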
/*
backward procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
>> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process
*/
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
              FNNModel &model, FNNModel &grad, FNNNet &net)
{
    int batchSize = output.GetDim(0);
    int n = model.n;
    int depth = model.hDepth;

    /* back-propagation for the output layer */
    XTensor &y = output;
    XTensor &s = net.stateLast;
    XTensor &x = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
    XTensor &w = model.outputW;
    XTensor &dedw = grad.outputW;
    XTensor &dedb = grad.outputB;
    XTensor deds(&y);
    XTensor dedx(&x);

    /* for y = softmax(s), we get dE/ds
       where E is the error function (defined by loss) */
    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);

    /* for s = x * w, we get
       dE/dw_{i,j} = dE/ds_j * ds_j/dw_{i,j}
                   = dE/ds_j * x_{i}
       (where i and j are the row and column indices, and
        x is the top-most hidden layer)
       so we know
       dE/dw = x^T * dE/ds */
    _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);

    /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
       specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
    _ReduceSum(&deds, &dedb, 0);

    /* then, we compute
       dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
                 = \sum_j' (dE/ds_{j'} * w_{j, j'})
       i.e.,
       dE/dx = dE/ds * w^T */
    _MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);

    XTensor &gradPassed = dedx;
    XTensor dedsHidden;
    XTensor dedxBottom;
    if(depth > 0)
        InitTensorV2(&dedsHidden, &dedx);
    InitTensorV2(&dedxBottom, &net.embeddingCat);

    /* back-propagation from top to bottom in the stack of hidden layers
       for each layer, h = f(s)
                       s = x * w + b */
    for(int i = depth - 1; i >= 0; i--){
        XTensor &h = net.hiddens[i];
        XTensor &s = net.hiddenStates[i];
        XTensor &x = i == 0 ? net.embeddingCat : net.hiddenStates[i - 1];
        XTensor &w = model.hiddenW[i];
        XTensor &dedh = gradPassed;  // gradient passed through the previous layer
        XTensor &dedx = i == 0 ? dedxBottom : dedh;
        XTensor &deds = dedsHidden;
        XTensor &dedw = grad.hiddenW[i];
        XTensor &dedb = grad.hiddenB[i];

        /* backpropagation through the activation function:
           dE/ds = dE/dh * dh/ds */
        _HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);

        /* gradient of the weight: dE/dw = x^T * dE/ds */
        _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);

        /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
           specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
        _ReduceSum(&deds, &dedb, 0);

        /* gradient of the input: dE/dx = dE/ds * w^T */
        _MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);

        if(i > 0)
            _CopyValues(&dedx, &gradPassed);
    }

    TensorList eList(n - 1);

    /* back-propagation for the embedding layer */
    for(int i = 0; i < n - 1; i++){
        XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
        eList.Add(dedy);
    }

    /* gradient of the concatenation of the embedding layers */
    XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;

    /* split the concatenation of gradients of the embeddings */
    _Split(&dedyCat, &eList, 1, n - 1);

    /* go over each word */
    for(int i = 0; i < n - 1; i++){
        XTensor * dedy = (XTensor*)eList.GetItem(i);
        XTensor &x = inputs[i];
        XTensor &dedw = grad.embeddingW;

        /* gradient of the embedding weight: dE/dw += x^T * dE/dy
           NOTE that we accumulate dE/dw here because the matrix w
           is shared by several layers (or words) */
        _MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F);

        delete dedy;
    }
}
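/* A reading aid for the backward pass: for y = log softmax(s) with the cross-entropy
   loss E = -sum_j gold_j * y_j, the gradient returned by _LogSoftmaxBackward reduces to
   dE/ds = softmax(s) - gold, and the remaining steps are the plain chain rule,
       dE/dw = x^T * dE/ds,   dE/db = sum_i dE/ds_{i,:},   dE/dx = dE/ds * w^T,
   applied layer by layer from the output down to the shared embedding matrix. */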
/*
forward process (with tensor connections); this version is implemented with the gather function
>> ngrams - the loaded ngrams
>> batch - batch size (number of ngrams)
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
    int n = model.n;
    int depth = model.hDepth;

    XTensor words;
    XTensor embeddingBig;
    XTensor hidden;
    XTensor b;

    int size = batch * (n - 1);
    int * index = new int[size];

    for(int i = 0; i < batch; i++){
        for(int j = 0; j < n - 1; j++){
            int a = i * (n - 1) + j;
            index[a] = ngrams[i].words[j];
        }
    }

    InitTensor1DV2(&words, size, X_INT, model.devID);
    words.SetData(index, size);
    words.Dump(stderr, "word:", 10);

    embeddingBig = Gather(model.embeddingW, words);

    delete[] index;

    int dimSize[2];
    dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
    dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
    embeddingBig.Dump(stderr, "embeddingBig:", 10);

    hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
    hidden.Dump(stderr, "hidden-0:", 10);

    /* hidden layers */
    for(int i = 0; i < depth; i++)
        hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
    hidden.Dump(stderr, "hidden-1:", 10);

    /* output layer */
    //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
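/* A small worked example of the index layout used above (illustrative numbers): for
   batch = 2 and n = 3, with context words (4, 9) and (7, 2), index = {4, 9, 7, 2}.
   Gather(model.embeddingW, words) then stacks the four embedding rows into a
   (batch * (n - 1)) x eSize tensor, and Reshape folds it into the
   batch x ((n - 1) * eSize) input of the first hidden layer. */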
/*
forward process (with tensor connections); this version is implemented with the multiply function
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
{
    int n = model.n;
    int depth = model.hDepth;

    XTensor words;
    XTensor embeddingBig;
    XTensor hidden;
    XTensor b;

    TensorList inputList(n - 1);
    for(int i = 0; i < n - 1; i++)
        inputList.Add(inputs + i);

    /* represent the n - 1 words in one tensor */
    words = Merge(inputList, 0);

    /* word embedding */
    embeddingBig = MMul(words, model.embeddingW);

    /* input of the first hidden layer */
    hidden = Split(embeddingBig, 0, n - 1);
    hidden = Merge(hidden, 2, 0);

    /* hidden layers */
    for(int i = 0; i < depth; i++)
        hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];

    /* output layer */
    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
/*
dump the model to disk
>> fn - the file to keep the model in
>> model - the fnn model
*/
void Dump(const char * fn, FNNModel &model)
{
    FILE * file = fopen(fn, "wb");
    CheckErrors(file, "Cannot open the model file");

    model.embeddingW.Dump(file, "embedding w:");

    for(int i = 0; i < model.hDepth; i++){
        char name[MAX_NAME_LENGTH];
        sprintf(name, "hidden %d w:", i);
        model.hiddenW[i].Dump(file, name);

        sprintf(name, "hidden %d b:", i);
        model.hiddenB[i].Dump(file, name);
    }

    model.outputW.Dump(file, "output w:");
    model.outputB.Dump(file, "output b:");

    fclose(file);

    XPRINT(0, stderr, "[INFO] model saved\n");
}
/*
read the model from disk
>> fn - the model file to read from
>> model - the fnn model
*/
void Read(const char * fn, FNNModel &model)
{
    FILE * file = fopen(fn, "rb");
    CheckErrors(file, "Cannot open the model file");

    model.embeddingW.Read(file, "embedding w:");

    for(int i = 0; i < model.hDepth; i++){
        char name[MAX_NAME_LENGTH];
        sprintf(name, "hidden %d w:", i);
        model.hiddenW[i].Read(file, name);

        sprintf(name, "hidden %d b:", i);
        model.hiddenB[i].Read(file, name);
    }

    model.outputW.Read(file, "output w:");
    model.outputB.Read(file, "output b:");

    fclose(file);

    XPRINT(0, stderr, "[INFO] model loaded\n");
}
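/* A minimal save/load sketch ("fnn.model" is a hypothetical file name): the model should
   be configured and initialized with the same settings on both sides, since Read fills
   the existing model tensors:

       Dump("fnn.model", model);     // write the trained parameters
       ...
       Init(model);                  // same -n/-esize/-hsize/-vsize configuration
       Read("fnn.model", model);     // load the parameters back
*/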
/*
test the model
>> test - test data file
>> result - where to keep the result
>> model - the fnn model
*/
void Test(const char * test, const char * result, FNNModel &model)
{
    int wordCount = 0;
    int sentCount = 0;
    float loss = 0;

    NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];

    double startT = GetClockSec();

    /* data files */
    FILE * file = fopen(test, "rb");
    CheckErrors(file, "Cannot read the test file");
    FILE * ofile = fopen(result, "wb");
    CheckErrors(ofile, "Cannot open the output file");

    int ngramNum = 1;
    while(ngramNum > 0){

        /* load a minibatch of ngrams */
        ngramNum = LoadNGrams(file, model.n, ngrams, 1, MAX_INT);

        if(ngramNum <= 0)
            break;

        /* previous n - 1 words */
        XTensor inputs[MAX_N_GRAM];

        /* the predicted word */
        XTensor output;

        /* the gold standard */
        XTensor gold;

        /* make the input tensor for position i */
        for(int i = 0; i < model.n - 1; i++)
            MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);

        /* make the gold tensor */
        MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);

        if(!autoDiff){
            /* prepare an empty network for building the fnn */
            FNNNet net;

            /* forward computation */
            Forward(inputs, output, model, net);
        }
        else{
            /* this is implemented by the gather function */
            ForwardAutoDiff(ngrams, ngramNum, output, model);
            output = Log(output);

            /* this is implemented by the multiply function */
            //ForwardAutoDiff(inputs, output, model);
        }

        /* prediction probabilities */
        XTensor probs;
        InitTensor1DV2(&probs, ngramNum);

        /* get probabilities */
        float prob = GetProb(output, gold, &probs);

        /* dump the test result */
        for(int i = 0; i < model.n - 1; i++)
            fprintf(ofile, "%d ", ngrams[0].words[i]);
        for(int i = 0; i < ngramNum; i++)
            fprintf(ofile, "%d ", ngrams[i].words[model.n - 1]);
        fprintf(ofile, "||| ");
        for(int i = 0; i < model.n - 1; i++)
            fprintf(ofile, "<s> ");
        for(int i = 0; i < ngramNum; i++)
            fprintf(ofile, "%f ", probs.Get1D(i));
        fprintf(ofile, "||| %f\n", prob);

        loss += -prob;
        wordCount += ngramNum;
        sentCount += 1;
    }

    fclose(file);

    double elapsed = GetClockSec() - startT;

    XPRINT1(0, stderr, "[INFO] ppl=%.2f\n", exp(loss / wordCount));
    XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)\n",
            elapsed, sentCount, wordCount);

    delete[] ngrams;
}
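/* A reading aid for the statistics printed above: "prob" is the summed log-probability
   of one sentence, so "loss" accumulates the negative log-likelihood of the test data,
   and
       ppl = exp(loss / wordCount)
   is the standard per-word perplexity over the wordCount scored ngrams. */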
};
source/tensor/Main.cpp
View file @ 2c4061e9
...
...
@@ -28,7 +28,7 @@
#include <time.h>
#include "XTensor.h"
#include "XDevice.h"
#include "./test/Test.h"
//#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
...
...
@@ -44,7 +44,7 @@ void LittleTest();
void T2TTest();
void T2TTest2();
void PowerTest();
void Tests();

int main( int argc, const char ** argv )
{
    //PowerTest();
...
...
@@ -63,7 +63,7 @@ int main( int argc, const char ** argv )
//return 0;
    if(argc > 1 && !strcmp(argv[1], "-test"))
        Test();
        Tests();
    else{
        fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
        fprintf(stderr, "use of tensors. All you need is to ...\n\n");
...
...
@@ -75,219 +75,223 @@ int main( int argc, const char ** argv )
    return 0;
}
void myRead(XTensor * tensor, const char * filename, const char * label)
{
    FILE * file = fopen(filename, "rb");
    if(file == NULL)
        printf("%s\n", filename);

    tensor->Read(file, label);
}

void myDump(XTensor * tensor, const char * filename, const char * label)
{
    FILE * file = fopen(filename, "wb");
    if(file == NULL)
        printf("%s\n", filename);

    tensor->Dump(file, label);
}
void PowerTest()
{
    XTensor input;
    XTensor output;

    InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
    InitTensor2D(&output, 256, 10000, X_FLOAT, 0);

    myRead(&input, "1.txt", "");

    _Power(&input, &output, 2);
    output.Dump(stderr, "", 200);
}
void SmallTest()
{
    XTensor a;
    XTensor b;
    XTensor c;
    XTensor d;

    InitTensor2D(&a, 2, 2);
    InitTensor2D(&b, 2, 2);
    a.SetZeroAll();
    b.SetZeroAll();
    a.Set2D(1.0F, 0, 0);
    a.Set2D(2.0F, 1, 1);

    b = Sum(a, Multiply(a, a));

    /* this is prohibited !!!!!!!!!!!!! */
    //XTensor c = a * b + a;
    //XTensor d = a + b + c.Lin(0.5F);

    c = a * b + a;
    d = a + b + c.Lin(0.5F);

    XLink::CheckNetwork(&d);
    //XLink::ShowNetwork(stderr, &d);

    a.Dump(stderr, "a:");
    b.Dump(stderr, "b:");
    c.Dump(stderr, "c:");
    d.Dump(stderr, "d:");
}
void TransposeTest()
{
    XTensor a;
    XTensor b;

    int I = 2;
    int J = 3;

    InitTensor4D(&a, 2, 3, 4, 5);

    int * dims = new int[a.order];
    memcpy(dims, a.dimSize, sizeof(int) * a.order);
    dims[I] = a.dimSize[J];
    dims[J] = a.dimSize[I];

    InitTensor(&b, 4, dims);

void Tests()
{
    a.SetZeroAll();
    b.SetZeroAll();

    float * data = new float[a.unitNum];
    for(int i = 0; i < a.unitNum; i++)
        data[i] = (float)i;

    a.SetData(data, a.unitNum, 0);

    _Transpose(&a, &b, I, J);
    b.Dump(stderr, "b:");

    delete[] data;
}
void LittleTest()
{
    int a = 5000;
    int b = 100000;
    int c = a*b;
    printf("%d\n", c);

    exit(1);
}
void T2TTest()
{
    XTensor * input;
    XTensor * weight;
    XTensor * output;
    XTensor * gold;
    XTensor * dedy;
    XTensor * dedx;
    XTensor * dedxTmp;
    XTensor * dedw;
    XTensor * padding;

    DTYPE loss;

    int * dimSize = new int[2];
    dimSize[0] = 256;
    dimSize[1] = 10001;

    int * dimSize2 = new int[3];
    dimSize2[0] = 2;
    dimSize2[1] = 31;
    dimSize2[2] = 256;

    int * dimSize3 = new int[3];
    dimSize3[0] = 2;
    dimSize3[1] = 31;
    dimSize3[2] = 10001;

    int * dimSize4 = new int[2];
    dimSize4[0] = 2;
    dimSize4[1] = 31;

    input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
    weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
    dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
    gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);

    //weight = NewTensor(2, dimSize);
    //dedw = NewTensor(2, dimSize);
    //input = NewTensor(3, dimSize2);
    //gold = NewTensor(3, dimSize3);
    //output = NewTensor(3, dimSize3);
    //dedy = NewTensor(3, dimSize3);
    //dedx = NewTensor(3, dimSize3);
    //dedxTmp = NewTensor(3, dimSize3);
    //padding = NewTensor(2, dimSize4);

    myRead(input, "x.txt", "x");
    myRead(weight, "w.txt", "w");
    myRead(gold, "gold.txt", "gold");
    myRead(padding, "padding.txt", "padding");

    XTensor inter;
    inter = MMul(*input, *weight);

    _Softmax(&inter, output, 2);

    //_LogMe(output);
    loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);

    printf("loss: %f\n", loss);

    _CrossEntropyBackward(dedy, output, gold, NULL);
    //_CrossEntropyBackward(dedy, output, gold, NULL, padding);

    myDump(dedy, "dedy.txt", "dedy");

    _SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
    _Sub(output, gold, dedxTmp);

    myDump(dedx, "dedx.txt", "dedx");
    dedx->Dump(stderr, "dedx", 200);
    dedxTmp->Dump(stderr, "dedxTmp", 200);

    input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
    dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));

    _MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);

    myDump(dedw, "dedw.txt", "dedw");
}
void T2TTest2()
{
    int dimSize[3];
    dimSize[0] = 161;
    dimSize[1] = 47;
    dimSize[2] = 10001;
    XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
    //XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);

    //myRead(probs, "probs.txt", " ");
    _SetDataFixedFloat(probs, 1.0F);

    probs->Reshape(1, probs->unitNum);

    DTYPE sum = _ReduceSumAll(probs);
    printf("%e\n", sum);

    //XTensor tmp;
    //tmp = IsNonZero(*probs);
    //DTYPE nonZeroNum = ReduceSumAll(tmp);
    //printf("%f\n", nonZeroNum);
    //
    //DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
    //printf("%e\n", gpu);
}
//void myRead(XTensor * tensor, const char * filename, const char * label)
//{
// FILE * file = fopen(filename, "rb");
// if(file == NULL)
// printf("%s\n", filename);
// tensor->Read(file, label);
//}
//
//void myDump(XTensor * tensor, const char * filename, const char * label)
//{
// FILE * file = fopen(filename, "wb");
// if(file == NULL)
// printf("%s\n", filename);
// tensor->Dump(file, label);
//}
//
//void PowerTest()
//{
// XTensor input;
// XTensor output;
// InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
// InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
// myRead(&input, "1.txt", "");
//
// _Power(&input, &output, 2);
// output.Dump(stderr, "", 200);
//}
//
//void SmallTest()
//{
// XTensor a;
// XTensor b;
// XTensor c;
// XTensor d;
//
// InitTensor2D(&a, 2, 2);
// InitTensor2D(&b, 2, 2);
// a.SetZeroAll();
// b.SetZeroAll();
// a.Set2D(1.0F, 0, 0);
// a.Set2D(2.0F, 1, 1);
//
// b = Sum(a, Multiply(a, a));
//
// /* this is prohibited !!!!!!!!!!!!! */
// //XTensor c = a * b + a;
// //XTensor d = a + b + c.Lin(0.5F);
//
// c = a * b + a;
// d = a + b + c.Lin(0.5F);
//
// XLink::CheckNetwork(&d);
// //XLink::ShowNetwork(stderr, &d);
//
// a.Dump(stderr, "a:");
// b.Dump(stderr, "b:");
// c.Dump(stderr, "c:");
// d.Dump(stderr, "d:");
//}
//
//void TransposeTest()
//{
// XTensor a;
// XTensor b;
//
// int I = 2;
// int J = 3;
//
// InitTensor4D(&a, 2, 3, 4, 5);
//
// int * dims = new int[a.order];
// memcpy(dims, a.dimSize, sizeof(int) * a.order);
// dims[I] = a.dimSize[J];
// dims[J] = a.dimSize[I];
//
// InitTensor(&b, 4, dims);
//
// a.SetZeroAll();
// b.SetZeroAll();
//
// float * data = new float[a.unitNum];
// for(int i = 0; i < a.unitNum; i++)
// data[i] = (float)i;
//
// a.SetData(data, a.unitNum, 0);
//
// _Transpose(&a, &b, I, J);
// b.Dump(stderr, "b:");
//
// delete[] data;
//}
//
//void LittleTest()
//{
// int a = 5000;
// int b = 100000;
// int c = a*b;
// printf("%d\n", c);
//
// exit(1);
//}
//
//void T2TTest()
//{
// XTensor * input;
// XTensor * weight;
// XTensor * output;
// XTensor * gold;
// XTensor * dedy;
// XTensor * dedx;
// XTensor * dedxTmp;
// XTensor * dedw;
// XTensor * padding;
//
// DTYPE loss;
//
// int * dimSize = new int[2];
// dimSize[0] = 256;
// dimSize[1] = 10001;
//
// int * dimSize2 = new int[3];
// dimSize2[0] = 2;
// dimSize2[1] = 31;
// dimSize2[2] = 256;
//
// int * dimSize3 = new int[3];
// dimSize3[0] = 2;
// dimSize3[1] = 31;
// dimSize3[2] = 10001;
//
// int * dimSize4 = new int[2];
// dimSize4[0] = 2;
// dimSize4[1] = 31;
//
// input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
// weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//
// //weight = NewTensor(2, dimSize);
// //dedw = NewTensor(2, dimSize);
// //input = NewTensor(3, dimSize2);
// //gold = NewTensor(3, dimSize3);
// //output = NewTensor(3, dimSize3);
// //dedy = NewTensor(3, dimSize3);
// //dedx = NewTensor(3, dimSize3);
// //dedxTmp = NewTensor(3, dimSize3);
// //padding = NewTensor(2, dimSize4);
//
// myRead(input, "x.txt", "x");
// myRead(weight, "w.txt", "w");
// myRead(gold, "gold.txt", "gold");
// myRead(padding, "padding.txt", "padding");
//
// XTensor inter;
// inter = MMul(*input, *weight);
//
// _Softmax(&inter, output, 2);
//
// //_LogMe(output);
// loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
//
// printf("loss: %f\n", loss);
//
// _CrossEntropyBackward(dedy, output, gold, NULL);
// //_CrossEntropyBackward(dedy, output, gold, NULL, padding);
//
// myDump(dedy, "dedy.txt", "dedy");
//
// _SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
// _Sub(output, gold, dedxTmp);
//
// myDump(dedx, "dedx.txt", "dedx");
// dedx->Dump(stderr, "dedx", 200);
// dedxTmp->Dump(stderr, "dedxTmp", 200);
//
// input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
// dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
//
// _MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
//
// myDump(dedw, "dedw.txt", "dedw");
//}
//
//void T2TTest2()
//{
// int dimSize[3];
// dimSize[0] = 161;
// dimSize[1] = 47;
// dimSize[2] = 10001;
// XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
// //XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//
// //myRead(probs, "probs.txt", " ");
// _SetDataFixedFloat(probs, 1.0F);
//
// probs->Reshape(1, probs->unitNum);
//
// DTYPE sum = _ReduceSumAll(probs);
// printf("%e\n", sum);
//
// //XTensor tmp;
// //tmp = IsNonZero(*probs);
// //DTYPE nonZeroNum = ReduceSumAll(tmp);
// //printf("%f\n", nonZeroNum);
// //
// //DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//
// //printf("%e\n", gpu);
//}
source/tensor/loss/CrossEntropy.cu
View file @ 2c4061e9
...
...
@@ -196,17 +196,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
delete[] dims;
}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
int num = dedy->unitNum / dedy->GetDim(n);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
}
    //if(padding != NULL) {
    //    XTensor * tmp = NewTensor(padding);
    //    _IsNonZero(padding, tmp);
    //    int nonZeroNum = (int)_ReduceSumAll(tmp);
    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
    //    delete tmp;
    //}
    //else {
    //    int num = dedy->unitNum / dedy->GetDim(n);
    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
    //}
}
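// A reading aid for the scaling above: dividing dE/dy by the number of scored positions
// (the count of non-zero entries of "padding" when a padding tensor is given, otherwise
// dedy->unitNum / dedy->GetDim(n)) makes the gradient match a loss that is averaged over
// those positions, i.e. the REDUCE_MEAN convention used by _CrossEntropyFast.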
...
...