Commit 2c4061e9 by ltb

Fixed FNNLM on xiao's branch

parent 3800528b
@@ -24,7 +24,6 @@
#include "../tensor/XUtility.h" #include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h" #include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h" #include "../tensor/core/CHeader.h"
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h" #include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h" #include "../sample/transformer/Transformer.h"
......
@@ -15,7 +15,7 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* *
* This is a simple implementation of the feed-forward network-based language * This is a simple implementation of the feed-forward network-based language
* model (FNNLM). See more details about FNNLM in * model (FNNLM). See more details about FNNLM in
@@ -32,6 +32,7 @@
#include "../../tensor/XDevice.h" #include "../../tensor/XDevice.h"
#include "../../tensor/function/FHeader.h" #include "../../tensor/function/FHeader.h"
#include "../../network/XNet.h" #include "../../network/XNet.h"
#include "../../tensor/core/math/ScaleAndShift.h"
namespace fnnlm namespace fnnlm
{ {
@@ -39,50 +40,50 @@ namespace fnnlm
#define MAX_NAME_LENGTH 1024 #define MAX_NAME_LENGTH 1024
#define MAX_LINE_LENGTH_HERE 1024 * 32 #define MAX_LINE_LENGTH_HERE 1024 * 32
char trainFN[MAX_NAME_LENGTH] = ""; // file name of the training data char trainFN[MAX_NAME_LENGTH] = ""; // file name of the training data
char modelFN[MAX_NAME_LENGTH] = ""; // file name of the FNN model char modelFN[MAX_NAME_LENGTH] = ""; // file name of the FNN model
char testFN[MAX_NAME_LENGTH] = ""; // file name of the test data char testFN[MAX_NAME_LENGTH] = ""; // file name of the test data
char outputFN[MAX_NAME_LENGTH] = ""; // file name of the result data char outputFN[MAX_NAME_LENGTH] = ""; // file name of the result data
float learningRate = 0.01F; // learning rate float learningRate = 0.01F; // learning rate
int nStep = 10000000; // max learning steps (or model updates) int nStep = 10000000; // max learning steps (or model updates)
int nEpoch = 10; // max training epochs int nEpoch = 10; // max training epochs
float minmax = 0.08F; // range [-p,p] for parameter initialization float minmax = 0.08F; // range [-p,p] for parameter initialization
int sentBatch = 0; // batch size at the sentence level int sentBatch = 0; // batch size at the sentence level
int wordBatch = 1; // batch size at the word level int wordBatch = 1; // batch size at the word level
bool shuffled = false; // shuffled the training data file or not bool shuffled = false; // shuffled the training data file or not
bool autoDiff = false; // indicator of automatic differentiation bool autoDiff = false; // indicator of automatic differentiation
void LoadArgs(int argc, const char ** argv, FNNModel &model); void LoadArgs(int argc, const char ** argv, FNNModel &model);
void Init(FNNModel &model); void Init(FNNModel &model);
void Check(FNNModel &model); void Check(FNNModel &model);
void Copy(FNNModel &tgt, FNNModel &src); void Copy(FNNModel &tgt, FNNModel &src);
void Clear(FNNModel &model, bool isNodeGrad); void Clear(FNNModel &model, bool isNodeGrad);
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model); void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model);
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model); void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model);
void Train(const char * train, bool isShuffled, FNNModel &model); void Train(const char * train, bool isShuffled, FNNModel &model);
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad); void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad);
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs = NULL); float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs = NULL);
void Dump(const char * fn, FNNModel &model); void Dump(const char * fn, FNNModel &model);
void Read(const char * fn, FNNModel &model); void Read(const char * fn, FNNModel &model);
void Test(const char * test, const char * result, FNNModel &model); void Test(const char * test, const char * result, FNNModel &model);
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum); int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum);
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem); int itemNum, int devID, XMem * mem);
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem); void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem);
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net); void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss, void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net); FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model); void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model); void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/* /*
entry of the program entry of the program
>> argc - number of the arguments >> argc - number of the arguments
>> argv - pointers to the arguments >> argv - pointers to the arguments
<< return - error code << return - error code
arguments: arguments:
-train S: specify training data file name -train S: specify training data file name
-model S: specify model file name -model S: specify model file name
-test S: specify test data file name -test S: specify test data file name
@@ -110,10 +111,10 @@ arguments:
E.g., E.g.,
0 29 2 11 1 0 29 2 11 1
might be a line of the file. might be a line of the file.
*/ */
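// Added clarification: the options documented above are parsed by LoadArgs() below. A
// hypothetical invocation (the binary name, "-fnnlm" switch and file names are placeholders;
// only the option flags come from the code) might look like:
//
//   ./NiuTensor -fnnlm -train train.idx -model lm.bin -test test.idx -output result.txt \
//               -n 3 -vsize 10000 -esize 100 -hdepth 1 -hsize 256 -lrate 0.1 -nepoch 10 -autodiff
//
// where every line of train.idx / test.idx is one sentence of word indices, e.g. "0 29 2 11 1".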
int FNNLMMain(int argc, const char ** argv) int FNNLMMain(int argc, const char ** argv)
{ {
if(argc == 0) if (argc == 0)
return 1; return 1;
FNNModel model; FNNModel model;
@@ -128,127 +129,127 @@ int FNNLMMain(int argc, const char ** argv)
Init(model); Init(model);
/* learn model parameters */ /* learn model parameters */
if(strcmp(trainFN, "")) if (strcmp(trainFN, ""))
Train(trainFN, shuffled, model); Train(trainFN, shuffled, model);
/* save the final model */ /* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, "")) if (strcmp(modelFN, "") && strcmp(trainFN, ""))
Dump(modelFN, model); Dump(modelFN, model);
/* load the model if necessary */ /* load the model if necessary */
if(strcmp(modelFN, "")) if (strcmp(modelFN, ""))
Read(modelFN, model); Read(modelFN, model);
/* test the model on the new data */ /* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, "")) if (strcmp(testFN, "") && strcmp(outputFN, ""))
Test(testFN, outputFN, model); Test(testFN, outputFN, model);
return 0; return 0;
} }
/* /*
load arguments load arguments
>> argc - number of the arguments >> argc - number of the arguments
>> argv - pointers to the arguments >> argv - pointers to the arguments
>> model - the fnn model >> model - the fnn model
*/ */
void LoadArgs(int argc, const char ** argv, FNNModel &model) void LoadArgs(int argc, const char ** argv, FNNModel &model)
{ {
fprintf(stderr, "args:\n"); fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){ for (int i = 0; i < argc; i++) {
if(!strcmp(argv[i], "-train") && i + 1 < argc){ if (!strcmp(argv[i], "-train") && i + 1 < argc) {
strcpy(trainFN, argv[i + 1]); strcpy(trainFN, argv[i + 1]);
fprintf(stderr, " -train=%s\n", argv[i + 1]); fprintf(stderr, " -train=%s\n", argv[i + 1]);
} }
if(!strcmp(argv[i], "-model") && i + 1 < argc){ if (!strcmp(argv[i], "-model") && i + 1 < argc) {
strcpy(modelFN, argv[i + 1]); strcpy(modelFN, argv[i + 1]);
fprintf(stderr, " -model=%s\n", argv[i + 1]); fprintf(stderr, " -model=%s\n", argv[i + 1]);
} }
if(!strcmp(argv[i], "-test") && i + 1 < argc){ if (!strcmp(argv[i], "-test") && i + 1 < argc) {
strcpy(testFN, argv[i + 1]); strcpy(testFN, argv[i + 1]);
fprintf(stderr, " -test=%s\n", argv[i + 1]); fprintf(stderr, " -test=%s\n", argv[i + 1]);
} }
if(!strcmp(argv[i], "-output") && i + 1 < argc){ if (!strcmp(argv[i], "-output") && i + 1 < argc) {
strcpy(outputFN, argv[i + 1]); strcpy(outputFN, argv[i + 1]);
fprintf(stderr, " -output=%s\n", argv[i + 1]); fprintf(stderr, " -output=%s\n", argv[i + 1]);
} }
if(!strcmp(argv[i], "-n") && i + 1 < argc){ if (!strcmp(argv[i], "-n") && i + 1 < argc) {
model.n = atoi(argv[i + 1]); model.n = atoi(argv[i + 1]);
fprintf(stderr, " -n=%d\n", model.n); fprintf(stderr, " -n=%d\n", model.n);
} }
if(!strcmp(argv[i], "-esize") && i + 1 < argc){ if (!strcmp(argv[i], "-esize") && i + 1 < argc) {
model.eSize = atoi(argv[i + 1]); model.eSize = atoi(argv[i + 1]);
fprintf(stderr, " -esize=%d\n", model.eSize); fprintf(stderr, " -esize=%d\n", model.eSize);
} }
if(!strcmp(argv[i], "-vsize") && i + 1 < argc){ if (!strcmp(argv[i], "-vsize") && i + 1 < argc) {
model.vSize = atoi(argv[i + 1]); model.vSize = atoi(argv[i + 1]);
fprintf(stderr, " -vsize=%d\n", model.vSize); fprintf(stderr, " -vsize=%d\n", model.vSize);
} }
if(!strcmp(argv[i], "-hdepth") && i + 1 < argc){ if (!strcmp(argv[i], "-hdepth") && i + 1 < argc) {
model.hDepth = atoi(argv[i + 1]); model.hDepth = atoi(argv[i + 1]);
fprintf(stderr, " -hdepth=%d\n", model.hDepth); fprintf(stderr, " -hdepth=%d\n", model.hDepth);
} }
if(!strcmp(argv[i], "-hsize") && i + 1 < argc){ if (!strcmp(argv[i], "-hsize") && i + 1 < argc) {
model.hSize = atoi(argv[i + 1]); model.hSize = atoi(argv[i + 1]);
fprintf(stderr, " -hsize=%d\n", model.hSize); fprintf(stderr, " -hsize=%d\n", model.hSize);
} }
if(!strcmp(argv[i], "-lrate") && i + 1 < argc){ if (!strcmp(argv[i], "-lrate") && i + 1 < argc) {
learningRate = (float)atof(argv[i + 1]); learningRate = (float)atof(argv[i + 1]);
fprintf(stderr, " -lrate=%f\n", learningRate); fprintf(stderr, " -lrate=%f\n", learningRate);
} }
if(!strcmp(argv[i], "-nstep") && i + 1 < argc){ if (!strcmp(argv[i], "-nstep") && i + 1 < argc) {
nStep = atoi(argv[i + 1]); nStep = atoi(argv[i + 1]);
fprintf(stderr, " -nstep=%d\n", nStep); fprintf(stderr, " -nstep=%d\n", nStep);
} }
if(!strcmp(argv[i], "-nepoch") && i + 1 < argc){ if (!strcmp(argv[i], "-nepoch") && i + 1 < argc) {
nEpoch = atoi(argv[i + 1]); nEpoch = atoi(argv[i + 1]);
fprintf(stderr, " -nepoch=%d\n", nEpoch); fprintf(stderr, " -nepoch=%d\n", nEpoch);
} }
if(!strcmp(argv[i], "-minmax") && i + 1 < argc){ if (!strcmp(argv[i], "-minmax") && i + 1 < argc) {
minmax = (float)fabs(atof(argv[i + 1])); minmax = (float)fabs(atof(argv[i + 1]));
fprintf(stderr, " -minmax=%f\n", minmax); fprintf(stderr, " -minmax=%f\n", minmax);
} }
if(!strcmp(argv[i], "-batch") && i + 1 < argc){ if (!strcmp(argv[i], "-batch") && i + 1 < argc) {
sentBatch = atoi(argv[i + 1]); sentBatch = atoi(argv[i + 1]);
fprintf(stderr, " -batch=%d\n", sentBatch); fprintf(stderr, " -batch=%d\n", sentBatch);
} }
if(!strcmp(argv[i], "-wbatch") && i + 1 < argc){ if (!strcmp(argv[i], "-wbatch") && i + 1 < argc) {
wordBatch = atoi(argv[i + 1]); wordBatch = atoi(argv[i + 1]);
fprintf(stderr, " -wbatch=%d\n", wordBatch); fprintf(stderr, " -wbatch=%d\n", wordBatch);
} }
if(!strcmp(argv[i], "-shuffle")){ if (!strcmp(argv[i], "-shuffle")) {
shuffled = true; shuffled = true;
fprintf(stderr, " -shuffle=true\n"); fprintf(stderr, " -shuffle=true\n");
} }
if(!strcmp(argv[i], "-autodiff")){ if (!strcmp(argv[i], "-autodiff")) {
autoDiff = true; autoDiff = true;
fprintf(stderr, " -autodiff=true\n"); fprintf(stderr, " -autodiff=true\n");
} }
if(!strcmp(argv[i], "-dev") && i + 1 < argc){ if (!strcmp(argv[i], "-dev") && i + 1 < argc) {
model.devID = atoi(argv[i + 1]); model.devID = atoi(argv[i + 1]);
fprintf(stderr, " -dev=%d\n", model.devID); fprintf(stderr, " -dev=%d\n", model.devID);
} }
} }
for(int i = 0; i < argc; i++){ for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], "-mempool")) if (!strcmp(argv[i], "-mempool"))
model.mem = new XMem(model.devID); model.mem = new XMem(model.devID);
} }
} }
/* check model settings */ /* check model settings */
void Check(FNNModel &model) void Check(FNNModel &model)
{ {
CheckErrors(model.n > 0 && model.n <= MAX_N_GRAM, "The LM order is out of range (use -n)!"); CheckErrors(model.n > 0 && model.n <= MAX_N_GRAM, "The LM order is out of range (use -n)!");
CheckErrors(model.vSize > 0, "no vocabulary size found (use -vsize)!"); CheckErrors(model.vSize > 0, "no vocabulary size found (use -vsize)!");
CheckErrors(model.eSize > 0, "no embedding size found (use -esize)!"); CheckErrors(model.eSize > 0, "no embedding size found (use -esize)!");
} }
/* make a hard copy of the fnn model */ /* make a hard copy of the fnn model */
void Copy(FNNModel &tgt, FNNModel &src) void Copy(FNNModel &tgt, FNNModel &src)
{ {
InitTensorV2(&tgt.embeddingW, &src.embeddingW); InitTensorV2(&tgt.embeddingW, &src.embeddingW);
for(int i = 0; i < MAX_HIDDEN_NUM; i++){ for (int i = 0; i < MAX_HIDDEN_NUM; i++) {
InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]); InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]); InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
} }
@@ -262,33 +263,33 @@ void Copy(FNNModel &tgt, FNNModel &src)
tgt.vSize = src.vSize; tgt.vSize = src.vSize;
tgt.devID = src.devID; tgt.devID = src.devID;
tgt.useMemPool = src.useMemPool; tgt.useMemPool = src.useMemPool;
if(src.mem != NULL){ if (src.mem != NULL) {
tgt.mem = new XMem(src.mem->devID, src.mem->mode, tgt.mem = new XMem(src.mem->devID, src.mem->mode,
src.mem->maxBlockSize, src.mem->blockNum, src.mem->maxBlockSize, src.mem->blockNum,
src.mem->bufSize); src.mem->bufSize);
} }
} }
/* /*
reset model parameters reset model parameters
>> model - the model whose parameter (gradient) is set to 0 >> model - the model whose parameter (gradient) is set to 0
>> isNodeGrad - indicates whether the tensor node keeps the >> isNodeGrad - indicates whether the tensor node keeps the
gradient information gradient information
*/ */
void Clear(FNNModel &model, bool isNodeGrad) void Clear(FNNModel &model, bool isNodeGrad)
{ {
if (isNodeGrad) { if (isNodeGrad) {
if(model.embeddingW.grad != NULL) if (model.embeddingW.grad != NULL)
model.embeddingW.grad->SetZeroAll(); model.embeddingW.grad->SetZeroAll();
for (int i = 0; i < MAX_HIDDEN_NUM; i++) { for (int i = 0; i < MAX_HIDDEN_NUM; i++) {
if(model.hiddenW[i].grad != NULL) if (model.hiddenW[i].grad != NULL)
model.hiddenW[i].grad->SetZeroAll(); model.hiddenW[i].grad->SetZeroAll();
if(model.hiddenB[i].grad != NULL) if (model.hiddenB[i].grad != NULL)
model.hiddenB[i].grad->SetZeroAll(); model.hiddenB[i].grad->SetZeroAll();
} }
if(model.outputW.grad != NULL) if (model.outputW.grad != NULL)
model.outputW.grad->SetZeroAll(); model.outputW.grad->SetZeroAll();
if(model.outputB.grad != NULL) if (model.outputB.grad != NULL)
model.outputB.grad->SetZeroAll(); model.outputB.grad->SetZeroAll();
} }
else { else {
@@ -300,76 +301,78 @@ void Clear(FNNModel &model, bool isNodeGrad)
model.outputW.SetZeroAll(); model.outputW.SetZeroAll();
model.outputB.SetZeroAll(); model.outputB.SetZeroAll();
} }
} }
/* /*
initialize a 1d tensor using the fnn model setting initialize a 1d tensor using the fnn model setting
>> tensor - the tensor to initialize >> tensor - the tensor to initialize
>> num - number of items >> num - number of items
>> model - the fnn model >> model - the fnn model
*/ */
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model) void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
{ {
InitTensor1DV2(&tensor, num, X_FLOAT, model.devID); InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
} }
/* /*
initialize a 2d tensor using the fnn model setting initialize a 2d tensor using the fnn model setting
>> tensor - the tensor to initialize >> tensor - the tensor to initialize
>> rowNum - number of rows >> rowNum - number of rows
>> colNum - number of columns >> colNum - number of columns
>> model - the fnn model >> model - the fnn model
*/ */
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model) void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{ {
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID); InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
} }
/* initialize the model */ /* initialize the model */
void Init(FNNModel &model) void Init(FNNModel &model)
{ {
/* create embedding parameter matrix: vSize * eSize */ /* create embedding parameter matrix: vSize * eSize */
InitModelTensor2D(model.embeddingW, model.vSize, model.eSize, model); InitModelTensor2D(model.embeddingW, model.vSize, model.eSize, model);
model.embeddingW.SetVarFlag();
/* create hidden layer parameter matrices */ /* create hidden layer parameter matrices */
for(int i = 0; i < model.hDepth; i++){ for (int i = 0; i < model.hDepth; i++) {
/* hidden layer parameter matrix: (n-1)eSize * hsize if it is the first layer /* hidden layer parameter matrix: (n-1)eSize * hsize if it is the first layer
hsize * hsize otherwise */ hsize * hsize otherwise */
if(i == 0) if (i == 0)
InitModelTensor2D(model.hiddenW[i], (model.n - 1) * model.eSize, model.hSize, model); InitModelTensor2D(model.hiddenW[i], (model.n - 1) * model.eSize, model.hSize, model);
else else
InitModelTensor2D(model.hiddenW[i], model.hSize, model.hSize, model); InitModelTensor2D(model.hiddenW[i], model.hSize, model.hSize, model);
model.hiddenW[i].SetVarFlag();
/* bias term: a row vector of hSize entries */ /* bias term: a row vector of hSize entries */
InitModelTensor1D(model.hiddenB[i], model.hSize, model); InitModelTensor1D(model.hiddenB[i], model.hSize, model);
model.hiddenB[i].SetVarFlag();
} }
/* create the output layer parameter matrix and bias term */ /* create the output layer parameter matrix and bias term */
int iSize = model.hDepth == 0 ? (model.n - 1) * model.eSize : model.hSize; int iSize = model.hDepth == 0 ? (model.n - 1) * model.eSize : model.hSize;
InitModelTensor2D(model.outputW, iSize, model.vSize, model); InitModelTensor2D(model.outputW, iSize, model.vSize, model);
InitModelTensor1D(model.outputB, model.vSize, model); InitModelTensor1D(model.outputB, model.vSize, model);
model.outputW.SetVarFlag();
model.outputB.SetVarFlag();
/* then, we initialize model parameters using a uniform distribution in range /* then, we initialize model parameters using a uniform distribution in range
of [-minmax, minmax] */ of [-minmax, minmax] */
model.embeddingW.SetDataRand(-minmax, minmax); model.embeddingW.SetDataRand(-minmax, minmax);
model.outputW.SetDataRand(-minmax, minmax); model.outputW.SetDataRand(-minmax, minmax);
for(int i = 0; i < model.hDepth; i++) for (int i = 0; i < model.hDepth; i++)
model.hiddenW[i].SetDataRand(-minmax, minmax); model.hiddenW[i].SetDataRand(-minmax, minmax);
/* all bias terms are set to zero */ /* all bias terms are set to zero */
model.outputB.SetZeroAll(); model.outputB.SetZeroAll();
for(int i = 0; i < model.hDepth; i++) for (int i = 0; i < model.hDepth; i++)
model.hiddenB[i].SetZeroAll(); model.hiddenB[i].SetZeroAll();
} }
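// Added clarification: the SetVarFlag() calls introduced above presumably mark the parameter
// tensors as variables so that the autodiff engine (XNet, used in the -autodiff path of
// Train()) keeps gradients for them during Backward(); Update(..., isNodeGrad = true) then
// reads those gradients from the tensors' .grad fields.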
/* /*
shuffle lines of the file shuffle lines of the file
>> srcFile - the source file to shuffle >> srcFile - the source file to shuffle
>> tgtFile - the resulting file >> tgtFile - the resulting file
*/ */
void Shuffle(const char * srcFile, const char * tgtFile) void Shuffle(const char * srcFile, const char * tgtFile)
{ {
char * line = new char[MAX_LINE_LENGTH_HERE]; char * line = new char[MAX_LINE_LENGTH_HERE];
#ifndef WIN32 #ifndef WIN32
sprintf(line, "shuf %s > %s", srcFile, tgtFile); sprintf(line, "shuf %s > %s", srcFile, tgtFile);
@@ -379,23 +382,23 @@ void Shuffle(const char * srcFile, const char * tgtFile)
#endif #endif
delete[] line; delete[] line;
} }
char lineBuf[MAX_LINE_LENGTH_HERE]; char lineBuf[MAX_LINE_LENGTH_HERE];
int wordBuf[MAX_LINE_LENGTH_HERE]; int wordBuf[MAX_LINE_LENGTH_HERE];
/* /*
train the model with the standard SGD method train the model with the standard SGD method
>> train - training data file >> train - training data file
>> isShuffled - shuffle the data file or not >> isShuffled - shuffle the data file or not
>> model - the fnn model >> model - the fnn model
*/ */
void Train(const char * train, bool isShuffled, FNNModel &model) void Train(const char * train, bool isShuffled, FNNModel &model)
{ {
char name[MAX_NAME_LENGTH]; char name[MAX_NAME_LENGTH];
/* shuffle the data */ /* shuffle the data */
if(isShuffled){ if (isShuffled) {
sprintf(name, "%s-tmp", train); sprintf(name, "%s-tmp", train);
Shuffle(train, name); Shuffle(train, name);
} }
@@ -420,9 +423,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
XNet autoDiffer; XNet autoDiffer;
double startT = GetClockSec(); double startT = GetClockSec();
/* iterate for a number of epochs */ /* iterate for a number of epochs */
for(epoch = 0; epoch < nEpoch; epoch++){ for (epoch = 0; epoch < nEpoch; epoch++) {
/* data file */ /* data file */
FILE * file = fopen(name, "rb"); FILE * file = fopen(name, "rb");
@@ -432,7 +434,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
loss = 0; loss = 0;
ngramNum = 1; ngramNum = 1;
while(ngramNum > 0){ while (ngramNum > 0) {
/* load a minibatch of ngrams */ /* load a minibatch of ngrams */
ngramNum = LoadNGrams(file, model.n, ngrams, sentBatch, wordBatch); ngramNum = LoadNGrams(file, model.n, ngrams, sentBatch, wordBatch);
@@ -453,13 +455,13 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
XTensor lossTensor; XTensor lossTensor;
/* make the input tensor for position i */ /* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++) for (int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem); MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
/* make the gold tensor */ /* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem); MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
if(!autoDiff){ if (!autoDiff) {
/* prepare an empty network for building the fnn */ /* prepare an empty network for building the fnn */
FNNNet net; FNNNet net;
@@ -469,15 +471,13 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* forward computation */ /* forward computation */
Forward(inputs, output, model, net); Forward(inputs, output, model, net);
/* backward computation to obtain gradients */ /* backward computation to obtain gradients */
Backward(inputs, output, gold, CROSSENTROPY, model, grad, net); Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
/* update model parameters */ /* update model parameters */
Update(model, grad, learningRate, false); Update(model, grad, learningRate, false);
} }
else{ else {
/* gradient = 0 */ /* gradient = 0 */
Clear(model, true); Clear(model, true);
@@ -489,6 +489,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* this is implemented by multiply function */ /* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model); //ForwardAutoDiff(inputs, output, model);
lossTensor = CrossEntropy(output, gold); lossTensor = CrossEntropy(output, gold);
output.Dump(stderr, "output:",10);
gold.Dump(stderr, "gold:", 10);
lossTensor.Dump(stderr, "lossTensor:",10);
/* automatic differentiation */ /* automatic differentiation */
autoDiffer.Backward(lossTensor); autoDiffer.Backward(lossTensor);
@@ -500,14 +503,15 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* get probabilities */ /* get probabilities */
float prob = GetProb(output, gold); float prob = GetProb(output, gold);
if (autoDiff) {
prob = ReduceSumAll(lossTensor); prob = -ReduceSumAll(lossTensor);
}
loss += prob; //printf("prob:%f", prob);
loss += -prob;
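// Added clarification: GetProb() returns the summed log-probability of the gold words
// (a value <= 0), while ReduceSumAll(lossTensor) is the summed cross entropy, i.e. a
// positive negative-log-likelihood. Negating it above keeps "prob" on the log-probability
// convention in both branches, so "loss += -prob" always accumulates a positive NLL,
// which Test() later turns into perplexity via exp(loss / wordCount).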
wordCount += ngramNum; wordCount += ngramNum;
wordCountTotal += ngramNum; wordCountTotal += ngramNum;
if(++step >= nStep){ if (++step >= nStep) {
isEnd = true; isEnd = true;
break; break;
} }
@@ -521,7 +525,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
fclose(file); fclose(file);
if(isEnd) if (isEnd)
break; break;
Test(testFN, outputFN, model); Test(testFN, outputFN, model);
@@ -535,17 +539,17 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
elapsed, step, epoch); elapsed, step, epoch);
delete[] ngrams; delete[] ngrams;
} }
/* /*
update the model parameters using the delta rule update the model parameters using the delta rule
>> model - the model to update >> model - the model to update
>> grad - gradients >> grad - gradients
>> epsilon - learning rate >> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node >> isNodeGrad - indicates whether the gradient is associated with the node
*/ */
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad) void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
{ {
TensorList paraList(10); TensorList paraList(10);
TensorList gradList(10); TensorList gradList(10);
@@ -559,7 +563,7 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
paraList.Add(&model.embeddingW); paraList.Add(&model.embeddingW);
if(!isNodeGrad){ if (!isNodeGrad) {
gradList.Add(&grad.outputW); gradList.Add(&grad.outputW);
gradList.Add(&grad.outputB); gradList.Add(&grad.outputB);
@@ -567,10 +571,10 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
gradList.Add(&grad.hiddenW[i]); gradList.Add(&grad.hiddenW[i]);
gradList.Add(&grad.hiddenB[i]); gradList.Add(&grad.hiddenB[i]);
} }
; ;
gradList.Add(&grad.embeddingW); gradList.Add(&grad.embeddingW);
} }
else{ else {
gradList.Add(model.outputW.grad); gradList.Add(model.outputW.grad);
gradList.Add(model.outputB.grad); gradList.Add(model.outputB.grad);
@@ -592,17 +596,17 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
/* the delta rule */ /* the delta rule */
_Sum(para, paraGrad, para, -epsilon); _Sum(para, paraGrad, para, -epsilon);
} }
} }
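// Added clarification: the "delta rule" above is plain SGD. For each parameter tensor the
// call _Sum(para, paraGrad, para, -epsilon) computes, in effect,
//     para <- para + (-epsilon) * paraGrad  =  para - learningRate * gradient,
// with epsilon = learningRate passed in from Train().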
/* /*
get prediction probabilities of the gold words get prediction probabilities of the gold words
>> output - output probabilities >> output - output probabilities
>> gold - gold standard >> gold - gold standard
>> wordProbs - probability of each word >> wordProbs - probability of each word
<< return - probability of the batch << return - probability of the batch
*/ */
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs) float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{ {
XTensor probs; XTensor probs;
InitTensorV2(&probs, &output); InitTensorV2(&probs, &output);
@@ -613,7 +617,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
XTensor wprobs; XTensor wprobs;
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID); InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
_ReduceSum(&probs, &wprobs, 1); _ReduceSum(&probs, &wprobs, 1);
if(wordProbs != NULL) if (wordProbs != NULL)
_CopyValues(&wprobs, wordProbs); _CopyValues(&wprobs, wordProbs);
/* reshape the tensor to fit it into the reduce procedure /* reshape the tensor to fit it into the reduce procedure
@@ -629,34 +633,34 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
_ReduceSum(&probs, &result, 1); _ReduceSum(&probs, &result, 1);
return result.Get1D(0); return result.Get1D(0);
} }
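// Added worked example: the value returned here is the summed log-probability of the gold
// words in the batch. Train() and Test() accumulate its negation into "loss", and Test()
// reports perplexity as exp(loss / wordCount); e.g. an average of 5.0 nats per word gives
// a perplexity of exp(5.0), roughly 148.4.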
int pin = 0; int pin = 0;
int wordBufCount = 0; int wordBufCount = 0;
/* /*
load a minibatch of ngrams load a minibatch of ngrams
>> file - data file >> file - data file
>> n - order of the language model >> n - order of the language model
>> ngrams - the loaded ngrams >> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch >> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch >> wordNum - maximum words kept in the minibatch
*/ */
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum) int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
{ {
int num = 0; int num = 0;
int lineNum = 0; int lineNum = 0;
while(pin > 0 || fgets(lineBuf, MAX_LINE_LENGTH_HERE - 1, file)){ while (pin > 0 || fgets(lineBuf, MAX_LINE_LENGTH_HERE - 1, file)) {
if(pin <= 0){ if (pin <= 0) {
int len = (int)strlen(lineBuf); int len = (int)strlen(lineBuf);
while(lineBuf[len - 1] == '\r' || lineBuf[len - 1] == '\n'){ while (lineBuf[len - 1] == '\r' || lineBuf[len - 1] == '\n') {
lineBuf[len - 1] = 0; lineBuf[len - 1] = 0;
len--; len--;
} }
len = (int)strlen(lineBuf); len = (int)strlen(lineBuf);
if(len == 0) if (len == 0)
continue; continue;
/* how many characters are in a word */ /* how many characters are in a word */
@@ -666,9 +670,9 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
int wNum = 0; int wNum = 0;
int i = 0; int i = 0;
for(i = pin; i < len; i++){ for (i = pin; i < len; i++) {
/* load word (id) separated by space or tab */ /* load word (id) separated by space or tab */
if((lineBuf[i] == ' ' || lineBuf[i] == '\t') && wSize > 0){ if ((lineBuf[i] == ' ' || lineBuf[i] == '\t') && wSize > 0) {
lineBuf[i] = 0; lineBuf[i] = 0;
wordBuf[wNum++] = atoi(lineBuf + i - wSize); wordBuf[wNum++] = atoi(lineBuf + i - wSize);
wSize = 0; wSize = 0;
@@ -677,7 +681,7 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
wSize++; wSize++;
} }
if(wSize > 0) if (wSize > 0)
wordBuf[wNum++] = atoi(lineBuf + i - wSize); wordBuf[wNum++] = atoi(lineBuf + i - wSize);
wordBufCount = wNum; wordBufCount = wNum;
@@ -689,69 +693,69 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
int i = -MAX_INT; int i = -MAX_INT;
/* create ngrams */ /* create ngrams */
for(i = MAX(pin, n - 1); i < wordBufCount - 1; i++){ for (i = MAX(pin, n - 1); i < wordBufCount - 1; i++) {
memcpy(ngrams[num++].words, wordBuf + i - n + 1, sizeof(int) * n); memcpy(ngrams[num++].words, wordBuf + i - n + 1, sizeof(int) * n);
if(num >= wordNum) if (num >= wordNum)
break; break;
} }
/* set a finished flag if we reach the end of the sentence*/ /* set a finished flag if we reach the end of the sentence*/
if(i >= wordBufCount - 1){ if (i >= wordBufCount - 1) {
pin = 0; pin = 0;
wordBufCount = 0; wordBufCount = 0;
} }
/* record where to start next time if we break in the middle */ /* record where to start next time if we break in the middle */
else{ else {
pin = i + 1; pin = i + 1;
} }
if((sentNum > 0 && lineNum >= sentNum) || num >= wordNum) if ((sentNum > 0 && lineNum >= sentNum) || num >= wordNum)
break; break;
} }
return num; return num;
} }
/* /*
make a 2d tensor in zero-one representation make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise. The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize >> tensor - the tensor to initialize
>> rowNum - number of rows >> rowNum - number of rows
>> colNum - number of columns >> colNum - number of columns
>> rows - row index >> rows - row index
>> cols - column index >> cols - column index
>> itemNum - number of non-zero items >> itemNum - number of non-zero items
>> devID - device id >> devID - device id
>> mem - memory pool >> mem - memory pool
*/ */
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem) int itemNum, int devID, XMem * mem)
{ {
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID); InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
tensor.SetZeroAll(); tensor.SetZeroAll();
/* set none-zero cells */ /* set none-zero cells */
for(int i = 0; i < itemNum; i++) for (int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]); tensor.Set2D(1.0F, rows[i], cols[i]);
} }
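// Added example: with rowNum = 3, colNum = vSize, rows = {0, 1, 2} and cols = {29, 2, 11},
// the tensor gets exactly one 1.0 in each row (at columns 29, 2 and 11), i.e. a batch of
// one-hot word vectors; this is how MakeWordBatch() below uses it, one row per ngram.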
/* /*
make a tensor that encodes a batch of words make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words >> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch >> ngrams - the ngram batch
>> ngramNum - batch size >> ngramNum - batch size
>> n - indicates which word is encoded for each ngram >> n - indicates which word is encoded for each ngram
>> vSize - vocabulary size >> vSize - vocabulary size
>> devID - device id >> devID - device id
>> mem - memory pool >> mem - memory pool
*/ */
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem) void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem)
{ {
int * rows = new int[ngramNum]; int * rows = new int[ngramNum];
int * cols = new int[ngramNum]; int * cols = new int[ngramNum];
for(int i = 0; i < ngramNum; i++){ for (int i = 0; i < ngramNum; i++) {
rows[i] = i; rows[i] = i;
cols[i] = ngrams[i].words[n]; cols[i] = ngrams[i].words[n];
} }
@@ -760,31 +764,31 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
delete[] rows; delete[] rows;
delete[] cols; delete[] cols;
} }
/* /*
forward procedure forward procedure
>> inputs - input word representations >> inputs - input word representations
>> output - output probability >> output - output probability
>> model - the fnn model >> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process >> net - the network that keeps the internal tensors generated in the process
*/ */
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net) void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
{ {
int batchSize = -1; int batchSize = -1;
int n = model.n; int n = model.n;
int depth = model.hDepth; int depth = model.hDepth;
TensorList eList(n - 1); TensorList eList(n - 1);
/* previous n - 1 words */ /* previous n - 1 words */
for(int i = 0; i < n - 1; i++){ for (int i = 0; i < n - 1; i++) {
XTensor &input = inputs[i]; XTensor &input = inputs[i];
XTensor &w = model.embeddingW; XTensor &w = model.embeddingW;
XTensor &embedding = net.embeddings[i]; XTensor &embedding = net.embeddings[i];
if(batchSize == -1) if (batchSize == -1)
batchSize = input.dimSize[0]; batchSize = input.dimSize[0];
else{ else {
CheckErrors(batchSize == input.dimSize[0], "Wrong input word representations!"); CheckErrors(batchSize == input.dimSize[0], "Wrong input word representations!");
} }
@@ -804,7 +808,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
_Concatenate(&eList, &net.embeddingCat, 1); _Concatenate(&eList, &net.embeddingCat, 1);
/* go over each hidden layer */ /* go over each hidden layer */
for(int i = 0; i < depth; i++){ for (int i = 0; i < depth; i++) {
XTensor &h_pre = i == 0 ? net.embeddingCat : net.hiddens[i - 1]; XTensor &h_pre = i == 0 ? net.embeddingCat : net.hiddens[i - 1];
XTensor &w = model.hiddenW[i]; XTensor &w = model.hiddenW[i];
XTensor &b = model.hiddenB[i]; XTensor &b = model.hiddenB[i];
@@ -860,21 +864,21 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = softmax(s) */ /* y = softmax(s) */
_LogSoftmax(&s, &y, 1); _LogSoftmax(&s, &y, 1);
} }
} }
/* /*
backward procedure backward procedure
>> inputs - input word representations >> inputs - input word representations
>> output - output probability >> output - output probability
>> gold - gold standard >> gold - gold standard
>> loss - loss function name >> loss - loss function name
>> model - the fnn model >> model - the fnn model
>> grad - the model that keeps the gradient information >> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process >> net - the network that keeps the internal tensors generated in the process
*/ */
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss, void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net) FNNModel &model, FNNModel &grad, FNNNet &net)
{ {
int batchSize = output.GetDim(0); int batchSize = output.GetDim(0);
int n = model.n; int n = model.n;
int depth = model.hDepth; int depth = model.hDepth;
@@ -979,17 +983,17 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
delete dedy; delete dedy;
} }
} }
/* /*
forward process (with tensor connections) (this is implemented by gather function) forward process (with tensor connections) (this is implemented by gather function)
>> ngrams - the loaded ngrams >> ngrams - the loaded ngrams
>> batch - the tensor encoding a batch of words >> batch - the tensor encoding a batch of words
>> output - output probability >> output - output probability
>> model - the fnn model >> model - the fnn model
*/ */
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model) void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{ {
int n = model.n; int n = model.n;
int depth = model.hDepth; int depth = model.hDepth;
@@ -998,11 +1002,11 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
XTensor hidden; XTensor hidden;
XTensor b; XTensor b;
int size = batch * (n-1); int size = batch * (n - 1);
int * index = new int[size]; int * index = new int[size];
for(int i = 0; i < batch; i++){ for (int i = 0; i < batch; i++) {
for (int j = 0; j < n-1; j++){ for (int j = 0; j < n - 1; j++) {
int a = i * (n - 1) + j; int a = i * (n - 1) + j;
index[a] = ngrams[i].words[j]; index[a] = ngrams[i].words[j];
} }
@@ -1010,7 +1014,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
InitTensor1DV2(&words, size, X_INT, model.devID); InitTensor1DV2(&words, size, X_INT, model.devID);
words.SetData(index, size); words.SetData(index, size);
words.Dump(stderr, "word:", 10);
embeddingBig = Gather(model.embeddingW, words); embeddingBig = Gather(model.embeddingW, words);
delete[] index; delete[] index;
@@ -1018,26 +1022,26 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
int dimSize[2]; int dimSize[2];
dimSize[0] = embeddingBig.GetDim(0) / (n - 1); dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1); dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
embeddingBig.Dump(stderr, "embeddingBig:", 10);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize); hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
hidden.Dump(stderr, "hidden-0:", 10);
/* hidden layers */ /* hidden layers */
for(int i = 0; i < depth; i++) for (int i = 0; i < depth; i++)
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]); hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
hidden.Dump(stderr, "hidden-1:", 10);
/* output layer */ /* output layer */
//output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1); output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
} }
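// Added shape walkthrough: words holds batch*(n-1) word ids; Gather() produces a
// [batch*(n-1), eSize] embedding matrix; Reshape() packs the n-1 context embeddings of each
// example into one row, giving [batch, (n-1)*eSize]; each hidden layer maps this to
// [batch, hSize]; and the output layer yields [batch, vSize] probabilities via Softmax
// (the LogSoftmax call is kept above as a commented-out alternative).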
/* /*
forward process (with tensor connections) (this is implemented by multiply function) forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations >> inputs - input word representations
>> output - output probability >> output - output probability
>> model - the fnn model >> model - the fnn model
*/ */
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model) void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
{ {
int n = model.n; int n = model.n;
int depth = model.hDepth; int depth = model.hDepth;
@@ -1047,7 +1051,7 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
XTensor b; XTensor b;
TensorList inputList(n - 1); TensorList inputList(n - 1);
for(int i = 0; i < n - 1; i++) for (int i = 0; i < n - 1; i++)
inputList.Add(inputs + i); inputList.Add(inputs + i);
/* represent n - 1 words in one tensor */ /* represent n - 1 words in one tensor */
@@ -1061,21 +1065,21 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
hidden = Merge(hidden, 2, 0); hidden = Merge(hidden, 2, 0);
/* hidden layers */ /* hidden layers */
for(int i = 0; i < depth; i++) for (int i = 0; i < depth; i++)
hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]; hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];
/* output layer */ /* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
} }
/* /*
dump the model to the disk space dump the model to the disk space
>> fn - where to keep the model >> fn - where to keep the model
>> model - the fnn model >> model - the fnn model
*/ */
void Dump(const char * fn, FNNModel &model) void Dump(const char * fn, FNNModel &model)
{ {
FILE * file = fopen(fn, "wb"); FILE * file = fopen(fn, "wb");
CheckErrors(file, "Cannot open the model file"); CheckErrors(file, "Cannot open the model file");
@@ -1094,15 +1098,15 @@ void Dump(const char * fn, FNNModel &model)
fclose(file); fclose(file);
XPRINT(0, stderr, "[INFO] model saved\n"); XPRINT(0, stderr, "[INFO] model saved\n");
} }
/* /*
read the model from the disk space read the model from the disk space
>> fn - where to keep the model >> fn - where to keep the model
>> model - the fnn model >> model - the fnn model
*/ */
void Read(const char * fn, FNNModel &model) void Read(const char * fn, FNNModel &model)
{ {
FILE * file = fopen(fn, "rb"); FILE * file = fopen(fn, "rb");
CheckErrors(file, "Cannot open the model file"); CheckErrors(file, "Cannot open the model file");
@@ -1121,16 +1125,16 @@ void Read(const char * fn, FNNModel &model)
fclose(file); fclose(file);
XPRINT(0, stderr, "[INFO] model loaded\n"); XPRINT(0, stderr, "[INFO] model loaded\n");
} }
/* /*
test the model test the model
>> test - test data file >> test - test data file
>> result - where to keep the result >> result - where to keep the result
>> model - the fnn model >> model - the fnn model
*/ */
void Test(const char * test, const char * result, FNNModel &model) void Test(const char * test, const char * result, FNNModel &model)
{ {
int wordCount = 0; int wordCount = 0;
int sentCount = 0; int sentCount = 0;
float loss = 0; float loss = 0;
@@ -1173,14 +1177,13 @@ void Test(const char * test, const char * result, FNNModel &model)
if (!autoDiff) { if (!autoDiff) {
/* prepare an empty network for building the fnn */ /* prepare an empty network for building the fnn */
FNNNet net; FNNNet net;
/* forward computation */ /* forward computation */
Forward(inputs, output, model, net); Forward(inputs, output, model, net);
} }
else { else {
/* this is implemented by gather function */ /* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model); ForwardAutoDiff(ngrams, ngramNum, output, model);
output = Log(output);
/* this is implemented by multiply function */ /* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model); //ForwardAutoDiff(inputs, output, model);
} }
@@ -1213,11 +1216,11 @@ void Test(const char * test, const char * result, FNNModel &model)
double elapsed = GetClockSec() - startT; double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] ppl=%.2f\n", exp(loss/wordCount)); XPRINT1(0, stderr, "[INFO] ppl=%.2f\n", exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)\n", XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)\n",
elapsed, sentCount, wordCount); elapsed, sentCount, wordCount);
delete[] ngrams; delete[] ngrams;
} }
}; };
@@ -28,7 +28,7 @@
#include <time.h> #include <time.h>
#include "XTensor.h" #include "XTensor.h"
#include "XDevice.h" #include "XDevice.h"
#include "./test/Test.h" //#include "./test/Test.h"
#include "./core/CHeader.h" #include "./core/CHeader.h"
#include "./loss/CrossEntropy.h" #include "./loss/CrossEntropy.h"
@@ -44,7 +44,7 @@ void LittleTest();
void T2TTest(); void T2TTest();
void T2TTest2(); void T2TTest2();
void PowerTest(); void PowerTest();
void Tests();
int main( int argc, const char ** argv ) int main( int argc, const char ** argv )
{ {
//PowerTest(); //PowerTest();
@@ -63,7 +63,7 @@ int main( int argc, const char ** argv )
//return 0; //return 0;
if(argc > 1 && !strcmp(argv[1], "-test")) if(argc > 1 && !strcmp(argv[1], "-test"))
Test(); Tests();
else{ else{
fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n"); fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
fprintf(stderr, "use of tensors. All you need is to ... \n\n"); fprintf(stderr, "use of tensors. All you need is to ... \n\n");
@@ -75,219 +75,223 @@ int main( int argc, const char ** argv )
return 0; return 0;
} }
void myRead(XTensor * tensor, const char * filename, const char * label) void Tests() {
{
FILE * file = fopen(filename, "rb");
if(file == NULL)
printf("%s\n", filename);
tensor->Read(file, label);
}
void myDump(XTensor * tensor, const char * filename, const char * label)
{
FILE * file = fopen(filename, "wb");
if(file == NULL)
printf("%s\n", filename);
tensor->Dump(file, label);
}
void PowerTest()
{
XTensor input;
XTensor output;
InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
myRead(&input, "1.txt", "");
_Power(&input, &output, 2);
output.Dump(stderr, "", 200);
}
void SmallTest()
{
XTensor a;
XTensor b;
XTensor c;
XTensor d;
InitTensor2D(&a, 2, 2);
InitTensor2D(&b, 2, 2);
a.SetZeroAll();
b.SetZeroAll();
a.Set2D(1.0F, 0, 0);
a.Set2D(2.0F, 1, 1);
b = Sum(a, Multiply(a, a));
/* this is prohibited !!!!!!!!!!!!! */
//XTensor c = a * b + a;
//XTensor d = a + b + c.Lin(0.5F);
c = a * b + a;
d = a + b + c.Lin(0.5F);
XLink::CheckNetwork(&d);
//XLink::ShowNetwork(stderr, &d);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
c.Dump(stderr, "c:");
d.Dump(stderr, "d:");
}
void TransposeTest()
{
XTensor a;
XTensor b;
int I = 2;
int J = 3;
InitTensor4D(&a, 2, 3, 4, 5);
int * dims = new int[a.order];
memcpy(dims, a.dimSize, sizeof(int) * a.order);
dims[I] = a.dimSize[J];
dims[J] = a.dimSize[I];
InitTensor(&b, 4, dims);
a.SetZeroAll();
b.SetZeroAll();
float * data = new float[a.unitNum];
for(int i = 0; i < a.unitNum; i++)
data[i] = (float)i;
a.SetData(data, a.unitNum, 0);
_Transpose(&a, &b, I, J);
b.Dump(stderr, "b:");
delete[] data;
}
void LittleTest()
{
int a = 5000;
int b = 100000;
int c = a*b;
printf("%d\n", c);
exit(1);
} }
void T2TTest() //void myRead(XTensor * tensor, const char * filename, const char * label)
{ //{
XTensor * input; // FILE * file = fopen(filename, "rb");
XTensor * weight; // if(file == NULL)
XTensor * output; // printf("%s\n", filename);
XTensor * gold; // tensor->Read(file, label);
XTensor * dedy; //}
XTensor * dedx; //
XTensor * dedxTmp; //void myDump(XTensor * tensor, const char * filename, const char * label)
XTensor * dedw; //{
XTensor * padding; // FILE * file = fopen(filename, "wb");
// if(file == NULL)
DTYPE loss; // printf("%s\n", filename);
// tensor->Dump(file, label);
int * dimSize = new int[2]; //}
dimSize[0] = 256; //
dimSize[1] = 10001; //void PowerTest()
//{
int * dimSize2 = new int[3]; // XTensor input;
dimSize2[0] = 2; // XTensor output;
dimSize2[1] = 31; // InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
dimSize2[2] = 256; // InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
// myRead(&input, "1.txt", "");
int * dimSize3 = new int[3]; //
dimSize3[0] = 2; // _Power(&input, &output, 2);
dimSize3[1] = 31; // output.Dump(stderr, "", 200);
dimSize3[2] = 10001; //}
//
int * dimSize4 = new int[2]; //void SmallTest()
dimSize4[0] = 2; //{
dimSize4[1] = 31; // XTensor a;
// XTensor b;
input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0); // XTensor c;
weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0); // XTensor d;
dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0); //
gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // InitTensor2D(&a, 2, 2);
output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // InitTensor2D(&b, 2, 2);
dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // a.SetZeroAll();
dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // b.SetZeroAll();
dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // a.Set2D(1.0F, 0, 0);
padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0); // a.Set2D(2.0F, 1, 1);
//
//weight = NewTensor(2, dimSize); // b = Sum(a, Multiply(a, a));
//dedw = NewTensor(2, dimSize); //
//input = NewTensor(3, dimSize2); // /* this is prohibited !!!!!!!!!!!!! */
//gold = NewTensor(3, dimSize3); // //XTensor c = a * b + a;
//output = NewTensor(3, dimSize3); // //XTensor d = a + b + c.Lin(0.5F);
//dedy = NewTensor(3, dimSize3); //
//dedx = NewTensor(3, dimSize3); // c = a * b + a;
//dedxTmp = NewTensor(3, dimSize3); // d = a + b + c.Lin(0.5F);
//padding = NewTensor(2, dimSize4); //
// XLink::CheckNetwork(&d);
myRead(input, "x.txt", "x"); // //XLink::ShowNetwork(stderr, &d);
myRead(weight, "w.txt", "w"); //
myRead(gold, "gold.txt", "gold"); // a.Dump(stderr, "a:");
myRead(padding, "padding.txt", "padding"); // b.Dump(stderr, "b:");
// c.Dump(stderr, "c:");
XTensor inter; // d.Dump(stderr, "d:");
inter = MMul(*input, *weight); //}
//
_Softmax(&inter, output, 2); //void TransposeTest()
//{
//_LogMe(output); // XTensor a;
loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding); // XTensor b;
//
printf("loss: %f\n", loss); // int I = 2;
// int J = 3;
_CrossEntropyBackward(dedy, output, gold, NULL); //
//_CrossEntropyBackward(dedy, output, gold, NULL, padding); // InitTensor4D(&a, 2, 3, 4, 5);
//
myDump(dedy, "dedy.txt", "dedy"); // int * dims = new int[a.order];
// memcpy(dims, a.dimSize, sizeof(int) * a.order);
_SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS); // dims[I] = a.dimSize[J];
_Sub(output, gold, dedxTmp); // dims[J] = a.dimSize[I];
//
myDump(dedx, "dedx.txt", "dedx"); // InitTensor(&b, 4, dims);
dedx->Dump(stderr, "dedx", 200); //
dedxTmp->Dump(stderr, "dedxTmp", 200); // a.SetZeroAll();
// b.SetZeroAll();
input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1)); //
dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1)); // float * data = new float[a.unitNum];
// for(int i = 0; i < a.unitNum; i++)
_MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw); // data[i] = (float)i;
//
myDump(dedw, "dedw.txt", "dedw"); // a.SetData(data, a.unitNum, 0);
} //
// _Transpose(&a, &b, I, J);
void T2TTest2() // b.Dump(stderr, "b:");
{ //
int dimSize[3]; // delete[] data;
dimSize[0] = 161; //}
dimSize[1] = 47; //
dimSize[2] = 10001; //void LittleTest()
XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0); //{
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1); // int a = 5000;
// int b = 100000;
//myRead(probs, "probs.txt", " "); // int c = a*b;
_SetDataFixedFloat(probs, 1.0F); // printf("%d\n", c);
//
probs->Reshape(1, probs->unitNum); // exit(1);
//}
DTYPE sum = _ReduceSumAll(probs); //
printf("%e\n", sum); //void T2TTest()
//{
//XTensor tmp; // XTensor * input;
//tmp = IsNonZero(*probs); // XTensor * weight;
//DTYPE nonZeroNum = ReduceSumAll(tmp); // XTensor * output;
//printf("%f\n", nonZeroNum); // XTensor * gold;
// // XTensor * dedy;
//DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0); // XTensor * dedx;
// XTensor * dedxTmp;
//printf("%e\n", gpu); // XTensor * dedw;
} // XTensor * padding;
//
// DTYPE loss;
//
// int * dimSize = new int[2];
// dimSize[0] = 256;
// dimSize[1] = 10001;
//
// int * dimSize2 = new int[3];
// dimSize2[0] = 2;
// dimSize2[1] = 31;
// dimSize2[2] = 256;
//
// int * dimSize3 = new int[3];
// dimSize3[0] = 2;
// dimSize3[1] = 31;
// dimSize3[2] = 10001;
//
// int * dimSize4 = new int[2];
// dimSize4[0] = 2;
// dimSize4[1] = 31;
//
// input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
// weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//
// //weight = NewTensor(2, dimSize);
// //dedw = NewTensor(2, dimSize);
// //input = NewTensor(3, dimSize2);
// //gold = NewTensor(3, dimSize3);
// //output = NewTensor(3, dimSize3);
// //dedy = NewTensor(3, dimSize3);
// //dedx = NewTensor(3, dimSize3);
// //dedxTmp = NewTensor(3, dimSize3);
// //padding = NewTensor(2, dimSize4);
//
// myRead(input, "x.txt", "x");
// myRead(weight, "w.txt", "w");
// myRead(gold, "gold.txt", "gold");
// myRead(padding, "padding.txt", "padding");
//
// XTensor inter;
// inter = MMul(*input, *weight);
//
// _Softmax(&inter, output, 2);
//
// //_LogMe(output);
// loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
//
// printf("loss: %f\n", loss);
//
// _CrossEntropyBackward(dedy, output, gold, NULL);
// //_CrossEntropyBackward(dedy, output, gold, NULL, padding);
//
// myDump(dedy, "dedy.txt", "dedy");
//
// _SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
// _Sub(output, gold, dedxTmp);
//
// myDump(dedx, "dedx.txt", "dedx");
// dedx->Dump(stderr, "dedx", 200);
// dedxTmp->Dump(stderr, "dedxTmp", 200);
//
// input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
// dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
//
// _MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
//
// myDump(dedw, "dedw.txt", "dedw");
//}
//
//void T2TTest2()
//{
// int dimSize[3];
// dimSize[0] = 161;
// dimSize[1] = 47;
// dimSize[2] = 10001;
// XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
// //XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//
// //myRead(probs, "probs.txt", " ");
// _SetDataFixedFloat(probs, 1.0F);
//
// probs->Reshape(1, probs->unitNum);
//
// DTYPE sum = _ReduceSumAll(probs);
// printf("%e\n", sum);
//
// //XTensor tmp;
// //tmp = IsNonZero(*probs);
// //DTYPE nonZeroNum = ReduceSumAll(tmp);
// //printf("%f\n", nonZeroNum);
// //
// //DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//
// //printf("%e\n", gpu);
//}
@@ -196,17 +196,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
delete[] dims; delete[] dims;
} }
if(padding != NULL) { //if(padding != NULL) {
XTensor * tmp = NewTensor(padding); // XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp); // _IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp); // int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); // _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp; // delete tmp;
} //}
else { //else {
int num = dedy->unitNum / dedy->GetDim(n); // int num = dedy->unitNum / dedy->GetDim(n);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num); // _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
} //}
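// Added note: with the block above commented out, dedy is no longer scaled by
// 1/(number of non-padded items) (or 1/num), so the backward pass now propagates the summed
// cross-entropy gradient instead of its mean over the batch.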
} }
......