Commit 2c4061e9 by ltb

fixed FNNLM of branch of xiao

parent 3800528b
@@ -24,7 +24,6 @@
 #include "../tensor/XUtility.h"
 #include "../tensor/function/FHeader.h"
 #include "../tensor/core/CHeader.h"
-#include "../tensor/test/Test.h"
 #include "../sample/fnnlm/FNNLM.h"
 #include "../sample/transformer/Transformer.h"
...
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,15 +15,15 @@
 * limitations under the License.
 */

/*
 *
 * This is a simple implementation of the feed-forward network-based language
 * model (FNNLM). See more details about FNNLM in
 * "A Neural Probabilistic Language Model" by Bengio et al.
 * Journal of Machine Learning Research 3 (2003) 1137-1155
 *
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
 */

#include <math.h>
#include "FNNLM.h"
@@ -32,6 +32,7 @@
#include "../../tensor/XDevice.h"
#include "../../tensor/function/FHeader.h"
#include "../../network/XNet.h"
+#include "../../tensor/core/math/ScaleAndShift.h"

namespace fnnlm
{
@@ -39,1185 +40,1187 @@ namespace fnnlm
#define MAX_NAME_LENGTH 1024
#define MAX_LINE_LENGTH_HERE 1024 * 32
char trainFN[MAX_NAME_LENGTH] = "";   // file name of the training data
char modelFN[MAX_NAME_LENGTH] = "";   // file name of the FNN model
char testFN[MAX_NAME_LENGTH] = "";    // file name of the test data
char outputFN[MAX_NAME_LENGTH] = "";  // file name of the result data

float learningRate = 0.01F;           // learning rate
int nStep = 10000000;                 // max learning steps (or model updates)
int nEpoch = 10;                      // max training epochs
float minmax = 0.08F;                 // range [-p,p] for parameter initialization
int sentBatch = 0;                    // batch size at the sentence level
int wordBatch = 1;                    // batch size at the word level
bool shuffled = false;                // shuffled the training data file or not
bool autoDiff = false;                // indicator of automatic differentiation

void LoadArgs(int argc, const char ** argv, FNNModel &model);
void Init(FNNModel &model);
void Check(FNNModel &model);
void Copy(FNNModel &tgt, FNNModel &src);
void Clear(FNNModel &model, bool isNodeGrad);
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model);
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model);
void Train(const char * train, bool isShuffled, FNNModel &model);
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad);
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs = NULL);
void Dump(const char * fn, FNNModel &model);
void Read(const char * fn, FNNModel &model);
void Test(const char * test, const char * result, FNNModel &model);
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum);
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
                         int itemNum, int devID, XMem * mem);
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem);
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
              FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/*
entry of the program
>> argc - number of the arguments
>> argv - pointers to the arguments
<< return - error code

arguments:
 -train S: specify training data file name
 -model S: specify model file name
 -test S: specify test data file name
 -output S: specify result data file name
 -n D: order of the language model
 -eSize D: embedding size
 -vSize D: vocabulary size
 -hdepth D: number of stacked hidden layers
 -hsize D: size of each hidden layer
 -lrate F: learning rate
 -nstep D: maximum number of model updates
 -nepoch D: maximum number of training epochs
 -batch D: batch size (how many sentences)
 -wbatch D: batch size at the word level
            (how many words)
 -shuffle: shuffle the training data
 -devid D: the id of the device used
           -1: CPU, >=0: GPUs
 -mempool: use memory pools for memory management
 -autodiff: use automatic differentiation for training

where S=string, D=integer and F=float.
All words in the training and test data files
are encoded as their indices in the vocabulary.
E.g.,
0 29 2 11 1
might be a line of the file.
*/
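/* Editor's note: a minimal sketch of driving this entry point directly from
   C++. The file names and hyper-parameter values below are placeholders, and
   the lowercase flag spellings follow what LoadArgs actually checks
   ("-esize", "-vsize", "-dev") rather than the mixed-case names listed above. */
int FNNLMMain(int argc, const char ** argv);

static int RunFNNLMExample()
{
    const char * args[] = {
        "-train",  "train.txt",     /* hypothetical training data (word ids per line) */
        "-model",  "fnnlm.model",   /* where to save the model */
        "-test",   "test.txt",      /* hypothetical test data */
        "-output", "result.txt",    /* where to write the test result */
        "-n",      "3",             /* trigram LM */
        "-esize",  "100",
        "-vsize",  "10000",
        "-hdepth", "1",
        "-hsize",  "256",
        "-lrate",  "0.1",
        "-nepoch", "5",
        "-autodiff"
    };
    return FNNLMMain((int)(sizeof(args) / sizeof(args[0])), args);
}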
int FNNLMMain(int argc, const char ** argv)
{
    if (argc == 0)
        return 1;

    FNNModel model;

    /* load arguments */
    LoadArgs(argc, argv, model);

    /* check the setting */
    Check(model);

    /* initialize model parameters */
    Init(model);

    /* learn model parameters */
    if (strcmp(trainFN, ""))
        Train(trainFN, shuffled, model);

    /* save the final model */
    if (strcmp(modelFN, "") && strcmp(trainFN, ""))
        Dump(modelFN, model);

    /* load the model if necessary */
    if (strcmp(modelFN, ""))
        Read(modelFN, model);

    /* test the model on the new data */
    if (strcmp(testFN, "") && strcmp(outputFN, ""))
        Test(testFN, outputFN, model);

    return 0;
}
/*
load arguments
>> argc - number of the arguments
>> argv - pointers to the arguments
>> model - the fnn model
*/
void LoadArgs(int argc, const char ** argv, FNNModel &model)
{
    fprintf(stderr, "args:\n");
    for (int i = 0; i < argc; i++) {
        if (!strcmp(argv[i], "-train") && i + 1 < argc) {
            strcpy(trainFN, argv[i + 1]);
            fprintf(stderr, " -train=%s\n", argv[i + 1]);
        }
        if (!strcmp(argv[i], "-model") && i + 1 < argc) {
            strcpy(modelFN, argv[i + 1]);
            fprintf(stderr, " -model=%s\n", argv[i + 1]);
        }
        if (!strcmp(argv[i], "-test") && i + 1 < argc) {
            strcpy(testFN, argv[i + 1]);
            fprintf(stderr, " -test=%s\n", argv[i + 1]);
        }
        if (!strcmp(argv[i], "-output") && i + 1 < argc) {
            strcpy(outputFN, argv[i + 1]);
            fprintf(stderr, " -output=%s\n", argv[i + 1]);
        }
        if (!strcmp(argv[i], "-n") && i + 1 < argc) {
            model.n = atoi(argv[i + 1]);
            fprintf(stderr, " -n=%d\n", model.n);
        }
        if (!strcmp(argv[i], "-esize") && i + 1 < argc) {
            model.eSize = atoi(argv[i + 1]);
            fprintf(stderr, " -esize=%d\n", model.eSize);
        }
        if (!strcmp(argv[i], "-vsize") && i + 1 < argc) {
            model.vSize = atoi(argv[i + 1]);
            fprintf(stderr, " -vsize=%d\n", model.vSize);
        }
        if (!strcmp(argv[i], "-hdepth") && i + 1 < argc) {
            model.hDepth = atoi(argv[i + 1]);
            fprintf(stderr, " -hdepth=%d\n", model.hDepth);
        }
        if (!strcmp(argv[i], "-hsize") && i + 1 < argc) {
            model.hSize = atoi(argv[i + 1]);
            fprintf(stderr, " -hsize=%d\n", model.hSize);
        }
        if (!strcmp(argv[i], "-lrate") && i + 1 < argc) {
            learningRate = (float)atof(argv[i + 1]);
            fprintf(stderr, " -lrate=%f\n", learningRate);
        }
        if (!strcmp(argv[i], "-nstep") && i + 1 < argc) {
            nStep = atoi(argv[i + 1]);
            fprintf(stderr, " -nstep=%d\n", nStep);
        }
        if (!strcmp(argv[i], "-nepoch") && i + 1 < argc) {
            nEpoch = atoi(argv[i + 1]);
            fprintf(stderr, " -nepoch=%d\n", nEpoch);
        }
        if (!strcmp(argv[i], "-minmax") && i + 1 < argc) {
            minmax = (float)fabs(atof(argv[i + 1]));
            fprintf(stderr, " -minmax=%f\n", minmax);
        }
        if (!strcmp(argv[i], "-batch") && i + 1 < argc) {
            sentBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -batch=%d\n", sentBatch);
        }
        if (!strcmp(argv[i], "-wbatch") && i + 1 < argc) {
            wordBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -wbatch=%d\n", wordBatch);
        }
        if (!strcmp(argv[i], "-shuffle")) {
            shuffled = true;
            fprintf(stderr, " -shuffle=true\n");
        }
        if (!strcmp(argv[i], "-autodiff")) {
            autoDiff = true;
            fprintf(stderr, " -autodiff=true\n");
        }
        if (!strcmp(argv[i], "-dev") && i + 1 < argc) {
            model.devID = atoi(argv[i + 1]);
            fprintf(stderr, " -dev=%d\n", model.devID);
        }
    }

    for (int i = 0; i < argc; i++) {
        if (!strcmp(argv[i], "-mempool"))
            model.mem = new XMem(model.devID);
    }
}
/* check model settings */
void Check(FNNModel &model)
{
    CheckErrors(model.n > 0 && model.n <= MAX_N_GRAM, "The LM order is out of range (use -n)!");
    CheckErrors(model.vSize > 0, "no vocabulary size found (use -vsize)!");
    CheckErrors(model.eSize > 0, "no embedding size found (use -esize)!");
}

/* make a hard copy of the fnn model */
void Copy(FNNModel &tgt, FNNModel &src)
{
    InitTensorV2(&tgt.embeddingW, &src.embeddingW);
    for (int i = 0; i < MAX_HIDDEN_NUM; i++) {
        InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
        InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
    }
    InitTensorV2(&tgt.outputW, &src.outputW);
    InitTensorV2(&tgt.outputB, &src.outputB);

    tgt.n = src.n;
    tgt.eSize = src.eSize;
    tgt.hDepth = src.hDepth;
    tgt.hSize = src.hSize;
    tgt.vSize = src.vSize;
    tgt.devID = src.devID;
    tgt.useMemPool = src.useMemPool;
    if (src.mem != NULL) {
        tgt.mem = new XMem(src.mem->devID, src.mem->mode,
                           src.mem->maxBlockSize, src.mem->blockNum,
                           src.mem->bufSize);
    }
}

/*
reset model parameters
>> model - the model whose parameter (gradient) is set to 0
>> isNodeGrad - indicates whether the tensor node keeps the
                gradient information
*/
void Clear(FNNModel &model, bool isNodeGrad)
{
    if (isNodeGrad) {
        if (model.embeddingW.grad != NULL)
            model.embeddingW.grad->SetZeroAll();
        for (int i = 0; i < MAX_HIDDEN_NUM; i++) {
            if (model.hiddenW[i].grad != NULL)
                model.hiddenW[i].grad->SetZeroAll();
            if (model.hiddenB[i].grad != NULL)
                model.hiddenB[i].grad->SetZeroAll();
        }
        if (model.outputW.grad != NULL)
            model.outputW.grad->SetZeroAll();
        if (model.outputB.grad != NULL)
            model.outputB.grad->SetZeroAll();
    }
    else {
        model.embeddingW.SetZeroAll();
        for (int i = 0; i < MAX_HIDDEN_NUM; i++) {
            model.hiddenW[i].SetZeroAll();
            model.hiddenB[i].SetZeroAll();
        }
        model.outputW.SetZeroAll();
        model.outputB.SetZeroAll();
    }
}

/*
initialize a 1d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> num - number of items
>> model - the fnn model
*/
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
{
    InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
}

/*
initialize a 2d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> model - the fnn model
*/
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{
    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
}
/* initialize the model */
void Init(FNNModel &model)
{
    /* create embedding parameter matrix: vSize * eSize */
    InitModelTensor2D(model.embeddingW, model.vSize, model.eSize, model);
    model.embeddingW.SetVarFlag();

    /* create hidden layer parameter matrices */
    for (int i = 0; i < model.hDepth; i++) {
        /* hidden layer parameter matrix: (n-1)eSize * hsize if it is the first layer
           hsize * hsize otherwise */
        if (i == 0)
            InitModelTensor2D(model.hiddenW[i], (model.n - 1) * model.eSize, model.hSize, model);
        else
            InitModelTensor2D(model.hiddenW[i], model.hSize, model.hSize, model);
        model.hiddenW[i].SetVarFlag();

        /* bias term: a row vector of hSize entries */
        InitModelTensor1D(model.hiddenB[i], model.hSize, model);
        model.hiddenB[i].SetVarFlag();
    }

    /* create the output layer parameter matrix and bias term */
    int iSize = model.hDepth == 0 ? (model.n - 1) * model.eSize : model.hSize;
    InitModelTensor2D(model.outputW, iSize, model.vSize, model);
    InitModelTensor1D(model.outputB, model.vSize, model);
    model.outputW.SetVarFlag();
    model.outputB.SetVarFlag();

    /* then, we initialize model parameters using a uniform distribution in range
       of [-minmax, minmax] */
    model.embeddingW.SetDataRand(-minmax, minmax);
    model.outputW.SetDataRand(-minmax, minmax);
    for (int i = 0; i < model.hDepth; i++)
        model.hiddenW[i].SetDataRand(-minmax, minmax);

    /* all bias terms are set to zero */
    model.outputB.SetZeroAll();
    for (int i = 0; i < model.hDepth; i++)
        model.hiddenB[i].SetZeroAll();
}

/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void Shuffle(const char * srcFile, const char * tgtFile)
{
    char * line = new char[MAX_LINE_LENGTH_HERE];
#ifndef WIN32
    sprintf(line, "shuf %s > %s", srcFile, tgtFile);
    system(line);
#else
    ShowErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
    delete[] line;
}
char lineBuf[MAX_LINE_LENGTH_HERE];
int wordBuf[MAX_LINE_LENGTH_HERE];
/*
train the model with the standard SGD method
>> train - training data file
>> isShuffled - shuffle the data file or not
>> model - the fnn model
*/
void Train(const char * train, bool isShuffled, FNNModel &model)
{
char name[MAX_NAME_LENGTH];
/* shuffle the data */
if(isShuffled){
sprintf(name, "%s-tmp", train);
Shuffle(train, name);
}
else
strcpy(name, train);
int epoch = 0;
int step = 0;
int wordCount = 0;
int wordCountTotal = 0;
int ngramNum = 1;
float loss = 0;
bool isEnd = false;
NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];
/* make a model to keep gradients */
FNNModel grad;
Copy(grad, model);
/* XNet for automatic differentiation */
XNet autoDiffer;
double startT = GetClockSec();
/* iterate for a number of epochs */
for(epoch = 0; epoch < nEpoch; epoch++){
/* data file */
FILE * file = fopen(name, "rb");
CheckErrors(file, "Cannot open the training file");
wordCount = 0;
loss = 0;
ngramNum = 1;
while(ngramNum > 0){
/* load a minibatch of ngrams */
ngramNum = LoadNGrams(file, model.n, ngrams, sentBatch, wordBatch);
if (ngramNum <= 0)
break;
/* previous n - 1 words */
XTensor inputs[MAX_N_GRAM];
/* the predicted word */
XTensor output;
/* the gold standard */
XTensor gold;
/* the loss tensor */
XTensor lossTensor;
/* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
if(!autoDiff){
/* prepare an empty network for building the fnn */
FNNNet net;
/* gradient = 0 */
Clear(grad, false);
/* forward computation */
Forward(inputs, output, model, net);
/* backward computation to obtain gradients */
Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
/* update model parameters */
Update(model, grad, learningRate, false);
}
else{
/* gradient = 0 */
Clear(model, true);
/* forward + backward process */
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
lossTensor = CrossEntropy(output, gold);
/* automatic differentiation */
autoDiffer.Backward(lossTensor);
//autoDiffer.Backward(output, gold, CROSSENTROPY);
/* update model parameters */
Update(model, grad, learningRate, true);
}
/* get probabilities */
float prob = GetProb(output, gold);
prob = ReduceSumAll(lossTensor);
loss += prob;
wordCount += ngramNum;
wordCountTotal += ngramNum;
if(++step >= nStep){
isEnd = true;
break;
}
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
}
}
fclose(file);
if(isEnd)
break;
Test(testFN, outputFN, model);
}
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
elapsed, step, epoch);
delete[] ngrams;
}
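/* Editor's note: the perplexity reported above follows directly from the
   accumulated loss. With "loss" being the summed negative log-probability
   (cross-entropy) over all predicted words and "wordCount" the number of
   predictions, the code computes

       ppl = exp(loss / wordCount)

   i.e. the exponential of the average per-word negative log-likelihood. */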
/*
update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
*/
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
{
TensorList paraList(10);
TensorList gradList(10);
paraList.Add(&model.outputW);
paraList.Add(&model.outputB);
for (int i = 0; i < model.hDepth; i++) {
paraList.Add(&model.hiddenW[i]);
paraList.Add(&model.hiddenB[i]);
}
paraList.Add(&model.embeddingW);
if(!isNodeGrad){
gradList.Add(&grad.outputW);
gradList.Add(&grad.outputB);
for (int i = 0; i < model.hDepth; i++) {
gradList.Add(&grad.hiddenW[i]);
gradList.Add(&grad.hiddenB[i]);
}
;
gradList.Add(&grad.embeddingW);
}
else{
gradList.Add(model.outputW.grad);
gradList.Add(model.outputB.grad);
for (int i = 0; i < model.hDepth; i++) {
gradList.Add(model.hiddenW[i].grad);
gradList.Add(model.hiddenB[i].grad);
}
gradList.Add(model.embeddingW.grad);
}
for (int i = 0; i < paraList.count; i++) {
XTensor * para = (XTensor*)paraList.GetItem(i);
XTensor * paraGrad = (XTensor*)gradList.GetItem(i);
//fprintf(stderr, "%d\n", i);
//paraGrad->Dump(stderr, "grad:", 10);
/* the delta rule */
_Sum(para, paraGrad, para, -epsilon);
}
}
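/* Editor's note: _Sum(para, paraGrad, para, -epsilon) computes
   para = para + (-epsilon) * paraGrad, i.e. the plain SGD (delta rule) update

       theta <- theta - epsilon * dE/dtheta

   applied to every parameter tensor collected in paraList. */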
/*
get prediction probabilities of the gold words
>> output - output probabilities
>> gold - gold standard
>> wordProbs - probability of each word
<< return - probability of the batch
*/
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{
XTensor probs;
InitTensorV2(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&output, &gold, &probs);
/* probability of each word */
XTensor wprobs;
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
_ReduceSum(&probs, &wprobs, 1);
if(wordProbs != NULL)
_CopyValues(&wprobs, wordProbs);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
int dims[2];
dims[0] = 1;
dims[1] = probs.unitNum;
probs.Reshape(2, dims);
/* probability for the batch */
XTensor result;
InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
_ReduceSum(&probs, &result, 1);
return result.Get1D(0);
}
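/* Editor's note: since the output holds log-softmax values and gold is a
   one-hot (zero-one) matrix, the element-wise product followed by the two
   reductions gives

       prob = sum_i sum_j gold[i][j] * output[i][j]
            = sum_i log P(w_i | context_i)

   i.e. the summed log-probability of the gold words in the batch (a
   non-positive number whose negation is the batch cross-entropy loss). */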
int pin = 0;
int wordBufCount = 0;
/*
load a minibatch of ngrams
>> file - data file
>> n - order of the language model
>> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch
*/
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
{
int num = 0;
int lineNum = 0;
while(pin > 0 || fgets(lineBuf, MAX_LINE_LENGTH_HERE - 1, file)){
if(pin <= 0){
int len = (int)strlen(lineBuf);
while(lineBuf[len - 1] == '\r' || lineBuf[len - 1] == '\n'){
lineBuf[len - 1] = 0;
len--;
}
len = (int)strlen(lineBuf);
if(len == 0)
continue;
/* how many characters are in a word */
int wSize = 0;
/* how many words are in the sentence */
int wNum = 0;
int i = 0;
for(i = pin; i < len; i++){
/* load word (id) separated by space or tab */
if((lineBuf[i] == ' ' || lineBuf[i] == '\t') && wSize > 0){
lineBuf[i] = 0;
wordBuf[wNum++] = atoi(lineBuf + i - wSize);
wSize = 0;
}
else
wSize++;
}
if(wSize > 0)
wordBuf[wNum++] = atoi(lineBuf + i - wSize);
wordBufCount = wNum;
lineNum++;
}
else
lineNum = 1;
int i = -MAX_INT;
/* create ngrams */
for(i = MAX(pin, n - 1); i < wordBufCount - 1; i++){
memcpy(ngrams[num++].words, wordBuf + i - n + 1, sizeof(int) * n);
if(num >= wordNum)
break;
}
/* set a finished flag if we reach the end of the sentence*/
if(i >= wordBufCount - 1){
pin = 0;
wordBufCount = 0;
}
/* record where to start next time if we break in the middle */
else{
pin = i + 1;
}
if((sentNum > 0 && lineNum >= sentNum) || num >= wordNum)
break;
}
return num;
}
/*
make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> rows - row index
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
*/
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
tensor.SetZeroAll();
/* set non-zero cells */
for(int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]);
}
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch
>> ngramNum - batch size
>> n - indicates which word is encoded for each ngram
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
*/
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem)
{
int * rows = new int[ngramNum];
int * cols = new int[ngramNum];
for(int i = 0; i < ngramNum; i++){
rows[i] = i;
cols[i] = ngrams[i].words[n];
}
InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID, mem);
delete[] rows;
delete[] cols;
}
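/* Editor's note: a tiny worked example of the zero-one encoding. Suppose
   ngramNum = 3, vSize = 5 and the selected words (position n of each ngram)
   are 2, 0 and 4. Then the batch tensor built above is

       row 0:  0 0 1 0 0
       row 1:  1 0 0 0 0
       row 2:  0 0 0 0 1

   i.e. one row per ngram with a single 1 at the column of the word id. */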
/*
forward procedure
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process
*/
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
{
int batchSize = -1;
int n = model.n;
int depth = model.hDepth;
TensorList eList(n - 1);
/* previous n - 1 words */
for(int i = 0; i < n - 1; i++){
XTensor &input = inputs[i];
XTensor &w = model.embeddingW;
XTensor &embedding = net.embeddings[i];
if(batchSize == -1)
batchSize = input.dimSize[0];
else{
CheckErrors(batchSize == input.dimSize[0], "Wrong input word representations!");
}
/* embedding output tensor of position i */
InitModelTensor2D(embedding, batchSize, model.eSize, model);
/* generate word embedding of position i:
embedding = input * w */
_MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);
eList.Add(&net.embeddings[i]);
}
/* concatenate word embeddings
embeddingcat = cat(embedding_0...embedding_{n-1}) */
InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
_Concatenate(&eList, &net.embeddingCat, 1);
/* go over each hidden layer */
for(int i = 0; i < depth; i++){
XTensor &h_pre = i == 0 ? net.embeddingCat : net.hiddens[i - 1];
XTensor &w = model.hiddenW[i];
XTensor &b = model.hiddenB[i];
XTensor &h = net.hiddens[i];
XTensor &s = net.hiddenStates[i];
InitModelTensor2D(h, batchSize, model.hSize, model);
InitModelTensor2D(s, batchSize, model.hSize, model);
/* generate hidden states of layer i:
s = h_pre * w */
_MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s);
/* make a 2d tensor for the bias term */
XTensor b2D;
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
/* introduce bias term:
s = s + b
NOTE: the trick here is to extend b to a 2d tensor
to fit into the 2d representation in tensor summation */
_Sum(&s, &b2D, &s);
/* pass the state through the hard tanh function:
h = hardtanh(s) */
_HardTanH(&s, &h);
}
/* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
y = softmax(h_last * w)
Note that this is the same implementation as that in Bengio et al.'s paper.
TODO: we add bias term here */
{
XTensor &h_last = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
XTensor &w = model.outputW;
XTensor &b = model.outputB;
XTensor &s = net.stateLast;
XTensor &y = output;
InitModelTensor2D(s, batchSize, model.vSize, model);
InitModelTensor2D(y, batchSize, model.vSize, model);
/* s = h_last * w */
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor b2D;
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
}
}
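/* Editor's note: in equations, the forward pass above is (per batch row)

       e_i = x_i * E                         (embedding lookup, x_i one-hot)
       h_0 = [e_0, e_1, ..., e_{n-2}]        (concatenation)
       h_l = hardtanh(h_{l-1} * W_l + b_l)   (hidden layers, l = 1..hDepth)
       y   = logsoftmax(h_last * W_out + b_out)

   which matches the FNNLM of Bengio et al. (2003) up to the choice of the
   hard tanh activation. */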
/*
backward procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
>> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process
*/
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net)
{
int batchSize = output.GetDim(0);
int n = model.n;
int depth = model.hDepth;
/* back-propagation for the output layer */
XTensor &y = output;
XTensor &s = net.stateLast;
XTensor &x = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
XTensor &w = model.outputW;
XTensor &dedw = grad.outputW;
XTensor &dedb = grad.outputB;
XTensor deds(&y);
XTensor dedx(&x);
/* for y = softmax(s), we get dE/ds
where E is the error function (defined by the loss) */
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
/* for s = x * w, we get
dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
= dE/ds_j * x_{i}
(where i and j are the row and column indices, and
x is the top most hidden layer)
so we know
dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0);
/* then, we compute
dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
= \sum_j' (dE/ds_{j'} * w_{j, j'})
i.e.,
dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);
XTensor &gradPassed = dedx;
XTensor dedsHidden;
XTensor dedxBottom;
if (depth > 0)
InitTensorV2(&dedsHidden, &dedx);
InitTensorV2(&dedxBottom, &net.embeddingCat);
/* back-propagation from top to bottom in the stack of hidden layers
for each layer, h = f(s)
s = x * w + b */
for (int i = depth - 1; i >= 0; i--) {
XTensor &h = net.hiddens[i];
XTensor &s = net.hiddenStates[i];
XTensor &x = i == 0 ? net.embeddingCat : net.hiddenStates[i - 1];
XTensor &w = model.hiddenW[i];
XTensor &dedh = gradPassed; // gradient passed through the previous layer
XTensor &dedx = i == 0 ? dedxBottom : dedh;
XTensor &deds = dedsHidden;
XTensor &dedw = grad.hiddenW[i];
XTensor &dedb = grad.hiddenB[i];
/* backpropagation through the activation function:
dE/ds = dE/dh * dh/ds */
_HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
/* gradient of the weight: dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0);
/* gradient of the input: dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);
if (i > 0)
_CopyValues(&dedx, &gradPassed);
}
TensorList eList(n - 1);
/* back-propagation for the embedding layer */
for (int i = 0; i < n - 1; i++) {
XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
eList.Add(dedy);
}
/* gradient of the concatenation of the embedding layers */
XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;
/* split the concatenation of gradients of the embeddings */
_Split(&dedyCat, &eList, 1, n - 1);
/* go over for each word */
for (int i = 0; i < n - 1; i++) {
XTensor * dedy = (XTensor*)eList.GetItem(i);
XTensor &x = inputs[i];
XTensor &dedw = grad.embeddingW;
/* gradient of the embedding weight: dE/dw += x^T * dE/dy
NOTE that we accumulate dE/dw here because the matrix w
is shared by several layers (or words) */
_MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F);
delete dedy;
}
}
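/* Editor's note: the manual backward pass above repeatedly applies the same
   three rules for a layer s = x * W + b, h = f(s):

       dE/dW = x^T * (dE/ds)
       dE/db = sum over batch rows of dE/ds
       dE/dx = (dE/ds) * W^T

   with dE/ds obtained from the activation (or log-softmax) backward call;
   the embedding gradient is accumulated because the matrix E is shared
   across the n - 1 input positions. */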
/*
forward process (with tensor connections) (this is implemented by gather function)
>> ngrams - the loaded ngrams
>> batch - the tensor encoding a batch of words
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
int size = batch * (n-1);
int * index = new int[size];
for(int i = 0; i < batch; i++){
for (int j = 0; j < n-1; j++){
int a = i * (n - 1) + j;
index[a] = ngrams[i].words[j];
}
}
InitTensor1DV2(&words, size, X_INT, model.devID);
words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
delete[] index; }
int dimSize[2]; char lineBuf[MAX_LINE_LENGTH_HERE];
dimSize[0] = embeddingBig.GetDim(0) / (n - 1); int wordBuf[MAX_LINE_LENGTH_HERE];
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize); /*
train the model with the standard SGD method
>> train - training data file
>> isShuffled - shuffle the data file or not
>> model - the fnn model
*/
void Train(const char * train, bool isShuffled, FNNModel &model)
{
char name[MAX_NAME_LENGTH];
/* hidden layers */ /* shuffle the data */
for(int i = 0; i < depth; i++) if (isShuffled) {
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]); sprintf(name, "%s-tmp", train);
Shuffle(train, name);
}
else
strcpy(name, train);
/* output layer */ int epoch = 0;
//output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); int step = 0;
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1); int wordCount = 0;
} int wordCountTotal = 0;
int ngramNum = 1;
float loss = 0;
bool isEnd = false;
NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];
/* make a model to keep gradients */
FNNModel grad;
Copy(grad, model);
/* XNet for automatic differentiation */
XNet autoDiffer;
double startT = GetClockSec();
/* iterate for a number of epochs */
for (epoch = 0; epoch < nEpoch; epoch++) {
/* data file */
FILE * file = fopen(name, "rb");
CheckErrors(file, "Cannot open the training file");
wordCount = 0;
loss = 0;
ngramNum = 1;
/* while (ngramNum > 0) {
forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations /* load a minibatch of ngrams */
>> output - output probability ngramNum = LoadNGrams(file, model.n, ngrams, sentBatch, wordBatch);
>> model - the fnn model
*/
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
TensorList inputList(n - 1);
for(int i = 0; i < n - 1; i++)
inputList.Add(inputs + i);
/* represent n - 1 words in one tensor */
words = Merge(inputList, 0);
/* word embedding */
embeddingBig = MMul(words, model.embeddingW);
/* input of the first hidden layer */
hidden = Split(embeddingBig, 0, n - 1);
hidden = Merge(hidden, 2, 0);
/* hidden layers */
for(int i = 0; i < depth; i++)
hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];
/* output layer */ if (ngramNum <= 0)
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); break;
} /* previous n - 1 words */
XTensor inputs[MAX_N_GRAM];
/*
dump the model to the disk space /* the predicted word */
>> fn - where to keep the model XTensor output;
>> model - the fnn model
*/ /* the gold standard */
void Dump(const char * fn, FNNModel &model) XTensor gold;
{
FILE * file = fopen(fn, "wb"); /* the loss tensor */
CheckErrors(file, "Cannot open the model file"); XTensor lossTensor;
model.embeddingW.Dump(file, "embedding w:"); /* make the input tensor for position i */
for (int i = 0; i < model.hDepth; i++) { for (int i = 0; i < model.n - 1; i++)
char name[MAX_NAME_LENGTH]; MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
sprintf(name, "hidden %d w:", i);
model.hiddenW[i].Dump(file, name); /* make the gold tensor */
sprintf(name, "hidden %d b:", i); MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
model.hiddenB[i].Dump(file, name);
} if (!autoDiff) {
/* prepare an empty network for building the fnn */
model.outputW.Dump(file, "output w:"); FNNNet net;
model.outputB.Dump(file, "output b:");
/* gradident = 0 */
fclose(file); Clear(grad, false);
XPRINT(0, stderr, "[INFO] model saved\n"); /* forward computation */
} Forward(inputs, output, model, net);
/* /* backward computation to obtain gradients */
read the model from the disk space Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
>> fn - where to keep the model
>> model - the fnn model /* update model parameters */
*/ Update(model, grad, learningRate, false);
void Read(const char * fn, FNNModel &model) }
{ else {
FILE * file = fopen(fn, "rb"); /* gradient = 0 */
CheckErrors(file, "Cannot open the model file"); Clear(model, true);
model.embeddingW.Read(file, "embedding w:"); /* forward + backward process */
for (int i = 0; i < model.hDepth; i++) {
char name[MAX_NAME_LENGTH]; /* this is implemented by gather function */
sprintf(name, "hidden %d w:", i); ForwardAutoDiff(ngrams, ngramNum, output, model);
model.hiddenW[i].Read(file, name);
sprintf(name, "hidden %d b:", i); /* this is implemented by multiply function */
model.hiddenB[i].Read(file, name); //ForwardAutoDiff(inputs, output, model);
} lossTensor = CrossEntropy(output, gold);
output.Dump(stderr, "output:",10);
model.outputW.Read(file, "output w:"); gold.Dump(stderr, "gold:", 10);
model.outputB.Read(file, "output b:"); lossTensor.Dump(stderr, "lossTensor:",10);
fclose(file); /* automatic differentiation */
autoDiffer.Backward(lossTensor);
XPRINT(0, stderr, "[INFO] model loaded\n"); //autoDiffer.Backward(output, gold, CROSSENTROPY);
}
/* update model parameters */
/* Update(model, grad, learningRate, true);
test the model }
>> test - test data file
>> result - where to keep the result /* get probabilities */
>> model - the fnn model float prob = GetProb(output, gold);
*/ if (autoDiff) {
void Test(const char * test, const char * result, FNNModel &model) prob = -ReduceSumAll(lossTensor);
{ }
int wordCount = 0; //printf("prob:%f", prob);
int sentCount = 0; loss += -prob;
float loss = 0; wordCount += ngramNum;
wordCountTotal += ngramNum;
NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];
if (++step >= nStep) {
double startT = GetClockSec(); isEnd = true;
break;
/* data files */ }
FILE * file = fopen(test, "rb");
CheckErrors(file, "Cannot read the test file"); if (step % 100 == 0) {
FILE * ofile = fopen(result, "wb"); double elapsed = GetClockSec() - startT;
CheckErrors(ofile, "Cannot open the output file"); XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
int ngramNum = 1; }
while (ngramNum > 0) { }
/* load a minibatch of ngrams */ fclose(file);
ngramNum = LoadNGrams(file, model.n, ngrams, 1, MAX_INT);
if (isEnd)
if (ngramNum <= 0) break;
break;
Test(testFN, outputFN, model);
/* previous n - 1 words */ }
XTensor inputs[MAX_N_GRAM];
double elapsed = GetClockSec() - startT;
/* the predicted word */
XTensor output; XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
/* the gold standard */ XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
XTensor gold; elapsed, step, epoch);
/* make the input tensor for position i */ delete[] ngrams;
for (int i = 0; i < model.n - 1; i++) }
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
/*
/* make the gold tensor */ update the model parameters using the delta rule
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem); >> model - the model to update
>> grad - gradients
if (!autoDiff) { >> epsilon - learning rate
/* prepare an empty network for building the fnn */ >> isNodeGrad - indicates whether the gradient is associated with the node
FNNNet net; */
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
/* forward computation */ {
Forward(inputs, output, model, net); TensorList paraList(10);
} TensorList gradList(10);
else {
/* this is implemented by gather function */ paraList.Add(&model.outputW);
ForwardAutoDiff(ngrams, ngramNum, output, model); paraList.Add(&model.outputB);
/* this is implemented by multiply function */ for (int i = 0; i < model.hDepth; i++) {
//ForwardAutoDiff(inputs, output, model); paraList.Add(&model.hiddenW[i]);
} paraList.Add(&model.hiddenB[i]);
}
/* prediction probabilities */
XTensor probs; paraList.Add(&model.embeddingW);
InitTensor1DV2(&probs, ngramNum);
if (!isNodeGrad) {
/* get probabilities */ gradList.Add(&grad.outputW);
float prob = GetProb(output, gold, &probs); gradList.Add(&grad.outputB);
/* dump the test result */ for (int i = 0; i < model.hDepth; i++) {
for (int i = 0; i < model.n - 1; i++) gradList.Add(&grad.hiddenW[i]);
fprintf(ofile, "%d ", ngrams[0].words[i]); gradList.Add(&grad.hiddenB[i]);
for (int i = 0; i < ngramNum; i++) }
fprintf(ofile, "%d ", ngrams[i].words[model.n - 1]); ;
fprintf(ofile, "||| "); gradList.Add(&grad.embeddingW);
for (int i = 0; i < model.n - 1; i++) }
fprintf(ofile, "<s> "); else {
for (int i = 0; i < ngramNum; i++) gradList.Add(model.outputW.grad);
fprintf(ofile, "%f ", probs.Get1D(i)); gradList.Add(model.outputB.grad);
fprintf(ofile, "||| %f\n", prob);
for (int i = 0; i < model.hDepth; i++) {
loss += -prob; gradList.Add(model.hiddenW[i].grad);
wordCount += ngramNum; gradList.Add(model.hiddenB[i].grad);
sentCount += 1; }
}
gradList.Add(model.embeddingW.grad);
fclose(file); }
double elapsed = GetClockSec() - startT; for (int i = 0; i < paraList.count; i++) {
XTensor * para = (XTensor*)paraList.GetItem(i);
XPRINT1(0, stderr, "[INFO] ppl=%.2f\n", exp(loss/wordCount)); XTensor * paraGrad = (XTensor*)gradList.GetItem(i);
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)\n",
elapsed, sentCount, wordCount); //fprintf(stderr, "%d\n", i);
//paraGrad->Dump(stderr, "grad:", 10);
delete[] ngrams;
} /* the delta rule */
_Sum(para, paraGrad, para, -epsilon);
}
}
/*
get prediction probabilities of the gold words
>> output - output probabilities
>> gold - gold standard
>> wordProbs - probability of each word
<< return - probability of the batch
*/
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{
XTensor probs;
InitTensorV2(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&output, &gold, &probs);
/* probability of each word */
XTensor wprobs;
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
_ReduceSum(&probs, &wprobs, 1);
if (wordProbs != NULL)
_CopyValues(&wprobs, wordProbs);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
int dims[2];
dims[0] = 1;
dims[1] = probs.unitNum;
probs.Reshape(2, dims);
/* probability for the batch */
XTensor result;
InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
_ReduceSum(&probs, &result, 1);
return result.Get1D(0);
}
int pin = 0;
int wordBufCount = 0;
/*
load a minibatch of ngrams
>> file - data file
>> n - order of the language model
>> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch
*/
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
{
int num = 0;
int lineNum = 0;
while (pin > 0 || fgets(lineBuf, MAX_LINE_LENGTH_HERE - 1, file)) {
if (pin <= 0) {
int len = (int)strlen(lineBuf);
while (lineBuf[len - 1] == '\r' || lineBuf[len - 1] == '\n') {
lineBuf[len - 1] = 0;
len--;
}
len = (int)strlen(lineBuf);
if (len == 0)
continue;
/* how many characters are in a word */
int wSize = 0;
/* how many words are in the sentence */
int wNum = 0;
int i = 0;
for (i = pin; i < len; i++) {
/* load word (id) separated by space or tab */
if ((lineBuf[i] == ' ' || lineBuf[i] == '\t') && wSize > 0) {
lineBuf[i] = 0;
wordBuf[wNum++] = atoi(lineBuf + i - wSize);
wSize = 0;
}
else
wSize++;
}
if (wSize > 0)
wordBuf[wNum++] = atoi(lineBuf + i - wSize);
wordBufCount = wNum;
lineNum++;
}
else
lineNum = 1;
int i = -MAX_INT;
/* create ngrams */
for (i = MAX(pin, n - 1); i < wordBufCount - 1; i++) {
memcpy(ngrams[num++].words, wordBuf + i - n + 1, sizeof(int) * n);
if (num >= wordNum)
break;
}
/* set a finished flag if we reach the end of the sentence*/
if (i >= wordBufCount - 1) {
pin = 0;
wordBufCount = 0;
}
/* record where to start next time if we break in the middle */
else {
pin = i + 1;
}
if ((sentNum > 0 && lineNum >= sentNum) || num >= wordNum)
break;
}
return num;
}
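/* Editor's note: a standalone sketch of the n-gram windowing used above,
   applied to the example line "0 29 2 11 1" from the header comment with
   n = 3. It does not use NiuTensor; it only mirrors the loop
   "for (i = MAX(pin, n - 1); i < wordBufCount - 1; i++)", under which the
   last position of a line is not used as a prediction target. */
static void PrintNGramWindows()
{
    const int n = 3;
    const int words[] = { 0, 29, 2, 11, 1 };
    const int wordCount = (int)(sizeof(words) / sizeof(words[0]));

    /* each window of n consecutive ids forms one ngram ending at position i */
    for (int i = n - 1; i < wordCount - 1; i++) {
        for (int j = i - n + 1; j <= i; j++)
            fprintf(stderr, "%d ", words[j]);
        fprintf(stderr, "\n");   /* prints "0 29 2" and then "29 2 11" */
    }
}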
/*
make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> rows - row index
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
*/
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
tensor.SetZeroAll();
/* set non-zero cells */
for (int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]);
}
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch
>> ngramNum - batch size
>> n - indicates which word is encoded for each ngram
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
*/
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem)
{
int * rows = new int[ngramNum];
int * cols = new int[ngramNum];
for (int i = 0; i < ngramNum; i++) {
rows[i] = i;
cols[i] = ngrams[i].words[n];
}
InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID, mem);
delete[] rows;
delete[] cols;
}
/*
forward procedure
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process
*/
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
{
int batchSize = -1;
int n = model.n;
int depth = model.hDepth;
TensorList eList(n - 1);
/* previous n - 1 words */
for (int i = 0; i < n - 1; i++) {
XTensor &input = inputs[i];
XTensor &w = model.embeddingW;
XTensor &embedding = net.embeddings[i];
if (batchSize == -1)
batchSize = input.dimSize[0];
else {
CheckErrors(batchSize == input.dimSize[0], "Wrong input word representations!");
}
/* embedding output tensor of position i */
InitModelTensor2D(embedding, batchSize, model.eSize, model);
/* generate word embedding of position i:
embedding = input * w */
_MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);
eList.Add(&net.embeddings[i]);
}
/* concatenate word embeddings
embeddingcat = cat(embedding_0...embedding_{n-1}) */
InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
_Concatenate(&eList, &net.embeddingCat, 1);
/* go over each hidden layer */
for (int i = 0; i < depth; i++) {
XTensor &h_pre = i == 0 ? net.embeddingCat : net.hiddens[i - 1];
XTensor &w = model.hiddenW[i];
XTensor &b = model.hiddenB[i];
XTensor &h = net.hiddens[i];
XTensor &s = net.hiddenStates[i];
InitModelTensor2D(h, batchSize, model.hSize, model);
InitModelTensor2D(s, batchSize, model.hSize, model);
/* generate hidden states of layer i:
s = h_pre * w */
_MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s);
/* make a 2d tensor for the bias term */
XTensor b2D;
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
/* introduce bias term:
s = s + b
NOTE: the trick here is to extend b to a 2d tensor
to fit into the 2d representation in tensor summation */
_Sum(&s, &b2D, &s);
/* pass the state through the hard tanh function:
h = hardtanh(s) */
_HardTanH(&s, &h);
}
/* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
y = softmax(h_last * w)
Note that this is the same implementation as that in Bengio et al.'s paper.
TODO: we add bias term here */
{
XTensor &h_last = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
XTensor &w = model.outputW;
XTensor &b = model.outputB;
XTensor &s = net.stateLast;
XTensor &y = output;
InitModelTensor2D(s, batchSize, model.vSize, model);
InitModelTensor2D(y, batchSize, model.vSize, model);
/* s = h_last * w */
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor b2D;
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
}
}
/*
backward procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
>> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process
*/
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net)
{
int batchSize = output.GetDim(0);
int n = model.n;
int depth = model.hDepth;
/* back-propagation for the output layer */
XTensor &y = output;
XTensor &s = net.stateLast;
XTensor &x = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
XTensor &w = model.outputW;
XTensor &dedw = grad.outputW;
XTensor &dedb = grad.outputB;
XTensor deds(&y);
XTensor dedx(&x);
/* for y = softmax(s), we get dE/ds
where E is the error function (defined by the loss) */
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
/* for s = x * w, we get
dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
= dE/ds_j * x_{i}
(where i and j are the row and column indices, and
x is the top most hidden layer)
so we know
dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0);
/* then, we compute
dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
= \sum_j' (dE/ds_{j'} * w_{j, j'})
i.e.,
dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);
XTensor &gradPassed = dedx;
XTensor dedsHidden;
XTensor dedxBottom;
if (depth > 0)
InitTensorV2(&dedsHidden, &dedx);
InitTensorV2(&dedxBottom, &net.embeddingCat);
/* back-propagation from top to bottom in the stack of hidden layers
for each layer, h = f(s)
s = x * w + b */
for (int i = depth - 1; i >= 0; i--) {
XTensor &h = net.hiddens[i];
XTensor &s = net.hiddenStates[i];
XTensor &x = i == 0 ? net.embeddingCat : net.hiddenStates[i - 1];
XTensor &w = model.hiddenW[i];
XTensor &dedh = gradPassed; // gradient passed through the previous layer
XTensor &dedx = i == 0 ? dedxBottom : dedh;
XTensor &deds = dedsHidden;
XTensor &dedw = grad.hiddenW[i];
XTensor &dedb = grad.hiddenB[i];
/* backpropagation through the activation function:
dE/ds = dE/dh * dh/ds */
_HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
/* gradient of the weight: dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0);
/* gradient of the input: dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);
if (i > 0)
_CopyValues(&dedx, &gradPassed);
}
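/* For reference, hard tanh is h = max(-1, min(1, s)), so dh/ds is 1 for -1 < s < 1
   and 0 elsewhere; per element the backward call above therefore amounts to
   (a sketch, not a library call):
       deds = (s > -1.0F && s < 1.0F) ? dedh : 0.0F;                             */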
TensorList eList(n - 1);
/* back-propagation for the embedding layer */
for (int i = 0; i < n - 1; i++) {
XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
eList.Add(dedy);
}
/* gradient of the concatenation of the embedding layers */
XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;
/* split the concatenation of gradients of the embeddings */
_Split(&dedyCat, &eList, 1, n - 1);
/* go over each word */
for (int i = 0; i < n - 1; i++) {
XTensor * dedy = (XTensor*)eList.GetItem(i);
XTensor &x = inputs[i];
XTensor &dedw = grad.embeddingW;
/* gradient of the embedding weight: dE/dw += x^T * dE/dy
NOTE that we accumulate dE/dw here because the matrix w
is shared by several layers (or words) */
_MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F);
delete dedy;
}
}
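/* The three matrix products used for the output layer above have a direct plain-array
   reading: dE/dw = x^T * dE/ds, dE/db_j = \sum_i dE/ds_{i,j}, and dE/dx = dE/ds * w^T.
   A standalone sketch with row-major arrays; the name and layout are illustrative only. */
static void OutputGradReference(const float * x,    /* [batch, hSize] topmost hidden states */
                                const float * w,    /* [hSize, vSize] output weight         */
                                const float * deds, /* [batch, vSize] dE/ds                 */
                                float * dedw,       /* [hSize, vSize] dE/dw                 */
                                float * dedb,       /* [vSize]        dE/db                 */
                                float * dedx,       /* [batch, hSize] dE/dx                 */
                                int batch, int hSize, int vSize)
{
    /* dE/dw = x^T * dE/ds */
    for (int k = 0; k < hSize; k++) {
        for (int j = 0; j < vSize; j++) {
            float v = 0;
            for (int i = 0; i < batch; i++)
                v += x[i * hSize + k] * deds[i * vSize + j];
            dedw[k * vSize + j] = v;
        }
    }

    /* dE/db_j = \sum_i dE/ds_{i,j} */
    for (int j = 0; j < vSize; j++) {
        float v = 0;
        for (int i = 0; i < batch; i++)
            v += deds[i * vSize + j];
        dedb[j] = v;
    }

    /* dE/dx = dE/ds * w^T */
    for (int i = 0; i < batch; i++) {
        for (int k = 0; k < hSize; k++) {
            float v = 0;
            for (int j = 0; j < vSize; j++)
                v += deds[i * vSize + j] * w[k * vSize + j];
            dedx[i * hSize + k] = v;
        }
    }
}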
/*
forward procedure (with tensor connections), implemented with the gather function
>> ngrams - the loaded ngrams
>> batch - number of ngrams in the batch
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
int size = batch * (n - 1);
int * index = new int[size];
for (int i = 0; i < batch; i++) {
for (int j = 0; j < n - 1; j++) {
int a = i * (n - 1) + j;
index[a] = ngrams[i].words[j];
}
}
InitTensor1DV2(&words, size, X_INT, model.devID);
words.SetData(index, size);
words.Dump(stderr, "word:", 10);
embeddingBig = Gather(model.embeddingW, words);
delete[] index;
int dimSize[2];
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
embeddingBig.Dump(stderr, "embeddingBig:", 10);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
hidden.Dump(stderr, "hidden-0:", 10);
/* hidden layers */
for (int i = 0; i < depth; i++)
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
hidden.Dump(stderr, "hidden-1:", 10);
/* output layer */
//output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
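/* What the Gather + Reshape pair above produces, in plain-array terms: row i of the
   reshaped tensor is the concatenation of the n - 1 context-word embeddings of ngram i.
   A standalone sketch; the name and row-major layout are illustrative only. */
static void GatherConcatReference(const int * index,     /* [batch * (n-1)] flattened word ids       */
                                  const float * embedW,  /* [vSize, eSize] embedding matrix          */
                                  float * hidden0,       /* [batch, (n-1) * eSize] first-layer input */
                                  int batch, int context, int eSize)
{
    for (int i = 0; i < batch; i++) {
        for (int j = 0; j < context; j++) {
            const float * src = embedW + index[i * context + j] * eSize;
            float * dst = hidden0 + (i * context + j) * eSize;
            for (int k = 0; k < eSize; k++)
                dst[k] = src[k];
        }
    }
}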
/*
forward procedure (with tensor connections), implemented with matrix multiplication over one-hot inputs
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
TensorList inputList(n - 1);
for (int i = 0; i < n - 1; i++)
inputList.Add(inputs + i);
/* represent n - 1 words in one tensor */
words = Merge(inputList, 0);
/* word embedding */
embeddingBig = MMul(words, model.embeddingW);
/* input of the first hidden layer */
hidden = Split(embeddingBig, 0, n - 1);
hidden = Merge(hidden, 2, 0);
/* hidden layers */
for (int i = 0; i < depth; i++)
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
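/* In the multiply-based variant, each input row is a one-hot word vector, so
   MMul(words, model.embeddingW) just selects the corresponding rows of the embedding
   matrix; it computes the same embeddings as the gather-based variant, only through a
   full matrix product. A scalar sketch of that equivalence (illustrative names): */
static void OneHotLookupReference(const float * onehot,  /* [vSize], 1 at the word id, 0 elsewhere */
                                  const float * embedW,  /* [vSize, eSize] embedding matrix        */
                                  float * e,             /* [eSize] resulting embedding            */
                                  int vSize, int eSize)
{
    for (int k = 0; k < eSize; k++) {
        e[k] = 0;
        for (int v = 0; v < vSize; v++)
            e[k] += onehot[v] * embedW[v * eSize + k]; /* only the row with onehot[v] == 1 survives */
    }
}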
/*
dump the model to disk
>> fn - the file to keep the model in
>> model - the fnn model
*/
void Dump(const char * fn, FNNModel &model)
{
FILE * file = fopen(fn, "wb");
CheckErrors(file, "Cannot open the model file");
model.embeddingW.Dump(file, "embedding w:");
for (int i = 0; i < model.hDepth; i++) {
char name[MAX_NAME_LENGTH];
sprintf(name, "hidden %d w:", i);
model.hiddenW[i].Dump(file, name);
sprintf(name, "hidden %d b:", i);
model.hiddenB[i].Dump(file, name);
}
model.outputW.Dump(file, "output w:");
model.outputB.Dump(file, "output b:");
fclose(file);
XPRINT(0, stderr, "[INFO] model saved\n");
}
/*
read the model from disk
>> fn - the file to read the model from
>> model - the fnn model
*/
void Read(const char * fn, FNNModel &model)
{
FILE * file = fopen(fn, "rb");
CheckErrors(file, "Cannot open the model file");
model.embeddingW.Read(file, "embedding w:");
for (int i = 0; i < model.hDepth; i++) {
char name[MAX_NAME_LENGTH];
sprintf(name, "hidden %d w:", i);
model.hiddenW[i].Read(file, name);
sprintf(name, "hidden %d b:", i);
model.hiddenB[i].Read(file, name);
}
model.outputW.Read(file, "output w:");
model.outputB.Read(file, "output b:");
fclose(file);
XPRINT(0, stderr, "[INFO] model loaded\n");
}
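/* A minimal round trip with the two routines above: save the parameters after training,
   then fill another model of the same configuration from the file. The helper name is
   illustrative; the target model is assumed to be initialized with the same n, vSize,
   eSize, hSize and hDepth, since Read() loads into the existing tensors. */
static void SaveAndReload(const char * fn, FNNModel &trained, FNNModel &fresh)
{
    Dump(fn, trained);
    Read(fn, fresh);
}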
/*
test the model
>> test - test data file
>> result - where to keep the result
>> model - the fnn model
*/
void Test(const char * test, const char * result, FNNModel &model)
{
int wordCount = 0;
int sentCount = 0;
float loss = 0;
NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];
double startT = GetClockSec();
/* data files */
FILE * file = fopen(test, "rb");
CheckErrors(file, "Cannot read the test file");
FILE * ofile = fopen(result, "wb");
CheckErrors(ofile, "Cannot open the output file");
int ngramNum = 1;
while (ngramNum > 0) {
/* load a minibatch of ngrams */
ngramNum = LoadNGrams(file, model.n, ngrams, 1, MAX_INT);
if (ngramNum <= 0)
break;
/* previous n - 1 words */
XTensor inputs[MAX_N_GRAM];
/* the predicted word */
XTensor output;
/* the gold standard */
XTensor gold;
/* make the input tensor for position i */
for (int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
if (!autoDiff) {
/* prepare an empty network for building the fnn */
FNNNet net;
/* forward computation */
Forward(inputs, output, model, net);
}
else {
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
output = Log(output);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
}
/* prediction probabilities */
XTensor probs;
InitTensor1DV2(&probs, ngramNum);
/* get probabilities */
float prob = GetProb(output, gold, &probs);
/* dump the test result */
for (int i = 0; i < model.n - 1; i++)
fprintf(ofile, "%d ", ngrams[0].words[i]);
for (int i = 0; i < ngramNum; i++)
fprintf(ofile, "%d ", ngrams[i].words[model.n - 1]);
fprintf(ofile, "||| ");
for (int i = 0; i < model.n - 1; i++)
fprintf(ofile, "<s> ");
for (int i = 0; i < ngramNum; i++)
fprintf(ofile, "%f ", probs.Get1D(i));
fprintf(ofile, "||| %f\n", prob);
loss += -prob;
wordCount += ngramNum;
sentCount += 1;
}
fclose(file);
double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] ppl=%.2f\n", exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)\n",
elapsed, sentCount, wordCount);
delete[] ngrams;
}
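/* The perplexity reported above is the exponential of the average negative log-probability
   per predicted word: loss accumulates -log Pr(w_{n-1}|w_0...w_{n-2}) over the test ngrams
   and is divided by wordCount. A standalone sketch (the name is illustrative; logProbs holds
   one natural-log probability per predicted word): */
static double Perplexity(const float * logProbs, int wordCount)
{
    double loss = 0;
    for (int i = 0; i < wordCount; i++)
        loss += -logProbs[i];
    return exp(loss / wordCount);
}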
}; };
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include <time.h> #include <time.h>
#include "XTensor.h" #include "XTensor.h"
#include "XDevice.h" #include "XDevice.h"
#include "./test/Test.h" //#include "./test/Test.h"
#include "./core/CHeader.h" #include "./core/CHeader.h"
#include "./loss/CrossEntropy.h" #include "./loss/CrossEntropy.h"
...@@ -44,7 +44,7 @@ void LittleTest(); ...@@ -44,7 +44,7 @@ void LittleTest();
void T2TTest(); void T2TTest();
void T2TTest2(); void T2TTest2();
void PowerTest(); void PowerTest();
void Tests();
int main( int argc, const char ** argv ) int main( int argc, const char ** argv )
{ {
//PowerTest(); //PowerTest();
...@@ -63,7 +63,7 @@ int main( int argc, const char ** argv ) ...@@ -63,7 +63,7 @@ int main( int argc, const char ** argv )
//return 0; //return 0;
if(argc > 1 && !strcmp(argv[1], "-test")) if(argc > 1 && !strcmp(argv[1], "-test"))
Test(); Tests();
else{ else{
fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n"); fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
fprintf(stderr, "use of tensors. All you need is to ... \n\n"); fprintf(stderr, "use of tensors. All you need is to ... \n\n");
...@@ -75,219 +75,223 @@ int main( int argc, const char ** argv ) ...@@ -75,219 +75,223 @@ int main( int argc, const char ** argv )
return 0; return 0;
} }
void myRead(XTensor * tensor, const char * filename, const char * label) void Tests() {
{
FILE * file = fopen(filename, "rb");
if(file == NULL)
printf("%s\n", filename);
tensor->Read(file, label);
}
void myDump(XTensor * tensor, const char * filename, const char * label)
{
FILE * file = fopen(filename, "wb");
if(file == NULL)
printf("%s\n", filename);
tensor->Dump(file, label);
}
void PowerTest()
{
XTensor input;
XTensor output;
InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
myRead(&input, "1.txt", "");
_Power(&input, &output, 2);
output.Dump(stderr, "", 200);
}
void SmallTest()
{
XTensor a;
XTensor b;
XTensor c;
XTensor d;
InitTensor2D(&a, 2, 2);
InitTensor2D(&b, 2, 2);
a.SetZeroAll();
b.SetZeroAll();
a.Set2D(1.0F, 0, 0);
a.Set2D(2.0F, 1, 1);
b = Sum(a, Multiply(a, a));
/* this is prohibited !!!!!!!!!!!!! */
//XTensor c = a * b + a;
//XTensor d = a + b + c.Lin(0.5F);
c = a * b + a;
d = a + b + c.Lin(0.5F);
XLink::CheckNetwork(&d);
//XLink::ShowNetwork(stderr, &d);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
c.Dump(stderr, "c:");
d.Dump(stderr, "d:");
}
void TransposeTest()
{
XTensor a;
XTensor b;
int I = 2;
int J = 3;
InitTensor4D(&a, 2, 3, 4, 5);
int * dims = new int[a.order];
memcpy(dims, a.dimSize, sizeof(int) * a.order);
dims[I] = a.dimSize[J];
dims[J] = a.dimSize[I];
InitTensor(&b, 4, dims);
a.SetZeroAll();
b.SetZeroAll();
float * data = new float[a.unitNum];
for(int i = 0; i < a.unitNum; i++)
data[i] = (float)i;
a.SetData(data, a.unitNum, 0);
_Transpose(&a, &b, I, J);
b.Dump(stderr, "b:");
delete[] data;
} }
void LittleTest() //void myRead(XTensor * tensor, const char * filename, const char * label)
{ //{
int a = 5000; // FILE * file = fopen(filename, "rb");
int b = 100000; // if(file == NULL)
int c = a*b; // printf("%s\n", filename);
printf("%d\n", c); // tensor->Read(file, label);
//}
exit(1); //
} //void myDump(XTensor * tensor, const char * filename, const char * label)
//{
void T2TTest() // FILE * file = fopen(filename, "wb");
{ // if(file == NULL)
XTensor * input; // printf("%s\n", filename);
XTensor * weight; // tensor->Dump(file, label);
XTensor * output; //}
XTensor * gold; //
XTensor * dedy; //void PowerTest()
XTensor * dedx; //{
XTensor * dedxTmp; // XTensor input;
XTensor * dedw; // XTensor output;
XTensor * padding; // InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
// InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
DTYPE loss; // myRead(&input, "1.txt", "");
//
int * dimSize = new int[2]; // _Power(&input, &output, 2);
dimSize[0] = 256; // output.Dump(stderr, "", 200);
dimSize[1] = 10001; //}
//
int * dimSize2 = new int[3]; //void SmallTest()
dimSize2[0] = 2; //{
dimSize2[1] = 31; // XTensor a;
dimSize2[2] = 256; // XTensor b;
// XTensor c;
int * dimSize3 = new int[3]; // XTensor d;
dimSize3[0] = 2; //
dimSize3[1] = 31; // InitTensor2D(&a, 2, 2);
dimSize3[2] = 10001; // InitTensor2D(&b, 2, 2);
// a.SetZeroAll();
int * dimSize4 = new int[2]; // b.SetZeroAll();
dimSize4[0] = 2; // a.Set2D(1.0F, 0, 0);
dimSize4[1] = 31; // a.Set2D(2.0F, 1, 1);
//
input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0); // b = Sum(a, Multiply(a, a));
weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0); //
dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0); // /* this is prohibited !!!!!!!!!!!!! */
gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // //XTensor c = a * b + a;
output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // //XTensor d = a + b + c.Lin(0.5F);
dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); //
dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // c = a * b + a;
dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0); // d = a + b + c.Lin(0.5F);
padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0); //
// XLink::CheckNetwork(&d);
//weight = NewTensor(2, dimSize); // //XLink::ShowNetwork(stderr, &d);
//dedw = NewTensor(2, dimSize); //
//input = NewTensor(3, dimSize2); // a.Dump(stderr, "a:");
//gold = NewTensor(3, dimSize3); // b.Dump(stderr, "b:");
//output = NewTensor(3, dimSize3); // c.Dump(stderr, "c:");
//dedy = NewTensor(3, dimSize3); // d.Dump(stderr, "d:");
//dedx = NewTensor(3, dimSize3); //}
//dedxTmp = NewTensor(3, dimSize3); //
//padding = NewTensor(2, dimSize4); //void TransposeTest()
//{
myRead(input, "x.txt", "x"); // XTensor a;
myRead(weight, "w.txt", "w"); // XTensor b;
myRead(gold, "gold.txt", "gold"); //
myRead(padding, "padding.txt", "padding"); // int I = 2;
// int J = 3;
XTensor inter; //
inter = MMul(*input, *weight); // InitTensor4D(&a, 2, 3, 4, 5);
//
_Softmax(&inter, output, 2); // int * dims = new int[a.order];
// memcpy(dims, a.dimSize, sizeof(int) * a.order);
//_LogMe(output); // dims[I] = a.dimSize[J];
loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding); // dims[J] = a.dimSize[I];
//
printf("loss: %f\n", loss); // InitTensor(&b, 4, dims);
//
_CrossEntropyBackward(dedy, output, gold, NULL); // a.SetZeroAll();
//_CrossEntropyBackward(dedy, output, gold, NULL, padding); // b.SetZeroAll();
//
myDump(dedy, "dedy.txt", "dedy"); // float * data = new float[a.unitNum];
// for(int i = 0; i < a.unitNum; i++)
_SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS); // data[i] = (float)i;
_Sub(output, gold, dedxTmp); //
// a.SetData(data, a.unitNum, 0);
myDump(dedx, "dedx.txt", "dedx"); //
dedx->Dump(stderr, "dedx", 200); // _Transpose(&a, &b, I, J);
dedxTmp->Dump(stderr, "dedxTmp", 200); // b.Dump(stderr, "b:");
//
input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1)); // delete[] data;
dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1)); //}
//
_MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw); //void LittleTest()
//{
myDump(dedw, "dedw.txt", "dedw"); // int a = 5000;
} // int b = 100000;
// int c = a*b;
void T2TTest2() // printf("%d\n", c);
{ //
int dimSize[3]; // exit(1);
dimSize[0] = 161; //}
dimSize[1] = 47; //
dimSize[2] = 10001; //void T2TTest()
XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0); //{
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1); // XTensor * input;
// XTensor * weight;
//myRead(probs, "probs.txt", " "); // XTensor * output;
_SetDataFixedFloat(probs, 1.0F); // XTensor * gold;
// XTensor * dedy;
probs->Reshape(1, probs->unitNum); // XTensor * dedx;
// XTensor * dedxTmp;
DTYPE sum = _ReduceSumAll(probs); // XTensor * dedw;
printf("%e\n", sum); // XTensor * padding;
//
//XTensor tmp; // DTYPE loss;
//tmp = IsNonZero(*probs); //
//DTYPE nonZeroNum = ReduceSumAll(tmp); // int * dimSize = new int[2];
//printf("%f\n", nonZeroNum); // dimSize[0] = 256;
// // dimSize[1] = 10001;
//DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0); //
// int * dimSize2 = new int[3];
//printf("%e\n", gpu); // dimSize2[0] = 2;
} // dimSize2[1] = 31;
// dimSize2[2] = 256;
//
// int * dimSize3 = new int[3];
// dimSize3[0] = 2;
// dimSize3[1] = 31;
// dimSize3[2] = 10001;
//
// int * dimSize4 = new int[2];
// dimSize4[0] = 2;
// dimSize4[1] = 31;
//
// input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
// weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//
// //weight = NewTensor(2, dimSize);
// //dedw = NewTensor(2, dimSize);
// //input = NewTensor(3, dimSize2);
// //gold = NewTensor(3, dimSize3);
// //output = NewTensor(3, dimSize3);
// //dedy = NewTensor(3, dimSize3);
// //dedx = NewTensor(3, dimSize3);
// //dedxTmp = NewTensor(3, dimSize3);
// //padding = NewTensor(2, dimSize4);
//
// myRead(input, "x.txt", "x");
// myRead(weight, "w.txt", "w");
// myRead(gold, "gold.txt", "gold");
// myRead(padding, "padding.txt", "padding");
//
// XTensor inter;
// inter = MMul(*input, *weight);
//
// _Softmax(&inter, output, 2);
//
// //_LogMe(output);
// loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
//
// printf("loss: %f\n", loss);
//
// _CrossEntropyBackward(dedy, output, gold, NULL);
// //_CrossEntropyBackward(dedy, output, gold, NULL, padding);
//
// myDump(dedy, "dedy.txt", "dedy");
//
// _SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
// _Sub(output, gold, dedxTmp);
//
// myDump(dedx, "dedx.txt", "dedx");
// dedx->Dump(stderr, "dedx", 200);
// dedxTmp->Dump(stderr, "dedxTmp", 200);
//
// input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
// dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
//
// _MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
//
// myDump(dedw, "dedw.txt", "dedw");
//}
//
//void T2TTest2()
//{
// int dimSize[3];
// dimSize[0] = 161;
// dimSize[1] = 47;
// dimSize[2] = 10001;
// XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
// //XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//
// //myRead(probs, "probs.txt", " ");
// _SetDataFixedFloat(probs, 1.0F);
//
// probs->Reshape(1, probs->unitNum);
//
// DTYPE sum = _ReduceSumAll(probs);
// printf("%e\n", sum);
//
// //XTensor tmp;
// //tmp = IsNonZero(*probs);
// //DTYPE nonZeroNum = ReduceSumAll(tmp);
// //printf("%f\n", nonZeroNum);
// //
// //DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//
// //printf("%e\n", gpu);
//}
...@@ -196,17 +196,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, ...@@ -196,17 +196,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
delete[] dims; delete[] dims;
} }
if(padding != NULL) { //if(padding != NULL) {
XTensor * tmp = NewTensor(padding); // XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp); // _IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp); // int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); // _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp; // delete tmp;
} //}
else { //else {
int num = dedy->unitNum / dedy->GetDim(n); // int num = dedy->unitNum / dedy->GetDim(n);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num); // _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
} //}
} }
......