Commit f21e1b48 by xiaotong

work on the training process of the t2t lm

parent 5cd1be65
@@ -25,6 +25,7 @@
 #include "../tensor/function/FHeader.h"
 #include "../tensor/core/CHeader.h"
 #include "../sample/fnnlm/FNNLM.h"
+#include "../sample/transformer/Transformer.h"

 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
@@ -35,6 +36,7 @@ void SumDimTest();

 using namespace nts;
 using namespace fnnlm;
+using namespace transformer;

 int main( int argc, const char ** argv )
 {
@@ -44,10 +46,11 @@ int main( int argc, const char ** argv )
     //SumDimTest();
     //return 0;

-    if(argc > 1 && !strcmp(argv[1], "-test"))
-        1;//Test();
-    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
         FNNLMMain(argc - 1, argv + 1);
+    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
+        TransformerMain(argc - 1, argv + 1);
     else{
         fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
         fprintf(stderr, "neural networks in an easy way. \n\n");
......
@@ -614,7 +614,7 @@ void XMathGrad::GradNormalize(XTensor * node)
     XTensor * p = NewTensor(a);
     XTensor * q = NewTensor(a);
     XTensor * r = NewTensor(a);
-    DTYPE epsilon = income.GetParamInt(0);
+    DTYPE epsilon = income.GetParam(0);
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
@@ -742,7 +742,7 @@ void XMathGrad::GradReduceMean(XTensor * node)
     XNoder::MakeGrad(a);

     _Unsqueeze(node->grad, b, dim, n);
-    _ScaleAndShift(b, c, 1 / n);
+    _ScaleAndShift(b, c, 1.0F / n);
     _Sum(a->grad, c, a->grad);

     node->visitMark = NODE_FINISHED;
......
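Note on the GradReduceMean change: with int n, the old expression 1 / n is evaluated in integer arithmetic and truncates to 0 for any n > 1, silently zeroing the gradient; 1.0F / n forces float division. A minimal standalone sketch of the pitfall (illustrative, not code from this repo):

    #include <cstdio>

    int main()
    {
        int n = 8;
        float wrong = 1 / n;      /* integer division happens first: 0.0 */
        float right = 1.0F / n;   /* float division: 0.125 */
        printf("wrong = %f, right = %f\n", wrong, right);
        return 0;
    }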
@@ -258,10 +258,11 @@ void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
     if(node == NULL)
         return;

+    //fprintf(stderr, "%d\n", node->id);
     if(node->visitMark == code + 1){
         ShowNTErrors("There is a circle in the network\n");
     }
-    else if(node->visitMark <= code || node->visitMark >= code + 2){
+    else if(node->visitMark <= code){
         node->visitMark = code + 1;
         XLink &income = node->income;
         for(int i = 0; i < income.tailNum; i++){
......
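Note on the TarjanVisit change: the traversal appears to use a per-visit code base for visitMark (inferred from this diff, not stated in the source): a mark <= code means the node has not been visited in the current traversal, code + 1 marks a node on the active DFS path (meeting it again means a cycle), and code + 2 marks a finished node. The old condition also re-entered already-finished nodes; the new condition visitMark <= code leaves them alone.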
@@ -22,6 +22,7 @@
 #include <math.h>
 #include "T2TAttention.h"
 #include "T2TUtility.h"
+#include "T2TEmbedding.h"
 #include "../../tensor/core/CHeader.h"

 namespace transformer
@@ -56,9 +57,9 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
     float minmax = 0;

     LoadParamInt(argc, argv, "nhead", &nhead, 8);
-    LoadParamInt(argc, argv, "dk", &dk, 512);
-    LoadParamInt(argc, argv, "dv", &dv, 512);
-    LoadParamInt(argc, argv, "d", &d, 512);
+    LoadParamInt(argc, argv, "d", &dk, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &dv, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
     LoadParamFloat(argc, argv, "attminmax", &minmax, 0.08F);

     InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
@@ -104,12 +105,12 @@ XTensor * T2TAttention::Make(XTensor * k, XTensor * q, XTensor * v)
     /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
     scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/sqrt((float)dk)), -1);
-    att = MMul(scalar, vheads);
+    att = BMMul(scalar, vheads);

     XTensor * result = new XTensor();

     /* concatenate the heads */
-    *result = Merge(att, -1);
+    *result = Merge(att, att.order - 1);

     return result;
 }
......
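For reference, the comment in T2TAttention::Make is the scaled dot-product attention of Vaswani et al. (2017):

    \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left( \frac{Q K^{\top}}{\sqrt{d_k}} \right) V

Both matrix products act per head, which is presumably why the second product was switched from MMul to the batched BMMul to match BMMul(qheads, kheads) on the line above, and why the head concatenation now names the last axis explicitly as att.order - 1.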
@@ -57,7 +57,8 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
     LoadParamInt(argc, argv, "vsize", &vSize, -1);
     LoadParamInt(argc, argv, "maxlen", &maxLength, 256);
-    LoadParamInt(argc, argv, "d", &d, 256);
+    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);

     InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
@@ -74,9 +75,9 @@ length - length of the sequenc
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
-    InitTensor2D(&posEmbedding, length, eSize, X_FLOAT, devID, mem);
+    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem);

-    float * data = new float[posEmbedding.unitNum];
+    float * data = new float[posEmbeddingBase.unitNum];

     for(int pos = 0; pos < length; pos++){
         float * dp = data + pos * eSize;
@@ -92,7 +93,7 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
         }
     }

-    posEmbedding.SetData(data, posEmbedding.unitNum);
+    posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);

     delete[] data;
 }
@@ -105,11 +106,12 @@ XTensor * T2TEmbedder::Make(XTensor * input)
     CheckNTErrors(input->GetDim(-1) == vSize, "Wrong vocabulary size!");
     CheckNTErrors(input->order > 1, "Wrong input tensor size!");
     CheckNTErrors(input->dimSize[input->order - 2] < maxLength, "The sequence is too long!");
+    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
+    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

     int dims[MAX_TENSOR_DIM_NUM];
-    memcpy(dims, input->dimSize, input->order);
-    dims[0] = eSize;
+    memcpy(dims, input->dimSize, input->order * sizeof(int));
+    dims[input->order - 1] = eSize;

     bool match = (posEmbedding.order == input->order);
     if(match){
@@ -122,17 +124,10 @@ XTensor * T2TEmbedder::Make(XTensor * input)
     /* we make positional embeddings first */
     if(!match){
         InitTensor(&posEmbedding, input->order, dims, X_FLOAT, 1.0F, devID, mem);
-        XTensor * posTMP = NewTensorBuf(2, dims, X_FLOAT, 1.0F, devID, mem);
-
-        int dims2[MAX_TENSOR_DIM_NUM];
-        dims2[0] = dims[0];
-        dims2[1] = dims[1];
-        dims2[2] = posEmbedding.unitNum / (dims[0] * dims[1]);
-        posEmbedding.Reshape(3, dims2);
-        _Unsqueeze(posTMP, &posEmbedding, 0, dims2[2]);
-        posEmbedding.Reshape(input->order, dims);
+        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
+
+        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
+        _Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);

         DelTensorBuf(posTMP);
     }
......
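The loop body collapsed out of the MakePosEmbedding hunk presumably fills the standard sinusoidal table of Vaswani et al. (2017), one row per position:

    PE(pos, 2i)   = \sin\left( pos / 10000^{2i/d} \right)
    PE(pos, 2i+1) = \cos\left( pos / 10000^{2i/d} \right)

The rename to posEmbeddingBase separates this fixed (length x eSize) table from posEmbedding, which Make now rebuilds at the input's full order by copying the base table into posTMP and unsqueezing it along the batch dimension.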
@@ -29,6 +29,8 @@ using namespace nts;
 namespace transformer
 {

+#define DEFAULT_BEDDING_SIZE 512
+
 /*
 embedding (of word at position i):
 word embedding + positional embedding
......
@@ -21,6 +21,7 @@
 #include "T2TFNN.h"
 #include "T2TUtility.h"
+#include "T2TEmbedding.h"
 #include "../../tensor/core/CHeader.h"
 #include "../../tensor/function/FHeader.h"
@@ -54,9 +55,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     float minmax = 0;

-    LoadParamInt(argc, argv, "d", &inSize, 512);
-    LoadParamInt(argc, argv, "d", &outSize, 512);
-    LoadParamInt(argc, argv, "fnnh", &hSize, 512);
+    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_BEDDING_SIZE);
     LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.08F);

     InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
......
@@ -20,6 +20,7 @@
 */

 #include "T2TLayerNormal.h"
+#include "../../tensor/core/CHeader.h"

 namespace transformer
 {
@@ -58,7 +59,32 @@ y =
 */
 XTensor * T2TLN::Make(XTensor * input)
 {
-    return NULL;
+    XTensor &x = *input;
+    XTensor mean;
+    XTensor variance;
+    XTensor standard;
+    XTensor meanFilled;
+    XTensor standardFilled;
+    XTensor * result = new XTensor();
+
+    /* \mu = (sum_i x_i)/m */
+    mean = ReduceSum(x, x.order - 1);
+
+    /* \sigma = (sum_i (x_i - \mu)^2)/m */
+    variance = ReduceVariance(x, x.order - 1, mean);
+
+    /* standard = sqrt(variance) */
+    standard = Power(variance, 0.5F);
+
+    /* unsqueeze mean and standard deviation to fit them into
+       the same size of x */
+    meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
+    standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
+
+    /* x' = (x - \mu)/standard */
+    *result = (x - meanFilled)/standardFilled;
+
+    return result;
 }

 }
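In equation form, the new T2TLN::Make computes, along the last dimension of x:

    \mathrm{LN}(x) = \frac{x - \mu}{\sigma}, \qquad
    \mu = \frac{1}{m} \sum_i x_i, \qquad
    \sigma = \sqrt{ \frac{1}{m} \sum_i (x_i - \mu)^2 }

One caveat worth checking: the code comments describe means, but ReduceSum returns an unnormalized sum, and this diff does not show whether ReduceVariance expects the sum or the mean as its third argument.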
@@ -21,6 +21,7 @@
 #include "T2TOutput.h"
 #include "T2TUtility.h"
+#include "T2TEmbedding.h"
 #include "../../tensor/core/CHeader.h"

 namespace transformer
@@ -53,10 +54,11 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
     mem = myMem;

     LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    LoadParamInt(argc, argv, "hsize", &inSize, 512);
-    LoadParamInt(argc, argv, "hsize", &hSize, 512);
-}
+    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_BEDDING_SIZE);
+
+    InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
+}

 /*
 make the network
......
@@ -31,6 +31,8 @@ namespace transformer
 /* constructor */
 T2TTrainer::T2TTrainer()
 {
+    devID = -1;
+    mem = NULL;
     seqLen = NULL;
     nseqBuf = 0;
     nextSeq = -1;
@@ -50,18 +52,19 @@ initialization
 */
 void T2TTrainer::Init(int argc, const char ** argv)
 {
+    LoadParamInt(argc, argv, "dev", &devID, -1);
     LoadParamFloat(argc, argv, "lrate", &lrate, 0.001F);
     LoadParamInt(argc, argv, "sbatch", &sBatchSize, 1);
     LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1);
     LoadParamInt(argc, argv, "nepoch", &nepoch, 1);
     LoadParamInt(argc, argv, "nstep", &nstep, 1);
+    LoadParamInt(argc, argv, "vsize", &vSize, 1);
     LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
+    LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);

-    int maxUnitInBuf;
-    LoadParamInt(argc, argv, "bufsize", &maxUnitInBuf, 20000);
-    buf = new int[maxUnitInBuf];
-    seqLen = new int[maxUnitInBuf];
-    seqOffset = new int[maxUnitInBuf];
+    buf = new int[bufSize];
+    seqLen = new int[bufSize];
+    seqOffset = new int[bufSize];
 }

 /*
@@ -79,6 +82,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
     bool isEnd = false;
     float loss = 0;

+    XNet net;
+
     double startT = GetClockSec();

     for(epoch = 0; epoch < nepoch; epoch++){
@@ -99,6 +104,9 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
             /* make the network */
             model->Make(&batch, &output);

+            /* back-propagation for obtaining gradients */
+            net.Backward(output, batch, CROSSENTROPY);
+
             /* TODO: update the model!!!! */

             /* get probabilities */
@@ -188,7 +196,7 @@ int T2TTrainer::LoadBuf(FILE * file)
         wordCount += wNum;
         lineCount++;

-        if(wordCount >= wBatchSize || lineCount >= sBatchSize)
+        if(wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
             break;
     }
@@ -211,24 +219,28 @@ load a batch of sequences
 */
 int T2TTrainer::LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount)
 {
-    if(nextSeq >= nseqBuf)
+    if(nextSeq < 0 || nextSeq >= nseqBuf)
         LoadBuf(file);

-    int seq = nextSeq;
+    int seq = MAX(nextSeq, 0);
     int wc = 0;
+    int wn = 0;
     int sc = 0;
     int max = 0;

-    while(seq < nseqBuf){
-        wc += seqLen[seq];
+    while(seq + sc < nseqBuf){
+        wn = seqLen[seq + sc];
+        wc += wn;
         sc += 1;

-        if(max < wc)
-            max = wc;
+        if(max < wn)
+            max = wn;

         if(sc >= sBatch && wc >= wBatch)
             break;
     }

+    nextSeq = seq + sc;
+
     if(sc > 0){
         int dims[MAX_TENSOR_DIM_NUM];
         dims[0] = sc;
......
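A worked example of the revised batching loop, under assumed values (not from the source): with seqLen = {5, 7, 3, 9}, sBatch = 2 and wBatch = 10, the loop takes the first two sequences (sc = 2, wc = 12 >= 10) and records max = 7, the longest sequence in the batch; the old code tracked the running word count instead, which is not a usable padding length. Storing nextSeq = seq + sc afterwards lets the next call resume where this batch ended instead of re-reading the same sequences.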
@@ -26,7 +26,7 @@
 #include "../../tensor/function/FHeader.h"

-#define MAX_SEQUENCE_LENGTH 1024 * 64
+#define MAX_SEQUENCE_LENGTH 1024 * 4

 using namespace nts;
@@ -46,6 +46,9 @@ public:
     /* buffer for loading words */
     int * buf;

+    /* buffer size */
+    int bufSize;
+
     /* length of each sequence */
     int * seqLen;
......
@@ -34,8 +34,8 @@ void LoadParamString(int argc, const char ** argv, const char * name, char * p,
     bool hit = false;
     for(int i = 0; i < argc; i++){
         if(!strcmp(argv[i], vname) && i + 1 < argc){
-            *(int*)p = atoi(argv[i + 1]);
-            fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
+            strcpy(p, argv[i + 1]);
+            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
             hit = true;
         }
     }
@@ -52,7 +52,7 @@ void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int
     for(int i = 0; i < argc; i++){
         if(!strcmp(argv[i], vname) && i + 1 < argc){
             *(int*)p = atoi(argv[i + 1]);
-            fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
+            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
             hit = true;
         }
     }
@@ -69,7 +69,8 @@ void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bo
     for(int i = 0; i < argc; i++){
         if(!strcmp(argv[i], vname)){
             *(bool*)p = true;
-            fprintf(stderr, " %s=%s\n", name, "true");
+            //fprintf(stderr, " %s=%s\n", name, "true");
+            hit = true;
         }
     }
     if(!hit)
@@ -84,12 +85,27 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
     bool hit = false;
     for(int i = 0; i < argc; i++){
         if(!strcmp(argv[i], vname) && i + 1 < argc){
-            strcpy((char*)p, argv[i + 1]);
-            fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
+            *p = (float)atof(argv[i + 1]);
+            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
+            hit = true;
         }
     }
     if(!hit)
         *p = defaultP;
 }

+void ShowParams(int argc, const char ** argv)
+{
+    fprintf(stderr, "args:\n");
+    for(int i = 0; i < argc; i++){
+        if(argv[i][0] == '-'){
+            if(i + 1 < argc && argv[i + 1][0] != '-')
+                fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
+            else
+                fprintf(stderr, " %s=yes\n", argv[i]);
+        }
+    }
+    fprintf(stderr, "\n");
+}
+
 }
\ No newline at end of file
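The T2TUtility fixes are worth spelling out: LoadParamString previously wrote an int through the char pointer (*(int*)p = atoi(...)), LoadParamFloat previously strcpy'd the argument string over a float, and LoadParamBool and LoadParamFloat never set hit, so the default value always overwrote whatever had been parsed. After this change each setter matches its type (strcpy for strings, atoi for ints, atof for floats) and marks hit so the default applies only when the flag is absent.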
@@ -27,12 +27,15 @@
 namespace transformer
 {

-/* load model parameters */
+/* load arguments */
 void LoadParamString(int argc, const char ** argv, const char * name, char * p, char * defaultP);
 void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
 void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP);
 void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP);

+/* show arguments */
+void ShowParams(int argc, const char ** argv);
+
 }

 #endif
\ No newline at end of file
@@ -20,12 +20,36 @@
 */

 #include "Transformer.h"
+#include "T2TModel.h"
+#include "T2TUtility.h"
+#include "T2TTrainer.h"

 namespace transformer
 {

 int TransformerMain(int argc, const char ** argv)
 {
+    if(argc == 0)
+        return 1;
+
+    ShowParams(argc, argv);
+
+    char * trainFN = new char[MAX_LINE_LENGTH];
+    LoadParamString(argc, argv, "train", trainFN, "");
+
+    T2TModel model;
+    model.InitModel(argc, argv);
+
+    if(strcmp(trainFN, "")){
+        T2TTrainer trainer;
+        trainer.Init(argc, argv);
+        trainer.Train(trainFN, &model);
+    }
+
+    delete[] trainFN;
+
     return 0;
 }
......
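Putting the new entry point together with the flags registered across this commit, a training run would presumably look like this (the binary name and values are illustrative, not from the source):

    ./NiuTrans.Network -t2t -train train.txt -dev 0 -vsize 34000 -d 512 -nhead 8 -lrate 0.001 -nepoch 5 -wbatch 2048

-t2t routes main() to TransformerMain, -train selects the training file checked by strcmp(trainFN, ""), and the remaining flags are consumed by the LoadParam* calls shown above.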
@@ -38,7 +38,7 @@ namespace transformer
 {

 /* entrance of the program */
-int TransformerMMain(int argc, const char ** argv);
+int TransformerMain(int argc, const char ** argv);

 }
......
@@ -42,6 +42,8 @@
 #include "core/movement/CopyValues.h"
 #include "core/arithmetic/Sum.h"
 #include "core/arithmetic/Multiply.h"
+#include "core/arithmetic/Sub.h"
+#include "core/arithmetic/Div.h"
 #include "core/math/ScaleAndShift.h"

 #ifdef USE_CUDA
@@ -354,6 +356,18 @@ XTensor XTensor::operator* (const XTensor& tensor)
     return Multiply(*this, tensor);
 }

+/* overloading of the minus-sign */
+XTensor XTensor::operator- (const XTensor& tensor)
+{
+    return Sub(*this, tensor);
+}
+
+/* overloading of the division-sign */
+XTensor XTensor::operator/ (const XTensor& tensor)
+{
+    return Div(*this, tensor);
+}
+
 /*
 linear transformation b = a * \scale + \shift
 >> scale - the slope
@@ -610,8 +624,8 @@ double GaussRand()
     double pi = 3.141592654;

     if (phase == 0){
-        u = rand() / (RAND_MAX + 1.0);
-        v = rand() / (RAND_MAX + 1.0);
+        u = (rand() + 1) / (RAND_MAX + 1.0);
+        v = (rand() + 1) / (RAND_MAX + 1.0);
         z = sqrt(-2.0 * log(u))* sin(2.0 * pi * v);
     }
     else{
@@ -1008,8 +1022,8 @@ set the value of a cell in a 3d tensor in default type
 bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
 {
     CheckNTErrors((order == 3), "Cannot get a 2d cell for a tensor whose order is not 2!");
-    CheckNTErrors((d0 >= 0 && d1 < dimSize[0]), "dimension 0 is out of range!");
-    CheckNTErrors((d2 >= 0 && d2 < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((d0 >= 0 && d0 < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((d2 >= 0 && d1 < dimSize[1]), "dimension 1 is out of range!");
     CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 1 is out of range!");
     CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
......
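On the GaussRand change: the function is a Box-Muller sampler,

    z = \sqrt{-2 \ln u} \, \sin(2 \pi v)

and rand() may return 0, in which case the old u = rand() / (RAND_MAX + 1.0) gives \ln 0 = -\infty. Shifting both draws by one keeps u and v in (0, 1], so the logarithm stays finite.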
@@ -203,6 +203,12 @@ public:
     /* overloading of the multiply-sign */
     XTensor operator* (const XTensor &tensor);

+    /* overloading of the minus-sign */
+    XTensor operator- (const XTensor &tensor);
+
+    /* overloading of the division-sign */
+    XTensor operator/ (const XTensor &tensor);
+
     /* linear transformation */
     XTensor Lin(DTYPE scale, DTYPE shift = 0);
......
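The header declarations mirror the definitions above; the immediate consumer is the layer-normalization code in this same commit, where the overloads let the whole normalization read as one expression:

    *result = (x - meanFilled)/standardFilled;   /* operator- calls Sub, operator/ calls Div */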
@@ -251,9 +251,7 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
 /*
 matrix multiplication with no transposition c = a * b * alpha
 >> a - tensor a
->> transposedA - indicates whether the matrices in a are transposed
 >> b - tensor b
->> transposedB - indicates whether teh matrices in b are transposed
 >> alpha - a coefficient
 >> parallelRunner - parallel processing module
 << return - the result of matrix multiplication
......
@@ -326,4 +326,60 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
     return c;
 }

+/*
+matrix multiplication of the two tensors (return an XTensor structure)
+c = a * b * alpha
+make a new tensor to keep the result and return it
+
+for each 2-dimensional data array in a (denoted as ai) and
+each 2-dimensional data array in b (denoted as bi), we have
+ci = ai * bi * alpha + cm * beta
+
+>> a - tensor a
+>> b - tensor b
+>> alpha - a coefficient
+>> parallelRunner - parallel processing module
+<< return - the result of matrix multiplication of the two tensors
+*/
+XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
+                         DTYPE alpha, XPRunner * parallelRunner)
+{
+    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
+    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
+    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
+
+    int an = a.dimSizeRDI[1];
+    int am = a.dimSizeRDI[0];
+    int bn = b.dimSizeRDI[1];
+    int bm = b.dimSizeRDI[0];
+
+    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
+
+    int order = a.order;
+    int sub = 0;
+    int * dimSize = new int[order];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    dimSize[sub++] = an;
+    dimSize[sub++] = bm;
+
+    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
+    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
+    c.SetTMP();
+
+    /* call _MatrixMulBatched function */
+    _MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
+
+    /* tensor connections */
+    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
+    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
+    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
+    XLink::AddParamToHead(&c, alpha);
+
+    /* destroy variables */
+    delete[] dimSize;
+
+    return c;
+}
+
 } // namespace nts(NiuTrans.Tensor)
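In index form, the new two-argument overload computes, for every leading index i,

    c_i = \alpha \, a_i b_i

where a_i and b_i are the trailing 2-dimensional slices, so a of shape (batch, n, m) times b of shape (batch, m, k) yields c of shape (batch, n, k). It simply forwards to the flagged version with X_NOTRANS on both sides; this is presumably the overload behind the two-argument BMMul(scalar, vheads) call in T2TAttention.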
@@ -73,6 +73,17 @@ where trans() returns the transposed matrix if the flag is fired
 XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                          DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

+/*
+matrix multiplication of the two tensors (return an XTensor structure) c = a * b * alpha
+make a new tensor to keep the result and return it
+for each 2-dimensional data array in a (denoted as ai) and
+each 2-dimensional data array in b (denoted as bi), we have
+ci = ai * bi * alpha + cm * beta
+*/
+XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
+                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
+
 } // namespace nts(NiuTrans.Tensor)

 #endif // __MATRIXMULBATCHED_H__
\ No newline at end of file
@@ -36,7 +36,7 @@ copy s to t
 void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
 {
     CheckNTErrors((s != NULL && t != NULL), "The input tensor and output tensor must be nonempty!");
-    CheckNTErrors((s->data != NULL), "Cannot copy from an empty data array!");
+    CheckNTErrors((s->data != NULL), "Cannot copy an empty data array!");
     CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!");
     CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!");
@@ -82,7 +82,7 @@ copy s to t
 void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
 {
     CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
-    CheckNTErrors(s->data != NULL && t->data != NULL, "Cannot copy from an empty data array!");
+    CheckNTErrors(s->data != NULL && t->data != NULL, "Cannot copy an empty data array!");
     CheckNTErrors(s->unitSize == t->unitSize, "The input tensors must be of the same unit size!");
     CheckNTErrors(s->order > sBeg && sBeg >= 0 && sLen <= s->unitNum, "Wrong segment on the source side");
     CheckNTErrors(t->order > tBeg && tBeg >= 0, "Wrong segment on the target side");
......
@@ -168,6 +168,8 @@ make a new tensor to keep the result and return it
 XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
 {
     CheckNTErrors(&s, "Invalid tensors!");
+    CheckNTErrors(s.dimSize[whereToSplit] % splitNum == 0,
+                  "The dimension cannot be split due to the improper split number");

     int order = s.order + 1;
     int * dimSize = new int[order];
......
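The new check matters for the transformer code in this commit: splitting the hidden dimension d = 512 (DEFAULT_BEDDING_SIZE) into nhead = 8 heads, which the head split in T2TAttention presumably relies on, requires 512 % 8 == 0 and gives 64 units per head; an indivisible pair would previously have produced a wrong shape silently instead of failing fast.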
@@ -282,6 +282,9 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
     CheckNTErrors((gold != NULL), "The gold standard cannot be empty!");

+    if(leadDim < 0)
+        leadDim = y->order - 1;
+
     int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
     if (gold->devID >= 0) {
......
@@ -188,7 +188,11 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     CheckNTErrors((dedx->isSparse == false), "The gradient tensor must be dense!");
     CheckNTErrors((gold != NULL), "Incorrect x gold standard tensor!");

+    if(leadDim < 0)
+        leadDim = y->order - 1;
+
     int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
     if(y->devID >= 0){
         _CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
......