Commit 7809ed05 by xuchen

Merge branch 'xuchen' into xiaotong-working

parents f4be1882 03a9836e
......@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
......@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
......
......@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
......@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
......@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
......@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
_LossBackward(dedy, gold, y, lossName);
//_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold, NULL, padding);
}
}
\ No newline at end of file
......@@ -36,13 +36,13 @@ class XLossGrad
public:
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName);
};
......
......@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
......
......@@ -55,7 +55,7 @@ void XNetClearAll()
XNet::XNet()
{
nodes.Clear();
isGradEfficient = true;
isGradEfficient = false;
}
/* de-constructor */
......@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
XList golds(1);
golds.Add(&gold);
Backward(roots, golds, loss);
XList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
roots.Add(&root);
XList golds(1);
golds.Add(&gold);
XList paddings(1);
paddings.Add(&padding);
Backward(roots, golds, paddings, loss);
}
/*
......@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
XList golds(1);
golds.Add(NULL);
Backward(roots, golds, loss);
XList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
......@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> gold - a list of gold standard for the output
>> padding - specify a target value that is ignored
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
{
Traverse(roots);
......@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;
......@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
Note that we do not need to obtain dE/dy here because it is no use in the
folloing process of back-propagation */
if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
root->visitMark = NODE_FINISHED;
if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
else {
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, padding, loss);
}
}
/* we compuate dE/dy (y is the output) if no predefined activation function is used */
else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, loss);
lossGrad.Compute(gold, root, root->grad, NULL, loss);
}
}
......@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
/*
backward propagation to obtain gradient
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
{
XList golds(roots.count);
for(int i = 0; i < roots.count; i++)
XList paddings(roots.count);
for(int i = 0; i < roots.count; i++) {
golds.Add(NULL);
paddings.Add(NULL);
}
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
{
XList paddings(roots.count);
for(int i = 0; i < roots.count; i++)
paddings.Add(NULL);
Backward(roots, golds, loss);
Backward(roots, golds, paddings, loss);
}
/*
......
......@@ -62,17 +62,24 @@ struct XNet
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......
......@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
if(isEnd)
break;
Test(testFN, outputFN, model);
}
double elapsed = GetClockSec() - startT;
......@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* for y = softmax(s), we get dE/ds
where E is the error function (define by loss) */
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
/* for s = x * w, we get
dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
......
......@@ -68,9 +68,10 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
}
/*
make positional embeddings (of size eSize * length
eSize - embedding size
length - length of the sequenc
make positional embeddings (of size eSize * length)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
......@@ -114,15 +115,15 @@ make the network
*/
XTensor T2TEmbedder::Make(XTensor &input)
{
CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
//CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order - 1] = eSize;
dims[input.order] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
......@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
......@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* then we make word embeddings */
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
//wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
......
......@@ -121,14 +121,22 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
XTensor encoding;
/* generate mask to see "previous" words only */
int len = input.GetDim(input.order - 2);
int * dims = new int[input.order + 1];
//int len = input.GetDim(input.order - 2);
//int * dims = new int[input.order + 1];
//for(int i = 0; i < input.order; i++)
// dims[i + 1] = input.GetDim(i);
//dims[0] = nhead;
//dims[input.order] = len;
//XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
int len = input.GetDim(input.order - 1);
int * dims = new int[input.order + 2];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order] = len;
XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
dims[input.order + 1] = len;
XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
/* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
......@@ -140,24 +148,24 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
padding.denseRatio, padding.devID, padding.mem);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
/* mask of the padding */
_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//_Sum(&mask, padding3, &mask);
//XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
// padding.denseRatio, padding.devID, padding.mem);
//
///* mask of the padding */
//_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
//_Unsqueeze(padding2, padding3, 0, nhead);
//
//_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//
////_Sum(&mask, padding3, &mask);
encoding = MakeEncoder(input, mask, isTraining);
outputLayer.Make(encoding, output);
......@@ -165,8 +173,8 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
delete[] dims;
delete[] dimsPadding;
//DelTensorBuf(padding3);
DelTensorBuf(padding2);
DelTensorBuf(padding3);
}
/*
......@@ -235,8 +243,8 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding2);
DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
......
......@@ -93,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, w), -1);
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
}
}
......@@ -103,6 +103,10 @@ public:
/* indicates whether we use adam */
bool useAdam;
int validStep;
int curEpoch;
/* hyper parameters of adam*/
float adamBeta1;
float adamBeta2;
......@@ -131,7 +135,7 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we double the </s> symble for the output of lms */
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
......@@ -150,7 +154,7 @@ public:
void Init(int argc, char ** argv);
/* train the model */
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
......@@ -172,7 +176,28 @@ public:
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
......
......@@ -25,6 +25,8 @@
#include "T2TUtility.h"
#include "T2TTrainer.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
namespace transformer
{
......@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
T2TTrainer trainer;
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
if(strcmp(trainFN, "")) {
double startT = GetClockSec();
T2TTrainer trainer;
trainer.Init(argc, args);
char * fn = new char[MAX_LINE_LENGTH];
char * fn1 = new char[MAX_LINE_LENGTH];
char * fn2 = new char[MAX_LINE_LENGTH];
modelFN = strcmp(modelFN, "") ? modelFN : (char *)"checkpoint.model";
int epoch;
bool isTrain;
for(epoch = 1; epoch <= trainer.nepoch; epoch++) {
sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
if(epoch == 1) {
T2TModel model;
model.InitModel(argc, args);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
else {
T2TModel model;
model.InitModel(argc, args);
model.Read(fn);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
if(trainer.useEpochCheckpoint && strcmp(testFN, "")) {
T2TTrainer tester;
tester.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
model.Read(fn1);
tester.Test(testFN, fn2, &model);
}
if(!isTrain)
break;
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, trainer.nepoch);
XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);
delete[] fn;
delete[] fn1;
delete[] fn2;
}
/* don't dump the final model */
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
// model.Dump(modelFN);
T2TModel model;
model.InitModel(argc, args);
/* load the model if neccessary */
if(strcmp(modelFN, ""))
......
......@@ -446,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x");
CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] &&
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
blockSize[0] = bXSize;
blockSize[1] = bYSize;
......
......@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
cublasDestroy(cublasHandle);
if(myIsForComputation)
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle.");
CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
SetDevice(devIDBackup);
#endif
......@@ -1392,8 +1393,8 @@ void XMem::CreateBLASHandle()
"Cannot destroy the cublas handle.");
}
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
#endif
}
......
......@@ -35,6 +35,8 @@ const char * GetOPName(int type)
return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_ISNONZERO)
return "M_ISNONZERO";
else if (type == MATH_ISZERO)
return "M_ISZERO";
else if (type == MATH_LOG)
......
......@@ -35,7 +35,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CEIL MATH_ABSOLUTE + 1
#define MATH_EXP MATH_CEIL + 1
#define MATH_FLOOR MATH_EXP + 1
#define MATH_ISZERO MATH_FLOOR + 1
#define MATH_ISNONZERO MATH_FLOOR + 1
#define MATH_ISZERO MATH_ISNONZERO + 1
#define MATH_LOG MATH_ISZERO + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_SQUARE MATH_SQRT + 1
......
......@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)
/*
set the value of a cell
>> value - value to assign to the cell
>> value - value we tend to set
>> index - index of the cell for each dimension
>>
>> size - size of the index
*/
bool XTensor::Set(DTYPE value, int index[], int size)
{
......@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)
/*
set the value of a cell in a 1d tensor
>> value - value to assign to the cell
>> value - value we tend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1D(DTYPE value, int i)
{
......@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
return SetToDevice(devID, GetCell(dims, 3), value);
}
/*
set the integer value of a cell
>> value - value we tend to set
>> index - index of the cell for each dimension
>> size - size of the index
<< return - succeeded or not
*/
bool XTensor::SetInt(int value, int index[], int size)
{
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
return SetToDeviceInt(devID, GetCell(index, size), value);
}
/*
set the integer value of a cell in a 1d tensor
>> value - value we tend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1DInt(int value, int i)
{
CheckNTErrors((order == 1), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[1] = {i};
return SetToDeviceInt(devID, GetCell(dims, 1), value);
}
/*
set the integer value of a cell in a 2d tensor in default type
>> value - value we tend to set
>> ni - row index
>> mi - column index
<< return - succeeded or not
*/
bool XTensor::Set2DInt(int value, int ni, int mi)
{
CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[2] = {ni, mi};
return SetToDeviceInt(devID, GetCell(dims, 2), value);
}
/*
set the integer value of a cell in a 3d tensor in default type
>> value - value we tend to set
>> d0 - index of demension 0
>> d1 - index of demension 1
>> d2 - index of demension 2
<< return - succeeded or not
*/
bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
{
CheckNTErrors(order == 3, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[3] = {d0, d1, d2};
return SetToDeviceInt(devID, GetCell(dims, 3), value);
}
/*
increase the value of a cell in a 2d tensor
>> value - value we tend to set
......@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
}
if(myMem != NULL)
tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
else
......@@ -2135,7 +2211,7 @@ generate a copy of XTensor
>> isFilledData - indicates whether we allocate the data for
the newly-generated tensor
*/
XTensor * NewTensor(XTensor * a, bool isFilledData)
XTensor * NewTensor(const XTensor * a, bool isFilledData)
{
int dims[MAX_TENSOR_DIM_NUM];
......
......@@ -326,6 +326,18 @@ public:
/* set the value of a cell in a 3d tensor */
bool Set3D(DTYPE value, int d0, int d1, int d2);
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i);
/* set the integer value of a cell in a 2d tensor */
bool Set2DInt(int value, int ni, int mi);
/* set the integer value of a cell in a 3d tensor */
bool Set3DInt(int value, int d0, int d1, int d2);
/* increase the value of a cell in a 2d */
bool Add2D(DTYPE value, int ni, int mi);
......@@ -450,7 +462,7 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
const int myDevID = -1, XMem * myMem = NULL);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(XTensor * a, bool isFilledData = true);
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
/* free the data space of a given tensor */
void DelTensor(XTensor * tensor);
......
......@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
return true;
}
/* assign a integer number to a variable that is kept on a specified device */
bool SetToDeviceInt(int devID, void * p, int value)
{
if(p == NULL)
return false;
if(devID < 0)
*(int*)p = value;
else{
XMemCopy(p, devID, &value, -1, sizeof(int));
}
return true;
}
/* get the next number with power of 2 */
unsigned int GetNextPower2(unsigned int n)
{
......
......@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
extern DTYPE ToCPU(int devID, void * value);
extern int ToCPUInt(int devID, void * value);
extern bool SetToDevice(int devID, void * p, DTYPE value);
extern bool SetToDeviceInt(int devID, void * p, int value);
extern unsigned int GetNextPower2(unsigned int n);
extern void XSleep(int sleepTime);
extern double GetClock();
......
......@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize;
}
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0) * std;
_SetDataRand(tensor, -a, a);
DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
tensor->SetDataRand(-finfout, finfout);
//_SetDataRand(tensor, -finfout, finfout);
}
/*
......@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
if(tensor == NULL)
return;
/* GPU code */
/* CPU code */
if(tensor->devID < 0){
DTYPE variance = upper - lower;
......
......@@ -37,6 +37,11 @@ DTYPE round(DTYPE r)
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
DTYPE isnonzero(DTYPE r)
{
return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
}
DTYPE iszero(DTYPE r)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
......@@ -93,6 +98,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
......@@ -173,6 +182,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
......
......@@ -41,11 +41,18 @@ DTYPE cudaround(DTYPE r)
}
__device__
DTYPE cudaisnonzero(DTYPE r)
{
return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
}
__device__
DTYPE cudaiszero(DTYPE r)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
......@@ -96,6 +103,7 @@ SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, cudaisnonzero)
SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)
......
......@@ -66,6 +66,15 @@ void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsNonZero(DTYPE * a, DTYPE * b, int size);
/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type*/
__global__
void KernelIsNonZero(__half * a, __half * b, int size);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void _CudaIsNonZero(const XTensor * a, XTensor * b);
/* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsZero(DTYPE * a, DTYPE * b, int size);
......
......@@ -63,6 +63,15 @@ void _FloorMe(XTensor * a);
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void _IsNonZero(const XTensor *a, XTensor *b);
/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
keep the result in the input tensor a and return nothing */
void _IsNonZeroMe(XTensor *a);
/* if source entry is non-zero, set target entry to be one, otherwise zero (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor IsNonZero(const XTensor &a);
/* if source entry is zero, set target entry to be one, otherwise zero */
void _IsZero(const XTensor *a, XTensor *b);
/* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
......
......@@ -21,6 +21,8 @@
#include "Gather.h"
#include "CopyIndexed.h"
#include "../../XUtility.h"
#include "../shape/Reshape.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
return result;
}
/*
gather indexed sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor(2D)
>> index - the index tensor
<< return - the result of copying indexed sub-tensors
*/
XTensor Gather(const XTensor &s, const XTensor &index)
{
int indexSize = index.unitNum;
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int * srcIndex = new int[index.unitNum];
if(index.dataType == X_INT) {
XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * index.unitSize);
}
else if(index.dataType == X_FLOAT || index.dataType == X_DOUBLE) {
DTYPE * tmp = new DTYPE[indexSize];
XMemCopy(tmp, -1, index.data, index.devID, indexSize * index.unitSize);
for(int i = 0; i < indexSize; i++)
srcIndex[i] = (int)tmp[i];
delete[] tmp;
}
XTensor tensor;
tensor = Gather(s, 0, srcIndex, indexSize);
delete[] srcIndex;
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = tensor.GetDim(-1);
XTensor t;
t = Reshape(tensor, index.order + 1, dims);
delete[] dims;
return t;
}
else {
return tensor;
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, const XTensor &index);
} // namespace nts(NiuTrans.Tensor)
#endif // __GATHER_H__
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <math.h>
#include "ReduceSum.h"
......
......@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
>> source - the inpute tensor
<< return - the total summation
*/
DTYPE _ReduceSumAll(XTensor * source)
DTYPE _ReduceSumAll(const XTensor * source)
{
int order = source->order;
DTYPE summation;
XTensor * big = NewTensor(source);
_CopyValues(source, big);
for(int i = 0; i < order; i++) {
if(i == order - 1)
big->Reshape(big->unitNum, 1);
for(int i = order - 1; i >= 0; i--) {
if(i == 0)
big->Reshape(1, big->unitNum);
int leadingDim = big->order - 1;
int * dimSize;
dimSize = getDimSize(big, 0);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
_ReduceSum(big, little, 0);
_ReduceSum(big, little, leadingDim);
delete big;
delete dimSize;
......@@ -81,7 +82,7 @@ sum all the items of the tensor
>> source - the inpute tensor
<< return - the total summation
*/
DTYPE ReduceSumAll(XTensor & source)
DTYPE ReduceSumAll(const XTensor & source)
{
return _ReduceSumAll(&source);
}
......
......@@ -28,10 +28,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */
DTYPE _ReduceSumAll(XTensor * source);
DTYPE _ReduceSumAll(const XTensor * source);
/* sum all the items of the tensor */
DTYPE ReduceSumAll(XTensor & source);
DTYPE ReduceSumAll(const XTensor & source);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -29,20 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compute the cross entropy loss */
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss */
DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight = NULL, XTensor * padding = NULL,
int leadingDim = -1);
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
#endif // __CROSSENTROPY_CUH__
#endif // __CROSSENTROPY_CUH__
\ No newline at end of file
......@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight = NULL, XTensor * padding = NULL,
int leadingDim = -1);
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -279,8 +279,8 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
......@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
return;
}
#endif
......
......@@ -22,6 +22,7 @@
#include "LogSoftmax.h"
#include "LogSoftmax.cuh"
#include "Loss.cuh"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
......@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
>> lossName - name of the loss function
*/
__global__
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size, LOSS_FUNCTION_NAME lossName)
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
int size, LOSS_FUNCTION_NAME lossName)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -371,10 +373,12 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
leadDim = leadDim < 0 ? y->order - 1 : leadDim;
CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
"Tensors used in log softmax are not on the same GPU.");
......@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
dimensionSize * stride, lossName);
}
}
if(padding != NULL) {
int n = leadDim;
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
}
else {
ShowNTErrors("TODO!");
......
......@@ -37,8 +37,8 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
/* de/dx (Cuda version) */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
#endif // USE_CUDA
......
......@@ -38,8 +38,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
for (int i = 0; i < blockNum; i++) {
for (int j = 0; j < stride; j++) {
for (int k = 0; k < tLen; k++) {
*(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = -(DTYPE)*(tp + i * stride * dimensionSize
+ j + stride * (tBeg + k)) / (DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
*(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) =
-(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) /
(DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
}
}
}
......
......@@ -174,8 +174,8 @@ See more details in LogSoftmaxBackward(...)
>> leadDim - leading dimension (along which we perform reduction)
*/
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
......@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
#ifdef USE_CUDA
if(y->devID >= 0){
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
return;
}
#endif
......@@ -297,9 +297,10 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
\beta = \sum_i (dE/dy_i * y_i)
*/
for(int k = 0; k < blockNum; k++){
op = (DTYPE*)y->data + k * blockSize;
sp = (DTYPE*)dedx->data + k * blockSize;
for(int m = 0; m < blockNum; m++){
yp = (DTYPE*)dedy->data + m * blockSize;
op = (DTYPE*)y->data + m * blockSize;
sp = (DTYPE*)dedx->data + m * blockSize;
int nCols = stride;
for(int k = 0; k < stride; k++){
......
......@@ -24,6 +24,7 @@
#include "Loss.cuh"
#include "../core/reduce/ReduceSum.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/shape/Unsqueeze.h"
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
......@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
*/
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
int n = leadDim < 0 ? y->order - 1 : leadDim;
CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID), "Matrices used in log softmax are not on the same GPU.");
CheckNTErrors((y->order >= 1), "Empty tensor!");
......@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
_Sum(y, gold, dedx, -1.0F);
if(padding != NULL) {
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
}
else if(lossName == ONEHOTERROR){
ShowNTErrors("TODO!");
......
......@@ -37,8 +37,8 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
/* de/dx (Cuda version) */
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
#endif // USE_CUDA
......
......@@ -35,8 +35,8 @@ XTensor Softmax(const XTensor &x, int leadDim);
/* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -169,8 +169,8 @@ bool TestDropout2()
_DropoutBackward(y, x, dedy, dedx, 1, dropProb);
/* check result */
y->Dump(stderr, "y");
dedx->Dump(stderr, "dedy");
//y->Dump(stderr, "y");
//dedx->Dump(stderr, "dedy");
#ifdef USE_CUDA
/* GPU test */
......@@ -193,8 +193,8 @@ bool TestDropout2()
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);
/* check result */
yGPU->Dump(stderr, "yGPU");
dedxGPU->Dump(stderr, "dedyGPU");
//yGPU->Dump(stderr, "yGPU");
//dedxGPU->Dump(stderr, "dedyGPU");
/* destroy variables */
delete x;
......
......@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
_LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
_LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
......@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
_LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
_LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
......@@ -66,7 +66,9 @@ bool TestPower1()
bUser = Power(*a, 2.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -88,7 +90,9 @@ bool TestPower1()
bUserGPU = Power(*aGPU, 2.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......@@ -153,7 +157,9 @@ bool TestPower2()
bUser = Power(*a, 1.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -175,7 +181,9 @@ bool TestPower2()
bUserGPU = Power(*aGPU, 1.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......@@ -214,7 +222,7 @@ bool TestPower3()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0F, 1.0F},
DTYPE aData[3][2] = { {1.0F, 1.0F},
{2.0F, 3.0F},
{4.0F, 5.0F} };
DTYPE answer[3][2] = { {1.0F, 1.0F},
......@@ -240,7 +248,9 @@ bool TestPower3()
bUser = Power(*a, 0.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -262,7 +272,9 @@ bool TestPower3()
bUserGPU = Power(*aGPU, 0.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......
......@@ -146,7 +146,7 @@ bool TestSoftmax2()
_Softmax(x, y, 1);
/* call SoftmaxBackward function */
_SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
_SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -174,7 +174,7 @@ bool TestSoftmax2()
_Softmax(xGPU, yGPU, 1);
/* call SoftmaxBackward function */
_SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
_SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
......@@ -20,8 +20,9 @@
*/
#include "TSumDim.h"
#include "../core/arithmetic/SumDim.h"
#include "../XTensor.h"
#include "../core/arithmetic/SumDim.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -251,6 +252,225 @@ bool TestSumDim2()
#endif // USE_CUDA
}
/*
case 3: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
*/
bool TestSumDim3()
{
/* a tensor of size (20, 40, 4000) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 20;
aDimSize[1] = 40;
aDimSize[2] = 4000;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (40) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 40;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(200, 40, 4000) + (40) = (200, 40, 4000), dim = 1.
*/
bool TestSumDim4()
{
/* a tensor of size (200, 40, 4000) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 1000000;
aDimSize[1] = 50;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (40) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 50;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -279,6 +499,24 @@ bool TestSumDim()
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSumDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
///* case 4 test */
//caseFlag = TestSumDim4();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 4 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论