Commit 03a9836e by xuchen

1. add some base functions 2.better implementation for t2t

parent 52c0e35a
@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
     else if(operID == FUNC_LOGSOFTMAX){
         int leadDim = income.GetParamInt(0);
         CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
     }
     else if(operID == FUNC_RECTIFY)
         _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
     else if(operID == FUNC_SOFTMAX){
         int leadDim = income.GetParamInt(0);
         CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else{
        ShowNTErrors("Wrong activation function type!");
...
@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
-                        XTensor * dedy, XTensor * dedx,
+                        XTensor * dedy, XTensor * dedx, XTensor * padding,
                         int funcID, void * params,
                         LOSS_FUNCTION_NAME lossName)
 {
@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
     }
     else if(funcID == FUNC_LOGSOFTMAX){
         int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
     }
     else if(funcID == FUNC_RECTIFY){
         _RectifyBackward(gold, y, x, dedy, dedx, lossName);
@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
         _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
     }else if(funcID == FUNC_SOFTMAX){
         int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
     }
     else{
         ShowNTErrors("wrong function found when calling the backward process!");
@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y,
-                        XTensor * dedy,
+                        XTensor * dedy, XTensor * padding,
                         LOSS_FUNCTION_NAME lossName)
 {
-    _LossBackward(dedy, gold, y, lossName);
+    //_LossBackward(dedy, gold, y, lossName);
+    if(lossName == CROSSENTROPY)
+        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
 }
 
 }
\ No newline at end of file
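Note: with the output layer now producing softmax probabilities (see the T2TOutput change below), the loss gradient is routed through _CrossEntropyBackward with an optional padding tensor; the NULL argument is presumably a per-class weight left unused here. A minimal sketch of the element-wise rule this computes, assuming softmax output y, one-hot gold g and a 0/1 padding map (the helper below is illustrative only, not the library routine):

    #include <cstddef>

    /* sketch only: for cross entropy E = -sum_j g_ij * log(y_ij), the gradient
       wrt. the softmax output is dE/dy_ij = -g_ij / y_ij; rows whose padding
       entry is 0 are masked out and contribute no gradient */
    void CrossEntropyBackwardSketch(float * dedy, const float * y, const float * gold,
                                    const float * padding, size_t rowNum, size_t colNum)
    {
        for(size_t i = 0; i < rowNum; i++){
            float pad = (padding != NULL) ? padding[i] : 1.0F;
            for(size_t j = 0; j < colNum; j++){
                size_t k = i * colNum + j;
                dedy[k] = (pad == 0.0F) ? 0.0F : -gold[k] / y[k];
            }
        }
    }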
@@ -36,13 +36,13 @@ class XLossGrad
 public:
     /* compute dE/dx for a given function y = f(x) */
     void Compute(XTensor * gold, XTensor * y, XTensor * x,
-                 XTensor * dedy, XTensor * dedx,
+                 XTensor * dedy, XTensor * dedx, XTensor * padding,
                  int funcID, void * params,
                  LOSS_FUNCTION_NAME lossName);

     /* compute dE/dy for variable y and error(loss) function E */
     void Compute(XTensor * gold, XTensor * y,
-                 XTensor * dedy,
+                 XTensor * dedy, XTensor * padding,
                  LOSS_FUNCTION_NAME lossName);
 };
...
@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
     DelTensorBuf(b);

     node->visitMark = NODE_FINISHED;
-
-    delete b;
 }

 /*
...
@@ -55,7 +55,7 @@ void XNetClearAll()
 XNet::XNet()
 {
     nodes.Clear();
-    isGradEfficient = true;
+    isGradEfficient = false;
 }

 /* de-constructor */
@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
     XList golds(1);
     golds.Add(&gold);

-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
+}
+
+/*
+backward propagation to obtain gradient wrt. the loss/error function
+>> root - root node (output) of the network
+>> gold - gold standard for the output
+>> padding - specify a target value that is ignored and does not contribute to the loss computation
+>> loss - name of loss function
+*/
+void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
+{
+    XList roots(1);
+    roots.Add(&root);
+
+    XList golds(1);
+    golds.Add(&gold);
+
+    XList paddings(1);
+    paddings.Add(&padding);
+
+    Backward(roots, golds, paddings, loss);
 }

 /*
@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
     XList golds(1);
     golds.Add(NULL);

-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
 }

 /*
@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes
 >> root - a list of root nodes (output) of the network
 >> gold - a list of gold standard for the output
+>> padding - specify a target value that is ignored
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
 {
     Traverse(roots);
@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
     for(int i = 0; i < roots.count; i++){
         XTensor * root = (XTensor*)roots.Get(i);
         XTensor * gold = (XTensor*)golds.Get(i);
+        XTensor * padding = (XTensor*)paddings.Get(i);
         XLink &income = root->income;
         int funcID = income.typeID;
         void * params = income.params;
@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
            Note that we do not need to obtain dE/dy here because it is of no use in the
            following process of back-propagation */
         if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
-            XTensor * x = income.tails[0];
-            XNoder::MakeGrad(x);
-            lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
-            root->visitMark = NODE_FINISHED;
+            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
+                XTensor * x = income.tails[0];
+                XNoder::MakeGrad(x);
+                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
+                root->visitMark = NODE_FINISHED;
+            }
+            else {
+                XNoder::MakeGrad(root);
+                lossGrad.Compute(gold, root, root->grad, padding, loss);
+            }
         }
         /* we compute dE/dy (y is the output) if no predefined activation function is used */
         else{
             XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, loss);
+            lossGrad.Compute(gold, root, root->grad, NULL, loss);
         }
     }
@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
 /*
 backward propagation to obtain gradient
 with a number of root nodes
->> root - a list of root nodes (output) of the network
+>> roots - a list of root nodes (output) of the network
 >> loss - name of loss function
 */
 void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
 {
     XList golds(roots.count);
-    for(int i = 0; i < roots.count; i++)
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++) {
         golds.Add(NULL);
+        paddings.Add(NULL);
+    }

-    Backward(roots, golds, loss);
+    Backward(roots, golds, paddings, loss);
+}
+
+/*
+backward propagation to obtain gradient
+with a number of root nodes
+>> roots - a list of root nodes (output) of the network
+>> golds - a list of gold standard for the output
+>> loss - name of loss function
+*/
+void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+{
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++)
+        paddings.Add(NULL);
+
+    Backward(roots, golds, paddings, loss);
 }

 /*
...
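Note: the new overloads thread a padding tensor from the training loop all the way into the loss gradient. A hypothetical usage sketch of the single-root overload (names are illustrative; the real call site is in T2TTrainer::Train further below):

    #include "XNet.h"    /* assuming the NiuTensor network headers are on the include path */

    using namespace nts;

    /* run one padding-aware backward pass: positions where padding is 0 are
       ignored by the cross-entropy gradient, so padded target tokens do not
       contribute to any parameter update */
    void BackwardWithPadding(XTensor &output, XTensor &gold, XTensor &padding)
    {
        XNet net;
        net.Backward(output, gold, padding, CROSSENTROPY);
    }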
@@ -62,17 +62,24 @@ struct XNet
     /* backward propagation to obtain gradient wrt. the loss/error function */
     void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);

+    /* backward propagation to obtain gradient wrt. the loss/error function */
+    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
+
     /* backward propagation to obtain gradient */
     void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);

     /* backward propagation to obtain gradient wrt. the loss/error function
        with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);

     /* backward propagation to obtain gradient
        with a number of root nodes */
     void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);

+    /* backward propagation to obtain gradient
+       with a number of root nodes */
+    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+
     /* backward computation for a given node */
     void BackwardNode(XTensor * node, bool isEfficent = false);
...
@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
         if(isEnd)
             break;
+
+        Test(testFN, outputFN, model);
     }

     double elapsed = GetClockSec() - startT;
@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
         /* for y = softmax(s), we get dE/ds
            where E is the error function (defined by loss) */
-        _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
+        _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);

         /* for s = x * w, we get
            dE/dw_{i,j} = dE/ds_j * ds/dw_{i,j}
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
*/
#include <math.h>
#include "T2TDecoder.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
AttDecoder::AttDecoder()
{
attentionsEnde = NULL;
attEndeLayerNorms = NULL;
}
/* de-constructor */
AttDecoder::~AttDecoder()
{
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec);
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
XTensor ende;
XTensor ln;
XTensor fnn;
XTensor res;
XTensor nothing;
/******************/
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization */
x = attLayerNorms[i].Make(res);
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
res = Sum(ende, x);
/* layer normalization */
x = attEndeLayerNorms[i].Make(res);
/*******/
/* fnn */
fnn = fnns[i].Make(x, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
return x;
}
}
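Note: a compact sketch of the per-layer dataflow in AttDecoder::Make above, with dropout, residual connections and layer normalization left out. It assumes T2TAttention::Make takes (key, query, value, mask, isTraining), which matches the self-attention call attentions[i].Make(x, x, x, mask, isTraining); under that assumption, encoder-decoder attention takes its keys and values from the encoder output and its queries from the decoder state.

    #include "T2TAttention.h"    /* assuming the transformer headers are available */

    using namespace transformer;

    /* sketch only: the two attention sublayers of one decoder layer */
    XTensor DecoderLayerSketch(T2TAttention &selfAtt, T2TAttention &endeAtt,
                               XTensor &x, XTensor &outputEnc,
                               XTensor &maskDec, XTensor &noMask, bool isTraining)
    {
        /* decoder self-attention: keys, queries and values all come from x */
        XTensor h = selfAtt.Make(x, x, x, maskDec, isTraining);

        /* encoder-decoder attention: keys/values from the encoder output,
           queries from the decoder state */
        return endeAtt.Make(outputEnc, h, outputEnc, noMask, isTraining);
    }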
@@ -22,19 +22,33 @@
 #ifndef __T2TDECODER_H__
 #define __T2TDECODER_H__

+#include "T2TEncoder.h"
+
 namespace transformer
 {

-class T2TDecoder
-{
-};
-
-class AttDecoder : T2TDecoder
+class AttDecoder : public AttEncoder
 {
+public:
+    /* encoder-decoder attention model of each layer */
+    T2TAttention * attentionsEnde;
+
+    /* layer normalization for encoder-decoder attention */
+    T2TLN * attEndeLayerNorms;
+
 public:
+    /* constructor */
+    AttDecoder();
+
+    /* deconstructor */
+    ~AttDecoder();
+
     /* initialize the model */
-    void InitModel(int argc, char ** argv);
+    void InitModel(int argc, char ** argv,
+                   bool myIsMasked, int myIgnored,
+                   int myDevID = -1, XMem * myMem = NULL);
+
+    /* make the decoding network */
+    XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
 };

 }
...
@@ -61,16 +61,17 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);

     DTYPE v = 1.0F/(float)sqrt((float)eSize);
-    w.SetDataRand(-v, v);
+    w.SetDataRandn(0, v);

     /* create the positional embedding matrix */
     MakePosEmbedding(eSize, d, maxLength);
 }

 /*
-make positional embeddings (of size eSize * length
-eSize - embedding size
-length - length of the sequenc
+make positional embeddings (of size eSize * length)
+>> eSize - embedding size
+>> d - dimension size of the hidden layers
+>> length - length of the sequence
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
@@ -114,15 +115,15 @@ make the network
 */
 XTensor T2TEmbedder::Make(XTensor &input)
 {
-    CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
+    //CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
     CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-    CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
+    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
     CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
     CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

     int dims[MAX_TENSOR_DIM_NUM];
     memcpy(dims, input.dimSize, input.order * sizeof(int));
-    dims[input.order - 1] = eSize;
+    dims[input.order] = eSize;

     XTensor wordEmbedding;
     XTensor posEmbedding;
@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
     /* we make positional embeddings first */
     //if(!match){
     if(true){
-        InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
+        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
+
         XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);

         _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
     }

     /* then we make word embeddings */
-    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    //wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    wordEmbedding = Gather(w, input);
+    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

     /* we sum over the two embeddings */
     return wordEmbedding + posEmbedding;
...
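Note: the embedding lookup switches from multiplying one-hot inputs into w (MMul) to an index-based row lookup (Gather) over integer word ids, which is why the vocabulary-size check on the input is commented out above. The two are mathematically equivalent; a standalone sketch of the row-gather:

    #include <cstddef>

    /* sketch only: multiplying a one-hot row vector by the vSize x eSize
       embedding matrix w selects a single row of w, so gathering rows by
       integer word id gives the same result without building one-hot inputs */
    void GatherRowsSketch(const float * w, const int * ids, float * out,
                          size_t idNum, size_t eSize)
    {
        for(size_t i = 0; i < idNum; i++)
            for(size_t j = 0; j < eSize; j++)
                out[i * eSize + j] = w[(size_t)ids[i] * eSize + j];
    }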
@@ -31,6 +31,10 @@ namespace transformer

 /* constructor */
 AttEncoder::AttEncoder()
 {
+    attentions = NULL;
+    fnns = NULL;
+    attLayerNorms = NULL;
+    fnnLayerNorms = NULL;
 }

 /* de-constructor */
...
@@ -59,10 +59,7 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     InitTensor1D(&w, d, X_FLOAT, devID, mem);
     InitTensor1D(&b, d, X_FLOAT, devID, mem);

-    float scale = 1.0F;
-    float finfout = (float)sqrt(6.0F * scale / d);
-    w.SetDataRand(-finfout, finfout);
+    w.SetDataRand(1.0F, 1.0F);
     b.SetZeroAll();
 }
...
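Note: SetDataRand with equal lower and upper bounds fills the layer-norm gain with ones, replacing the Xavier-style random initialization; together with the zero bias, each T2TLN then starts out as a plain standardization of its input. In the usual layer-normalization form, with mean and standard deviation taken over the feature dimension:

    y = g \odot \frac{x - \mu}{\sigma} + b, \qquad g \leftarrow \mathbf{1}, \quad b \leftarrow \mathbf{0}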
@@ -57,8 +57,8 @@ void T2TModel::InitModel(int argc, char ** argv)
     LoadParamInt(argc, argv, "dev", &devID, -1);
     LoadParamBool(argc, argv, "mem", &useMem, useMem);
     LoadParamInt(argc, argv, "memsize", &memSize, 1024);
-    LoadParamBool(argc, argv, "lm", &isLM, true);
     LoadParamBool(argc, argv, "mt", &isMT, false);
+    LoadParamBool(argc, argv, "lm", &isLM, !isMT);
     LoadParamInt(argc, argv, "nhead", &nhead, 8);
     LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
@@ -71,6 +71,9 @@ void T2TModel::InitModel(int argc, char ** argv)
     encoder.InitModel(argc, argv, isLM, 0, devID, mem);
     outputLayer.InitModel(argc, argv, devID, mem);

+    if(isMT)
+        decoder.InitModel(argc, argv, true, 0, devID, mem);
+
     XList params(10);
     GetParams(params);
@@ -87,74 +90,161 @@ make the encoding network
 >> isTraining - indicates whether we are training the model
 << return - encoding result
 */
-XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool isTraining)
+XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
 {
     return encoder.Make(input, mask, isTraining);
 }

 /*
-make the entire network (with the output softmax layer)
+make the decoding network
+>> inputDec - the input tensor of the decoder
+>> outputEnc - the output tensor of the encoder
+>> mask - the mask for positions that are/are not involved in computation
+>> isTraining - indicates whether we are training the model
+<< return - decoding result
+*/
+XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
+{
+    return decoder.Make(inputDec, outputEnc, mask, isTraining);
+}
+
+/*
+make the network for language modeling (with the output softmax layer)
 >> input - input tensor
 >> output - output tensor (distribution)
 >> padding - padding of the sequences
 >> isTraining - indicates whether the model is for training
 */
-void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
+void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
 {
     XTensor encoding;

-    if(isLM){
-        /* generate mask to see "previous" words only */
-        int len = input.GetDim(input.order - 2);
-        int * dims = new int[input.order + 1];
-        for(int i = 0; i < input.order; i++)
-            dims[i + 1] = input.GetDim(i);
-        dims[0] = nhead;
-        dims[input.order] = len;
-        XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
-
-        /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
-           this matrix can be used to prevent the attention to current or following words in
-           a given sequence. */
-        _SetDataLowTri(&mask, 1e9F, 0);
-        _ScaleAndShiftMe(&mask, 1.0F, -1e9F);
-
-        int * dimsPadding = new int[padding.order + 2];
-        for(int i = 0; i < padding.order - 1; i++)
-            dimsPadding[i] = padding.GetDim(i);
-        dimsPadding[padding.order - 1] = padding.GetDim(-1);
-        dimsPadding[padding.order] = padding.GetDim(-1);
-
-        XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
-                                          padding.denseRatio, padding.devID, padding.mem);
-
-        for(int i = 0; i < padding2->order; i++)
-            dimsPadding[i + 1] = padding2->GetDim(i);
-        dimsPadding[0] = nhead;
-
-        XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
-                                          padding.denseRatio, padding.devID, padding.mem);
-
-        /* mask of the padding */
-        _Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
-        _Unsqueeze(padding2, padding3, 0, nhead);
-
-        _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-
-        _Sum(&mask, padding3, &mask);
-
-        encoding = MakeEncoding(input, mask, isTraining);
-        outputLayer.Make(encoding, output);
-
-        delete[] dims;
-        delete[] dimsPadding;
-
-        DelTensorBuf(padding2);
-        DelTensorBuf(padding3);
-    }
-    else{
-        ShowNTErrors("TODO!");
-    }
+    /* generate mask to see "previous" words only */
+    //int len = input.GetDim(input.order - 2);
+    //int * dims = new int[input.order + 1];
+    //for(int i = 0; i < input.order; i++)
+    //    dims[i + 1] = input.GetDim(i);
+    //dims[0] = nhead;
+    //dims[input.order] = len;
+    //XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
+
+    int len = input.GetDim(input.order - 1);
+    int * dims = new int[input.order + 2];
+    for(int i = 0; i < input.order; i++)
+        dims[i + 1] = input.GetDim(i);
+    dims[0] = nhead;
+    dims[input.order + 1] = len;
+    XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
+
+    /* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
+       this matrix can be used to prevent the attention to current or following words in
+       a given sequence. */
+    _SetDataLowTri(&mask, 1e9F, 0);
+    _ScaleAndShiftMe(&mask, 1.0F, -1e9F);
+
+    int * dimsPadding = new int[padding.order + 2];
+    for(int i = 0; i < padding.order - 1; i++)
+        dimsPadding[i] = padding.GetDim(i);
+    dimsPadding[padding.order - 1] = padding.GetDim(-1);
+    dimsPadding[padding.order] = padding.GetDim(-1);
+
+    XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
+                                      padding.denseRatio, padding.devID, padding.mem);
+
+    for(int i = 0; i < padding2->order; i++)
+        dimsPadding[i + 1] = padding2->GetDim(i);
+    dimsPadding[0] = nhead;
+
+    //XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
+    //                                  padding.denseRatio, padding.devID, padding.mem);
+    //
+    ///* mask of the padding */
+    //_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
+    //_Unsqueeze(padding2, padding3, 0, nhead);
+    //
+    //_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+    //
+    ////_Sum(&mask, padding3, &mask);
+
+    encoding = MakeEncoder(input, mask, isTraining);
+    outputLayer.Make(encoding, output);
+
+    delete[] dims;
+    delete[] dimsPadding;
+
+    //DelTensorBuf(padding3);
+    DelTensorBuf(padding2);
+}
+
+/*
+make the network for machine translation (with the output softmax layer)
+>> inputEnc - input tensor of the encoder
+>> inputDec - input tensor of the decoder
+>> output - output tensor (distribution)
+>> paddingEnc - padding of the sequences (on the encoder side)
+>> isTraining - indicates whether the model is for training
+*/
+void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining)
+{
+    XTensor encoding;
+    XTensor decoding;
+    XTensor maskEnc;
+    XTensor maskDec;
+
+    /* generate mask to see "previous" words on the decoder side */
+    int len = inputDec.GetDim(inputDec.order - 2);
+    int * dims = new int[inputDec.order + 1];
+    for(int i = 0; i < inputDec.order; i++)
+        dims[i + 1] = inputDec.GetDim(i);
+    dims[0] = nhead;
+    dims[inputDec.order] = len;
+    InitTensor(&maskDec, inputDec.order + 1, dims, X_FLOAT, 1.0F, inputDec.devID, inputDec.mem);
+
+    /* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
+       this matrix can be used to prevent the attention to current or following words in
+       a given sequence. */
+    _SetDataLowTri(&maskDec, 1e9F, 0);
+    _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
+
+    /* padding on the source side */
+    int * dimsPadding = new int[paddingEnc.order + 2];
+    for (int i = 0; i < paddingEnc.order - 1; i++)
+        dimsPadding[i] = paddingEnc.GetDim(i);
+    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
+    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
+
+    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
+                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+
+    for (int i = 0; i < padding2->order; i++)
+        dimsPadding[i + 1] = padding2->GetDim(i);
+    dimsPadding[0] = nhead;
+
+    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
+                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+
+    /* mask of the padding */
+    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
+    _Unsqueeze(padding2, padding3, 0, nhead);
+
+    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+
+    InitTensor(&maskEnc, padding3);
+    maskEnc.SetZeroAll();
+
+    /* generate the mask on the source language side (for padding) */
+    _Sum(&maskEnc, padding3, &maskEnc);
+
+    encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
+    decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
+    outputLayer.Make(decoding, output);
+
+    delete[] dims;
+    delete[] dimsPadding;
+
+    DelTensorBuf(padding3);
+    DelTensorBuf(padding2);
 }
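Note: the two-step mask construction used in both functions is worth unpacking. _SetDataLowTri(&mask, 1e9F, 0) writes 1e9 on and below the main diagonal and 0 above it; _ScaleAndShiftMe(&mask, 1.0F, -1e9F) then subtracts 1e9 everywhere, leaving 0 for visible (current and previous) positions and -1e9 for future ones. Added to the attention logits before softmax, the -1e9 entries receive effectively zero attention weight. A standalone sketch that builds the same pattern:

    #include <cstdio>

    /* build a 4 x 4 causal attention mask the same way: 0 on and below the
       diagonal (current and previous positions), -1e9 above it (future ones) */
    int main()
    {
        const int len = 4;
        float mask[4][4];

        for(int i = 0; i < len; i++)
            for(int j = 0; j < len; j++)
                mask[i][j] = (j <= i ? 1e9F : 0.0F) - 1e9F;

        for(int i = 0; i < len; i++){
            for(int j = 0; j < len; j++)
                printf("%10.1e ", mask[i][j]);
            printf("\n");
        }
        return 0;
    }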
/* /*
...@@ -180,8 +270,33 @@ void T2TModel::GetParams(XList &list) ...@@ -180,8 +270,33 @@ void T2TModel::GetParams(XList &list)
list.Add(&encoder.attLayerNorms[i].w); list.Add(&encoder.attLayerNorms[i].w);
list.Add(&encoder.attLayerNorms[i].b); list.Add(&encoder.attLayerNorms[i].b);
} }
list.Add(&encoder.embedder.w); list.Add(&encoder.embedder.w);
if(isMT){
for(int i = 0; i < decoder.nlayer; i++){
list.Add(&decoder.fnns[i].w1);
list.Add(&decoder.fnns[i].b1);
list.Add(&decoder.fnns[i].w2);
list.Add(&decoder.fnns[i].b2);
list.Add(&decoder.attentionsEnde[i].wk);
list.Add(&decoder.attentionsEnde[i].wq);
list.Add(&decoder.attentionsEnde[i].wv);
list.Add(&decoder.attentionsEnde[i].wa);
list.Add(&decoder.attEndeLayerNorms[i].w);
list.Add(&decoder.attEndeLayerNorms[i].b);
list.Add(&decoder.attentions[i].wk);
list.Add(&decoder.attentions[i].wq);
list.Add(&decoder.attentions[i].wv);
list.Add(&decoder.attentions[i].wa);
list.Add(&decoder.fnnLayerNorms[i].w);
list.Add(&decoder.fnnLayerNorms[i].b);
list.Add(&decoder.attLayerNorms[i].w);
list.Add(&decoder.attLayerNorms[i].b);
}
list.Add(&decoder.embedder.w);
}
} }
/* /*
......
@@ -69,10 +69,16 @@ public:
     void InitModel(int argc, char ** argv);

     /* make the encoding network */
-    XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
+    XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);

-    /* make the entire network (with the output softmax layer) */
-    void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+    /* make the decoding network */
+    XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining);
+
+    /* make the network for language modeling (with the output softmax layer) */
+    void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+
+    /* make the network for machine translation (with the output softmax layer) */
+    void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining);

     /* get parameter matrices */
     void GetParams(XList &list);
...
@@ -66,6 +66,9 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     float scale = 1.0F;
     float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
     w.SetDataRand(-finfout, finfout);
+
+    DTYPE v = 1.0F/(float)sqrt((float)hSize);
+    w.SetDataRandn(0, v);
 }

 /*
@@ -90,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
 {
     XTensor &x = input;

-    output = LogSoftmax(MMul(x, w), -1);
+    //output = LogSoftmax(MMul(x, w), -1);
+    output = Softmax(MMul(x, w), -1);
 }

 }
@@ -101,6 +101,7 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamInt(argc, argv, "d", &d, 512);
     LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
     LoadParamInt(argc, argv, "vsize", &vSize, 1);
+    LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
     LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
     LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
     LoadParamBool(argc, argv, "adam", &useAdam, false);
@@ -113,6 +114,7 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
     LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
     LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
+    LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, false);

     buf = new int[bufSize];
     buf2 = new int[bufSize];
@@ -122,6 +124,9 @@ void T2TTrainer::Init(int argc, char ** argv)
     adamBeta1T = 1.0F;
     adamBeta2T = 1.0F;
+
+    validStep = 0;
+    curEpoch = 0;
 }

 int tc = 0;
@@ -133,9 +138,10 @@ train the model
 >> modelFN - where we keep the model
 >> model - model to train
 */
-void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
+bool T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
 {
-    int epoch = 0;
+    curEpoch += 1;
+
     int step = 0;
     int wc = 0;
     int wordCount = 0;
@@ -147,7 +153,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
     int nCheckpoint = 0;
     int nSkipped = 0;
     int gradStep = 0;
-    int validStep = 0;
+    //int validStep = 0;

     char * trainFN = new char[(int)strlen(fn) + 10];
     strcpy(trainFN, fn);
@@ -157,18 +163,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
     sprintf(trainFN, "%s.random", fn);
 #endif

+    PrepareModel(model);
+
     int devID = model->devID;
     XMem * mem = model->mem;
     XNet net;

-    PrepareModel(model);
-
     double startT = GetClockSec();

-    for(epoch = 1; epoch <= nepoch; epoch++){
+    //for(epoch = 1; epoch <= nepoch; epoch++){
 #ifndef WIN32
     if(isShuffled)
         Shuffle(fn, trainFN);
 #endif

     FILE * file = fopen(trainFN, "rb");
@@ -177,11 +183,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
     wordCount = 0;
     loss = 0;

-    /* batch of input sequences */
-    XTensor batch;
+    /* batch of sequences (on the encoder and decoder sides) */
+    XTensor batchEnc;
+    XTensor batchDec;

     /* padding */
-    XTensor padding;
+    XTensor paddingEnc;
+    XTensor paddingDec;

     /* gold standard */
     XTensor gold;
@@ -189,26 +197,40 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
     /* label smoothed gold standard (if needed) */
     XTensor goldSmoothed;

-    while (LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)) {
+    while (LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold,
+                     NULL, vSize, vSizeTgt,
+                     sBatchSize, wBatchSize, isLenSorted, wc, devID, mem, true))
+    {

-        CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
+        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
+        //CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");

         /* output probabilities */
         XTensor output;

         /* make the network */
-        model->Make(batch, output, padding, true);
+        if(model->isLM)
+            model->MakeLM(batchEnc, output, paddingEnc, true);
+        else if(model->isMT)
+            model->MakeMT(batchEnc, batchDec, output, paddingEnc, true);
+        else{
+            ShowNTErrors("Illegal model type!");
+        }

         /* back-propagation for obtaining gradients */
         if (labelSmoothingP > 0)
             LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);

         /* make paddings for the output */
-        if (output.GetDim(0) > 1)
-            PadOutput(&output, &gold, &padding);
+        //if (output.GetDim(0) > 1)
+        //    PadOutput(&output, &gold, &paddingDec);
+
+        //output.Dump(tmpFILE, "output: ");
+        //fflush(tmpFILE);

         /* get probabilities */
         float prob = GetProb(&output, &gold, NULL);

         DTYPE lossLocal = -prob / wc;
         bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
@@ -217,18 +239,11 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
         if (doUpdate) {

             /* rescale the output for normalized loss */
-            RescaleOutput(&output, &g, &padding);
+            //RescaleOutput(&output, &g, &paddingDec);

             /* back-propagation */
-            net.Backward(output, g, CROSSENTROPY);
-
-            /*for(int i = 0; i < net.nodes.count; i++){
-                XTensor * node = (XTensor*)net.nodes.Get(i);
-                XLink::ShowNode(stderr, node);
-            }
-
-            exit(0);*/
+            net.Backward(output, g, paddingDec, CROSSENTROPY);

             gradStep += 1;
             loss += -prob;
             wordCount += wc;
@@ -255,10 +270,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
             break;
         }

-        if (step % 1 == 0) {
+        if (step % 100 == 0) {
             double elapsed = GetClockSec() - startT;
             XPRINT8(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
-                    lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
+                    lr, elapsed, step, curEpoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
             if (!doUpdate)
                 XPRINT(0, stderr, " (no update)");
             XPRINT(0, stderr, "\n");
@@ -274,20 +289,20 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
     fclose(file);

     if (isEnd)
-        break;
+        return false;
+
+    return true;

-    if(useEpochCheckpoint)
-        MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
-    }
+    //if(useEpochCheckpoint)
+    //    MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
+    //}

-    double elapsed = GetClockSec() - startT;
-
-    epoch = MIN(epoch, nepoch);
-
-    XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
-            lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
-    XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
-            elapsed, step, nSkipped, epoch);
+    //double elapsed = GetClockSec() - startT;
+    //
+    //epoch = MIN(epoch, nepoch);
+    //
+    //XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
+    //        lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
+    //XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
+    //        elapsed, step, nSkipped, epoch);

     delete[] trainFN;
 }
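Note: Train no longer loops over epochs itself; it processes one pass over the data, returns false once the step limit is hit, and tracks curEpoch for logging. Presumably the caller now drives the epoch loop, along the lines of this hypothetical sketch:

    #include "T2TTrainer.h"    /* assuming the transformer headers are available */

    using namespace transformer;

    /* drive the reworked one-epoch Train: stop early when it returns false */
    void TrainLoopSketch(T2TTrainer &trainer, T2TModel &model, int nepoch,
                         const char * trainFN, const char * validFN, const char * modelFN)
    {
        for(int epoch = 1; epoch <= nepoch; epoch++)
            if(!trainer.Train(trainFN, validFN, modelFN, &model))
                break;
    }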
@@ -322,10 +337,12 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     wordCount = 0;

     /* batch of input sequences */
-    XTensor batch;
+    XTensor batchEnc;
+    XTensor batchDec;

     /* padding */
-    XTensor padding;
+    XTensor paddingEnc;
+    XTensor paddingDec;

     /* gold standard */
     XTensor gold;
@@ -335,18 +352,28 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     ClearBuf();

-    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, false, wc, devID, mem)){
+    while(LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold,
+                    seqs, vSize, vSizeTgt,
+                    1, 1, false, wc, devID, mem, false))
+    {

-        CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
+        //CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
+        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");

         /* output probabilities */
         XTensor output;

         /* make the network */
-        model->Make(batch, output, padding, false);
+        if(model->isLM)
+            model->MakeLM(batchEnc, output, paddingEnc, false);
+        else if(model->isMT)
+            model->MakeMT(batchEnc, batchDec, output, paddingEnc, false);
+        else{
+            ShowNTErrors("Illegal model type!");
+        }

-        int bSize = batch.GetDim(0);
-        int length = batch.GetDim(1);
+        int bSize = output.GetDim(0);
+        int length = output.GetDim(1);

         /* prediction probabilities */
         XTensor probs;
@@ -391,7 +418,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     delete[] seqs;

     double elapsed = GetClockSec() - startT;
     XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
-            elapsed,wordCountTotal, exp(loss / wordCount));
+            elapsed, wordCountTotal, exp(loss / wordCount));
 }
@@ -511,6 +538,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
     /* sort the sequences by length */
     if (isSorted) {
+        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
         SampleNode * nodes = new SampleNode[seqCount];
         int count = 0;
         int offset = 0;
@@ -526,19 +554,18 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
             offset += node.size;
         }

-        qsort(nodes, seqCount, sizeof(SampleNode), CompareSampleNode);
+        qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);

         count = 0;
         offset = 0;
-        for(int i = 0; i < seqCount; i++){
+        for(int i = 0; i < seqCount; i += step){
             SampleNode &node = nodes[count];
+            //fprintf(stderr, "%d %d %d\n", node.size, node.id, node.value);
             memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
             for(int j = 0; j < step; j++){
-                seqLen2[count + j] = seqLen[node.id + j];
-                seqOffset[count + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
+                seqLen2[i + j] = seqLen[node.id + j];
+                seqOffset[i + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
             }
-            count += step;
+            count += 1;
             offset += node.size;
         }
@@ -546,6 +573,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
         buf = buf2;
         buf2 = tmp;
+
         tmp = seqLen;
         seqLen = seqLen2;
         seqLen2 = tmp;
@@ -562,32 +590,79 @@ void T2TTrainer::ClearBuf()
     nextSeq = -1;
 }

 /*
 load a batch of sequences
 >> file - the handle to the data file
 >> isLM - indicates whether the data is used for training lms
->> batch - the batch of the input sequences
->> padding - padding of the input sequences
->> output - the batch of the output sequences
+>> batchEnc - the batch of the input sequences
+>> paddingEnc - padding of the input sequences
+>> batchDec - the batch of the output sequences
+>> paddingDec - padding of the output sequences
+>> gold - gold standard
 >> seqs - keep the sequences in an array
->> step - the step we go over when move to the next sequence
->> vs - vocabulary size
+>> vsEnc - size of the encoder vocabulary
+>> vsDec - size of the decoder vocabulary
 >> sBatch - batch size of sequences
 >> wBatch - batch size of words
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
 >> mem - memory pool
+>> isTraining - indicates whether we are training the model
 */
 int T2TTrainer::LoadBatch(FILE * file, bool isLM,
-                          XTensor * batch, XTensor * padding, XTensor * output,
+                          XTensor * batchEnc, XTensor * paddingEnc,
+                          XTensor * batchDec, XTensor * paddingDec,
+                          XTensor * gold,
                           int * seqs,
-                          int step, int vs, int sBatch, int wBatch,
+                          int vsEnc, int vsDec, int sBatch, int wBatch,
                           bool isSorted, int &wCount,
-                          int devID, XMem * mem)
+                          int devID, XMem * mem,
+                          bool isTraining)
+{
+    if(isLM){
+        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold,
+                           seqs, vsEnc, sBatch, wBatch,
+                           isSorted, wCount, devID, mem, isTraining);
+    }
+    else{
+        return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold,
+                           seqs, vsEnc, vsDec, sBatch, wBatch,
+                           isSorted, wCount, devID, mem, isTraining);
+    }
+}
+
+/*
+load a batch of sequences (for LM)
+>> file - the handle to the data file
+>> batchEnc - the batch of the input sequences
+>> paddingEnc - padding of the input sequences
+>> batchDec - the batch of the output sequences
+>> paddingDec - padding of the output sequences
+>> gold - gold standard
+>> seqs - keep the sequences in an array
+>> vs - vocabulary size
+>> sBatch - batch size of sequences
+>> wBatch - batch size of words
+>> isSorted - indicates whether the sequences are sorted by length
+>> wCount - word count
+>> devID - device id
+>> mem - memory pool
+>> isTraining - indicates whether we are training the model
+*/
+int T2TTrainer::LoadBatchLM(FILE * file,
+                            XTensor * batchEnc, XTensor * paddingEnc,
+                            XTensor * batchDec, XTensor * paddingDec,
+                            XTensor * gold,
+                            int * seqs,
+                            int vs, int sBatch, int wBatch,
+                            bool isSorted, int &wCount,
+                            int devID, XMem * mem,
+                            bool isTraining)
 {
     if(nextSeq < 0 || nextSeq >= nseqBuf)
-        LoadBuf(file, isSorted, step);
+        LoadBuf(file, isSorted, 1);

     int seq = MAX(nextSeq, 0);
     int wc = 0;
@@ -604,7 +679,8 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
         if(max < wn)
             max = wn;

-        if(sc >= sBatch && wc >= wBatch)
+        int tc = isSmallBatch ? max * sc : wc;
+        if(sc >= sBatch && tc >= wBatch)
             break;
     }
@@ -614,74 +690,205 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
     if(sc <= 0)
         return 0;

-    if(isLM){
-        int dims[MAX_TENSOR_DIM_NUM];
-        dims[0] = sc;
-        dims[1] = max;
-        dims[2] = vs;
-
-        InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
-        InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
-        InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
-
-        if(batch->grad == NULL)
-            XNoder::MakeGrad(batch);
-        else
-            InitTensor(batch->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
-
-        if(padding->grad == NULL)
-            XNoder::MakeGrad(padding);
-        else
-            InitTensor2D(padding->grad, sc, max, X_FLOAT, devID, mem);
-
-        if(output->grad == NULL)
-            XNoder::MakeGrad(output);
-        else
-            InitTensor(output->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
-
-        batch->SetZeroAll();
-        padding->SetZeroAll();
-        output->SetZeroAll();
-
-        batch->grad->SetZeroAll();
-        padding->grad->SetZeroAll();
-        output->grad->SetZeroAll();
-
-        int seqSize = 0;
-
-        //fprintf(tf, "batch %d(%d)\n", tc++, sc);
-
-        /* this might be slow on GPUs :( */
-        for(int s = seq; s < seq + sc; s++){
-            int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
-            CheckNTErrors(len <= max, "Something is wrong!");
-            for(int w = 0; w < len; w++){
-                batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
-                padding->Set2D(1.0F, s - seq, w);
-                if(w > 0)
-                    output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
-                if(w == len - 1){
-                    if(isDoubledEnd)
-                        output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
-                    else
-                        output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
-                }
-                wCount++;
-                /*fprintf(tf, "%d", buf[seqOffset[s] + w]);
-                if(w < seqLen[s] - 1)
-                    fprintf(tf, " ");
-                else
-                    fprintf(tf, "\n");*/
-                if(seqs != NULL)
-                    seqs[seqSize++] = buf[seqOffset[s] + w];
-            }
-            if(seqs != NULL){
-                for(int w = len; w < max; w++)
-                    seqs[seqSize++] = -1;
-            }
-        }
-
-        fflush(tf);
-    }
+    int dims[MAX_TENSOR_DIM_NUM];
+    dims[0] = sc;
+    dims[1] = max;
+    dims[2] = vs;
+
+    InitTensor(batchEnc, 2, dims, X_INT, 1.0F, -1);
+    //InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
+    InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
+
+    batchEnc->SetZeroAll();
+    paddingEnc->SetZeroAll();
+    gold->SetZeroAll();
+    paddingDec->SetZeroAll();
+
+    if(isTraining) {
+        //XNoder::MakeGrad(batchEnc);
+        XNoder::MakeGrad(paddingEnc);
+        XNoder::MakeGrad(gold);
+        XNoder::MakeGrad(paddingDec);
+        //batchEnc->grad->SetZeroAll();
+        paddingEnc->grad->SetZeroAll();
+        gold->grad->SetZeroAll();
+        paddingDec->grad->SetZeroAll();
+    }
+
+    int seqSize = 0;
+
+    //fprintf(tf, "batch %d(%d)\n", tc++, sc);
+
+    /* this might be slow on GPUs :( */
+    for(int s = seq; s < seq + sc; s++){
+        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
+        CheckNTErrors(len <= max, "Something is wrong!");
+        for(int w = 0; w < len; w++){
+            batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
+            //batchEnc->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+            paddingEnc->Set2D(1.0F, s - seq, w);
+            paddingDec->Set2D(1.0F, s - seq, w);
+            if (w > 0)
+                gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
+            if (w == len - 1) {
+                if (isDoubledEnd)
+                    gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+                else
+                    gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
+            }
+            wCount++;
+            /*fprintf(tf, "%d", buf[seqOffset[s] + w]);
+            if(w < seqLen[s] - 1)
+                fprintf(tf, " ");
+            else
+                fprintf(tf, "\n");*/
+            if(seqs != NULL)
+                seqs[seqSize++] = buf[seqOffset[s] + w];
+        }
+        if(seqs != NULL){
+            for(int w = len; w < max; w++)
+                seqs[seqSize++] = -1;
+        }
+    }
+
+    fflush(tf);
+
+    return sc;
+}
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TTrainer::LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 2);
int seq = MAX(nextSeq, 0);
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");
while(seq + sc < nseqBuf){
/* source-side sequence */
wnEnc = seqLen[seq + sc];
wcEnc += wnEnc;
sc += 1;
if(maxEnc < wnEnc)
maxEnc = wnEnc;
/* target-side sequence */
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
wnDec = len;
wcDec += wnDec;
sc += 1;
if(maxDec < wnDec)
maxDec = wnDec;
int tc = isSmallBatch ? maxEnc * sc / 2 : wcEnc;
if(sc >= sBatch * 2 && tc >= wBatch)
break;
}
nextSeq = seq + sc;
if(sc <= 0)
return 0;
int sCount = sc/2;
int seqSize = 0;
int dimsEnc[3] = {sCount, maxEnc, vsEnc};
int dimsDec[3] = {sCount, maxDec, vsDec};
InitTensor(batchEnc, 3, dimsEnc, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
InitTensor(batchDec, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
batchDec->SetZeroAll();
paddingDec->SetZeroAll();
gold->SetZeroAll();
wCount = 0;
/* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq)/2;
for(int w = 0; w < len; w++){
batchEnc->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
paddingEnc->Set2D(1.0F, sent, w);
wCount++;
}
}
/* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= maxDec, "Something is wrong!");
int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){
paddingDec->Set2D(1.0F, sent, w);
batchDec->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
if(w > 0)
gold->Set3D(1.0F, sent, w - 1, buf[seqOffset[s] + w]);
if (w == len - 1) {
if(isDoubledEnd)
gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
else
gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w + 1]);
            }

            wCount++;

            if(seqs != NULL)
                seqs[seqSize++] = buf[seqOffset[s] + w];
        }

        if(seqs != NULL){
            for(int w = len; w < maxDec; w++)
                seqs[seqSize++] = -1;
        }
    }

    return sc;
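For orientation, a hypothetical call site for LoadBatchMT (not code from this commit; file, devID, mem and the batch limits are placeholders):

    XTensor batchEnc, paddingEnc, batchDec, paddingDec, gold;
    int wc = 0;

    /* sBatch = 32 sentence pairs and wBatch = 2048 words control when the
       batch is considered full; the return value is the number of
       sequences consumed (source + target) */
    int sc = trainer.LoadBatchMT(file, &batchEnc, &paddingEnc,
                                 &batchDec, &paddingDec, &gold,
                                 NULL, vSizeEnc, vSizeDec, 32, 2048,
                                 true, wc, devID, mem, true);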
...@@ -715,8 +922,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs) ...@@ -715,8 +922,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
XTensor probs; XTensor probs;
InitTensor(&probs, output); InitTensor(&probs, output);
XTensor logOutput;
InitTensor(&logOutput, output);
_Log(output, &logOutput);
/* probs[i,j] = output[i,j] * gold[i,j] */ /* probs[i,j] = log(output[i,j]) * gold[i,j] */
_Multiply(output, gold, &probs); _Multiply(&logOutput, gold, &probs);
/* probability of each word */ /* probability of each word */
XTensor wprobs; XTensor wprobs;
...@@ -730,7 +941,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs) ...@@ -730,7 +941,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
_CopyValues(&wprobs, wordProbs); _CopyValues(&wprobs, wordProbs);
/* reshape the tensor to fit it into the reduce procedure /* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */ TODO: XTensor supports scalars */
dims[0] = 1; dims[0] = 1;
dims[1] = probs.unitNum; dims[1] = probs.unitNum;
probs.Reshape(2, dims); probs.Reshape(2, dims);
...@@ -885,18 +1096,13 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi ...@@ -885,18 +1096,13 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
{ {
CheckNTErrors(output->order == 3, "Wrong dimension number!"); CheckNTErrors(output->order == 3, "Wrong dimension number!");
CheckNTErrors(gold->order == 3, "Wrong dimension number!"); CheckNTErrors(gold->order == 3, "Wrong dimension number!");
int num = padding->GetDim(0); DTYPE count = _ReduceSumAll(padding);
XTensor * factor = NewTensorBuf(1, &num, padding->dataType, 1.0F, padding->devID, padding->mem);
_ReduceSum(padding, factor, padding->order - 1);
_ExpMe(output); _ExpMe(output);
_DivDim(output, factor, output, 0); _ScaleAndShiftMe(output, 1/count);
_LogMe(output); _LogMe(output);
_DivDim(gold, factor, gold, 0); _ScaleAndShiftMe(gold, 1/count);
DelTensorBuf(factor);
} }
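A worked form of the new rescaling: with count = \sum_i padding_i (the total number of non-padded positions), exponentiating, scaling by 1/count and taking the log again shifts the log-probabilities by a constant,

    \log\frac{\exp(output)}{count} = output - \log(count)

and gold is scaled by the same 1/count. Note that the normalizer changes as well: the old _ReduceSum/_DivDim path divided each row by its own non-padded length, while the new code uses one global count for the whole batch.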
/* /*
......
...@@ -79,6 +79,9 @@ public: ...@@ -79,6 +79,9 @@ public:
/* vocabulary size of the source side */ /* vocabulary size of the source side */
int vSize; int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */ /* learning rate */
float lrate; float lrate;
...@@ -100,6 +103,10 @@ public: ...@@ -100,6 +103,10 @@ public:
/* indicates whether we use adam */ /* indicates whether we use adam */
bool useAdam; bool useAdam;
/* step counter used for validation (assumed from the usage) */
int validStep;

/* the current epoch (assumed from the usage) */
int curEpoch;
/* hyper parameters of adam*/ /* hyper parameters of adam*/
float adamBeta1; float adamBeta1;
float adamBeta2; float adamBeta2;
...@@ -128,8 +135,13 @@ public: ...@@ -128,8 +135,13 @@ public:
/* number of batches on which we do model update */ /* number of batches on which we do model update */
int updateStep; int updateStep;
/* indicates whether we double the </s> symble for the output of lms */ /* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd; bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
public: public:
/* constructor */ /* constructor */
...@@ -142,7 +154,7 @@ public: ...@@ -142,7 +154,7 @@ public:
void Init(int argc, char ** argv); void Init(int argc, char ** argv);
/* train the model */ /* train the model */
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model); bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */ /* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model); void Test(const char * fn, const char * ofn, T2TModel * model);
...@@ -158,11 +170,34 @@ public: ...@@ -158,11 +170,34 @@ public:
/* load a batch of sequences */ /* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM, int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output, XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int * seqs,
int step, int vs, int sBatch, int wBatch, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount, bool isSorted, int &wCount,
int devID, XMem * mem); int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */ /* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile); void Shuffle(const char * srcFile, const char * tgtFile);
......
...@@ -25,6 +25,8 @@ ...@@ -25,6 +25,8 @@
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TTrainer.h" #include "T2TTrainer.h"
#include "../../tensor/XDevice.h" #include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
namespace transformer namespace transformer
{ {
...@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv) ...@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, ""); LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, ""); LoadParamString(argc, args, "output", outputFN, "");
T2TTrainer trainer;
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
/* learn model parameters */ /* learn model parameters */
if(strcmp(trainFN, "")) if(strcmp(trainFN, "")) {
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model); double startT = GetClockSec();
T2TTrainer trainer;
trainer.Init(argc, args);
char * fn = new char[MAX_LINE_LENGTH];
char * fn1 = new char[MAX_LINE_LENGTH];
char * fn2 = new char[MAX_LINE_LENGTH];
modelFN = strcmp(modelFN, "") ? modelFN : (char *)"checkpoint.model";
int epoch;
bool isTrain;
for(epoch = 1; epoch <= trainer.nepoch; epoch++) {
sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
if(epoch == 1) {
T2TModel model;
model.InitModel(argc, args);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
else {
T2TModel model;
model.InitModel(argc, args);
model.Read(fn);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
if(trainer.useEpochCheckpoint && strcmp(testFN, "")) {
T2TTrainer tester;
tester.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
model.Read(fn1);
tester.Test(testFN, fn2, &model);
}
if(!isTrain)
break;
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, trainer.nepoch);
XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);
delete[] fn;
delete[] fn1;
delete[] fn2;
}
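For concreteness, with the default model name the sprintf patterns above produce per-epoch files such as (illustrative):

    checkpoint.model.epoch.001          /* model dumped after epoch 1 */
    checkpoint.model.epoch.002          /* model dumped after epoch 2 */
    checkpoint.model.epoch.002.output   /* test output, when useEpochCheckpoint is set */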
/* don't dump the final model */
/* save the final model */ /* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, "")) //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN); // model.Dump(modelFN);
T2TModel model;
model.InitModel(argc, args);
/* load the model if necessary */ /* load the model if necessary */
if(strcmp(modelFN, "")) if(strcmp(modelFN, ""))
......
...@@ -446,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int ...@@ -446,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x"); CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x");
CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] && CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] &&
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size."); gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
blockSize[0] = bXSize; blockSize[0] = bXSize;
blockSize[1] = bYSize; blockSize[1] = bYSize;
......
...@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation) ...@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
if(!myIsForComputation && devID >= 0 && cublasHandle != NULL) if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
cublasDestroy(cublasHandle); cublasDestroy(cublasHandle);
if(myIsForComputation) if(myIsForComputation)
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle."); CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
SetDevice(devIDBackup); SetDevice(devIDBackup);
#endif #endif
...@@ -1392,8 +1393,8 @@ void XMem::CreateBLASHandle() ...@@ -1392,8 +1393,8 @@ void XMem::CreateBLASHandle()
"Cannot destroy the cublas handle."); "Cannot destroy the cublas handle.");
} }
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle."); "Cannot create the cublas handle.");
#endif #endif
} }
......
...@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i) ...@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)
/* /*
set the value of a cell set the value of a cell
>> value - value to assign to the cell >> value - value we intend to set
>> index - index of the cell for each dimension >> index - index of the cell for each dimension
>> >> size - size of the index
*/ */
bool XTensor::Set(DTYPE value, int index[], int size) bool XTensor::Set(DTYPE value, int index[], int size)
{ {
...@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size) ...@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)
/* /*
set the value of a cell in a 1d tensor set the value of a cell in a 1d tensor
>> value - value to assign to the cell >> value - value we intend to set
>> i - item offset >> i - item offset
<< return - succeeded or not
*/ */
bool XTensor::Set1D(DTYPE value, int i) bool XTensor::Set1D(DTYPE value, int i)
{ {
...@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2) ...@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
return SetToDevice(devID, GetCell(dims, 3), value); return SetToDevice(devID, GetCell(dims, 3), value);
} }
/*
set the integer value of a cell
>> value - value we intend to set
>> index - index of the cell for each dimension
>> size - size of the index
<< return - succeeded or not
*/
bool XTensor::SetInt(int value, int index[], int size)
{
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
return SetToDeviceInt(devID, GetCell(index, size), value);
}
/*
set the integer value of a cell in a 1d tensor
>> value - value we intend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1DInt(int value, int i)
{
CheckNTErrors((order == 1), "Cannot set a 1d cell for a tensor whose order is not 1!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[1] = {i};
return SetToDeviceInt(devID, GetCell(dims, 1), value);
}
/*
set the integer value of a cell in a 2d tensor
>> value - value we intend to set
>> ni - row index
>> mi - column index
<< return - succeeded or not
*/
bool XTensor::Set2DInt(int value, int ni, int mi)
{
CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[2] = {ni, mi};
return SetToDeviceInt(devID, GetCell(dims, 2), value);
}
/*
set the integer value of a cell in a 3d tensor
>> value - value we intend to set
>> d0 - index of dimension 0
>> d1 - index of dimension 1
>> d2 - index of dimension 2
<< return - succeeded or not
*/
bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
{
CheckNTErrors(order == 3, "Cannot set a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[3] = {d0, d1, d2};
return SetToDeviceInt(devID, GetCell(dims, 3), value);
}
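A small usage sketch for the new integer setters (the tensor here is hypothetical; the NewTensor arguments follow the convention used elsewhere in this diff):

    /* a 2 x 3 integer tensor on the CPU, filled with word ids */
    int dims[2] = {2, 3};
    XTensor * ids = NewTensor(2, dims, X_INT, 1.0F, -1, NULL);
    ids->SetZeroAll();
    ids->Set2DInt(42, 0, 1);     /* ids[0][1] = 42 */
    int index[2] = {1, 2};
    ids->SetInt(7, index, 2);    /* ids[1][2] = 7  */

This is the same path LoadBatchLM takes when it writes word ids into batchEnc via Set2DInt.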
/* /*
increase the value of a cell in a 2d tensor increase the value of a cell in a 2d tensor
>> value - value we intend to set >> value - value we intend to set
...@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize, ...@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem); XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
}
if(myMem != NULL) if(myMem != NULL)
tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize); tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
else else
......
...@@ -326,6 +326,18 @@ public: ...@@ -326,6 +326,18 @@ public:
/* set the value of a cell in a 3d tensor */ /* set the value of a cell in a 3d tensor */
bool Set3D(DTYPE value, int d0, int d1, int d2); bool Set3D(DTYPE value, int d0, int d1, int d2);
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i);
/* set the integer value of a cell in a 2d tensor */
bool Set2DInt(int value, int ni, int mi);
/* set the integer value of a cell in a 3d tensor */
bool Set3DInt(int value, int d0, int d1, int d2);
/* increase the value of a cell in a 2d */ /* increase the value of a cell in a 2d */
bool Add2D(DTYPE value, int ni, int mi); bool Add2D(DTYPE value, int ni, int mi);
......
...@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value) ...@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
return true; return true;
} }
/* assign an integer value to a variable that is kept on a specified device */
bool SetToDeviceInt(int devID, void * p, int value)
{
if(p == NULL)
return false;
if(devID < 0)
*(int*)p = value;
else{
XMemCopy(p, devID, &value, -1, sizeof(int));
}
return true;
}
/* get the next number with power of 2 */ /* get the next number with power of 2 */
unsigned int GetNextPower2(unsigned int n) unsigned int GetNextPower2(unsigned int n)
{ {
......
...@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p); ...@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
extern DTYPE ToCPU(int devID, void * value); extern DTYPE ToCPU(int devID, void * value);
extern int ToCPUInt(int devID, void * value); extern int ToCPUInt(int devID, void * value);
extern bool SetToDevice(int devID, void * p, DTYPE value); extern bool SetToDevice(int devID, void * p, DTYPE value);
extern bool SetToDeviceInt(int devID, void * p, int value);
extern unsigned int GetNextPower2(unsigned int n); extern unsigned int GetNextPower2(unsigned int n);
extern void XSleep(int sleepTime); extern void XSleep(int sleepTime);
extern double GetClock(); extern double GetClock();
......
...@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain) ...@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize; fanOut = numOutputFmaps * receptiveFieldSize;
} }
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut)); DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0) * std; tensor->SetDataRand(-finfout, finfout);
_SetDataRand(tensor, -a, a); //_SetDataRand(tensor, -finfout, finfout);
} }
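The old and new expressions are the same Xavier (Glorot) uniform bound: with std = gain * sqrt(2/(fanIn + fanOut)), the old sampling range [-a, a] satisfies

    a = \sqrt{3} \cdot gain \sqrt{\frac{2}{fanIn + fanOut}} = gain \sqrt{\frac{6}{fanIn + fanOut}} = finfout

so the rewrite simply folds the \sqrt{3} factor into one expression.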
/* /*
...@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
if(tensor == NULL) if(tensor == NULL)
return; return;
/* GPU code */ /* CPU code */
if(tensor->devID < 0){ if(tensor->devID < 0){
DTYPE variance = upper - lower; DTYPE variance = upper - lower;
......
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
#include "Gather.h" #include "Gather.h"
#include "CopyIndexed.h" #include "CopyIndexed.h"
#include "../../XUtility.h"
#include "../shape/Reshape.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize) ...@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
return result; return result;
} }
/*
gather indexed sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor (2D)
>> index - the index tensor
<< return - the result of copying indexed sub-tensors
*/
XTensor Gather(const XTensor &s, const XTensor &index)
{
int indexSize = index.unitNum;
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int * srcIndex = new int[index.unitNum];
if(index.dataType == X_INT) {
XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * index.unitSize);
}
else if(index.dataType == X_FLOAT || index.dataType == X_DOUBLE) {
DTYPE * tmp = new DTYPE[indexSize];
XMemCopy(tmp, -1, index.data, index.devID, indexSize * index.unitSize);
for(int i = 0; i < indexSize; i++)
srcIndex[i] = (int)tmp[i];
delete[] tmp;
    }
    else {
        ShowNTErrors("Unsupported data type of the index tensor!");
    }
XTensor tensor;
tensor = Gather(s, 0, srcIndex, indexSize);
delete[] srcIndex;
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = tensor.GetDim(-1);
XTensor t;
t = Reshape(tensor, index.order + 1, dims);
delete[] dims;
return t;
}
else {
return tensor;
}
}
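A usage sketch for the tensor-indexed overload (embedding lookup is the expected pattern; the names are illustrative):

    /* embTable: a vocabSize x embSize matrix; wordIds: a batch x length
       tensor of word ids. Because wordIds.order > 1, the Reshape branch
       above gives the result the shape batch x length x embSize. */
    XTensor emb = Gather(embTable, wordIds);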
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS ...@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize); XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, const XTensor &index);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __GATHER_H__ #endif // __GATHER_H__
\ No newline at end of file
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include <math.h> #include <math.h>
#include "ReduceSum.h" #include "ReduceSum.h"
......
...@@ -105,15 +105,15 @@ void KernelReduceSum(DTYPE * input, DTYPE * output, ...@@ -105,15 +105,15 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2]; __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = threadIdx.x * blockDim.y + threadIdx.y; int idx = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; unsigned int i = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y; unsigned int j = blockIdx.x*blockDim.x + threadIdx.x;
if(i >= stride * blockNum) if(i >= stride * blockNum)
return; return;
if(threadIdx.y == 0) if(threadIdx.x == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0; bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
__syncthreads(); __syncthreads();
...@@ -121,7 +121,7 @@ void KernelReduceSum(DTYPE * input, DTYPE * output, ...@@ -121,7 +121,7 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
int iOffset = i % stride; int iOffset = i % stride;
bool isValid = (i < stride * blockNum && j < strideNum); bool isValid = (i < stride * blockNum && j < strideNum);
DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.x] : 0; DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.y] : 0;
if(power != (DTYPE)1.0){ if(power != (DTYPE)1.0){
if(power == (DTYPE)2.0) if(power == (DTYPE)2.0)
...@@ -136,21 +136,20 @@ void KernelReduceSum(DTYPE * input, DTYPE * output, ...@@ -136,21 +136,20 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
value = exp(value); value = exp(value);
/* load data into the shared mem */ /* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value; iData[threadIdx.y * blockDim.x + threadIdx.x] = value;
__syncthreads(); __syncthreads();
/* do reduction in shared mem */ /* do reduction in shared mem */
for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){ for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
if (threadIdx.y < s) if (threadIdx.x < s)
iData[idx] += iData[idx + s]; iData[idx] += iData[idx + s];
__syncthreads(); __syncthreads();
} }
/* write result for this block to the output array */ /* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) if (threadIdx.x == 0 && blockIdx.x < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y]; output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = iData[threadIdx.y * blockDim.x];
} }
/* /*
...@@ -282,15 +281,15 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -282,15 +281,15 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
unsigned int tid = threadIdx.y; unsigned int tid = threadIdx.x;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y; unsigned int j = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; unsigned int i = blockIdx.y * blockDim.y + threadIdx.y;
if(i >= stride * blockNum) if(i >= stride * blockNum)
return; return;
if (threadIdx.y == 0) if (threadIdx.x == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0; bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
__syncthreads(); __syncthreads();
...@@ -299,17 +298,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -299,17 +298,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
int iOffset = i % stride; int iOffset = i % stride;
bool isValid = j < strideNum; bool isValid = j < strideNum;
bool isValid2 = j + blockDim.y < strideNum; bool isValid2 = j + blockDim.x < strideNum;
DTYPE * data = iData + threadIdx.x * blockDim.y; DTYPE * data = iData + threadIdx.y * blockDim.x;
DTYPE * inputData = input + k * blockSize; DTYPE * inputData = input + k * blockSize;
DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.x]: 0; DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.y]: 0;
DTYPE value2 = isValid2 ? inputData[(j + blockDim.y) * stride + iOffset] - bias[threadIdx.x]: 0; DTYPE value2 = isValid2 ? inputData[(j + blockDim.x) * stride + iOffset] - bias[threadIdx.y]: 0;
if(power != (DTYPE)1.0){ if(power != (DTYPE)1.0){
if(power == (DTYPE)2.0){ if(power == (DTYPE)2.0){
value = value * value; value = value * value;
value2 = value2 *value2; value2 = value2 * value2;
} }
else if(power == (DTYPE)0.5){ else if(power == (DTYPE)0.5){
value = sqrt(value); value = sqrt(value);
...@@ -329,17 +328,25 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -329,17 +328,25 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
} }
value = value + value2; value = value + value2;
__syncthreads(); __syncthreads();
value = shflDownReduceSum(value); value = shflDownReduceSum(value);
if ((tid & 0x1f) == 0) { data[tid / 32] = value; } if ((tid & 0x1f) == 0)
data[tid / 32] = value;
__syncthreads(); __syncthreads();
if (tid < 32){ if (tid < 32){
if (tid < blockDim.y / 32) if (tid < blockDim.x / 32)
value = data[tid]; value = data[tid];
else value = 0; else
value = shflDownReduceSum(value); value = 0;
if (tid == 0 && blockIdx.y < reducedStrideNum) value = shflDownReduceSum(value);
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
if (tid == 0 && blockIdx.x < reducedStrideNum) {
output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = value;
}
} }
} }
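For reference, the warp primitive that shflDownReduceSum presumably wraps is the standard shuffle-down reduction; a minimal sketch assuming CUDA 9+ full-warp semantics (older toolkits would use __shfl_down without the mask):

    __device__ DTYPE ShflDownReduceSumSketch(DTYPE value)
    {
        /* halve the number of active lanes at each step; after five
           steps lane 0 holds the sum over all 32 lanes of the warp */
        for (int offset = 16; offset > 0; offset >>= 1)
            value += __shfl_down_sync(0xffffffff, value, offset);
        return value;
    }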
...@@ -480,7 +487,7 @@ void KernelReduceSumFast(__half * input, __half * output, ...@@ -480,7 +487,7 @@ void KernelReduceSumFast(__half * input, __half * output,
if data storage is discontinuous, use this way to reduce if data storage is discontinuous, use this way to reduce
*/ */
__global__ __global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum, void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
int blockNum, DTYPE * shift, DTYPE power, bool isExp) int blockNum, DTYPE * shift, DTYPE power, bool isExp)
{ {
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
...@@ -568,7 +575,8 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output, ...@@ -568,7 +575,8 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output,
if (tid < 32){ if (tid < 32){
if (tid < blockDim.y / 32) if (tid < blockDim.y / 32)
threadSum = data[tid]; threadSum = data[tid];
else threadSum = 0; else
threadSum = 0;
threadSum = shflDownReduceSum(threadSum); threadSum = shflDownReduceSum(threadSum);
if (tid == 0 && blockIdx.y < reducedStrideNum) if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum; output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
...@@ -640,29 +648,28 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long ...@@ -640,29 +648,28 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
/* /*
in this situation we use block.x * grid.x to process one vector for continuous reads in this situation we use block.x * grid.x to process one vector for continuous reads
*/ */
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum) void discontinuousStorageNoShareMemThreadAllocation(dim3* grid, dim3* block, int stride, int blockNum)
{ {
block.x = 512; block->x = 512;
block.y = 1; block->y = 1;
if ((stride * blockNum) % 512 == 0) if ((stride * blockNum) % 512 == 0)
grid.x = (stride * blockNum) / 512; grid->x = (stride * blockNum) / 512;
else else
grid.x = (stride * blockNum) / 512 + 1; grid->x = (stride * blockNum) / 512 + 1;
grid.y = 1; grid->y = 1;
} }
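The if/else above computes a ceiling division; an equivalent branch-free form (same result for positive operands) would be:

    grid->x = (stride * blockNum + 511) / 512;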
/* /*
adjust the thread number so that we can use the warp optimization adjust the thread number so that we can use the warp optimization
*/ */
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads) void adjustThreadForUseWarpOptimization(dim3* blocks, dim3* threads)
{ {
if (threads.x > 1){ if (threads->y > 1){
blocks.x *= threads.x; blocks->y *= threads->y;
threads.x = 1; threads->y = 1;
} }
if (threads.y < 32) if (threads->x < 32)
threads.y = 32; threads->x = 32;
} }
/* /*
...@@ -724,7 +731,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -724,7 +731,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
DTYPE * buf1 = buf; DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL; DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
int devIDBackup; int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup); ProtectCudaDev(input->devID, devIDBackup);
...@@ -733,19 +740,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -733,19 +740,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks; dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum); continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128) if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp); KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else { else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1; if (blockNum % 4 != 0)
else blockNum = blockNum / 4; blockNum = (int)(blockNum / 4) + 1;
KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp); else
blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data,
strideNum, blockNum, sp, power, isExp);
} }
} }
else if (stride != 1 && stride * blockNum > 4096){ else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize); //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data; //unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride); //convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block; dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum); discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, blockNum,sp, power, isExp); strideNum, blockNum,sp, power, isExp);
} }
...@@ -769,50 +780,50 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -769,50 +780,50 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
/* unroll the reduction procedure. The code is messy but it is faster. */ /* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum <= 32) { if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
else if (strideNum < 128) { else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!"); CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads); adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
else if (strideNum < 256) { else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!"); CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads); adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
else if (strideNum < 512) { else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!"); CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads); adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
else { else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!"); CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads); adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
} }
......
...@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!) ...@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
>> source - the input tensor >> source - the input tensor
<< return - the total summation << return - the total summation
*/ */
DTYPE _ReduceSumAll(XTensor * source) DTYPE _ReduceSumAll(const XTensor * source)
{ {
int order = source->order; int order = source->order;
DTYPE summation; DTYPE summation;
XTensor * big = NewTensor(source); XTensor * big = NewTensor(source);
_CopyValues(source, big); _CopyValues(source, big);
    for(int i = 0; i < order; i++) { for(int i = order - 1; i >= 0; i--) {
        if(i == order - 1) if(i == 0)
            big->Reshape(big->unitNum, 1); big->Reshape(1, big->unitNum);

        int leadingDim = big->order - 1;

        int * dimSize; int * dimSize;
        dimSize = getDimSize(big, 0); dimSize = getDimSize(big, leadingDim);
        XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem); XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
                                     source->devID, source->mem);

        _ReduceSum(big, little, 0); _ReduceSum(big, little, leadingDim);
delete big; delete big;
delete[] dimSize; delete[] dimSize;
...@@ -81,7 +82,7 @@ sum all the items of the tensor ...@@ -81,7 +82,7 @@ sum all the items of the tensor
>> source - the input tensor >> source - the input tensor
<< return - the total summation << return - the total summation
*/ */
DTYPE ReduceSumAll(XTensor & source) DTYPE ReduceSumAll(const XTensor & source)
{ {
return _ReduceSumAll(&source); return _ReduceSumAll(&source);
} }
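A worked instance of the rewritten loop: for a 2 x 3 source, the first pass reduces the last dimension (leadingDim = 1) to shape {2}; on the final pass the Reshape turns that into a 1 x 2 tensor and one more reduction leaves a single cell, i.e.

    \sum_{i,j} x_{i,j} = \sum_i \Big( \sum_j x_{i,j} \Big)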
......
...@@ -28,10 +28,10 @@ ...@@ -28,10 +28,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */ /* sum all the items of the tensor */
DTYPE _ReduceSumAll(XTensor * source); DTYPE _ReduceSumAll(const XTensor * source);
/* sum all the items of the tensor */ /* sum all the items of the tensor */
DTYPE ReduceSumAll(XTensor & source); DTYPE ReduceSumAll(const XTensor & source);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold, ...@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n]; int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold), CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!"); "The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!"); CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss), "The loss tensor and padding tensor must be of the same shape!"); CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be of the same shape!");
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!"); CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem); XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem); XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
/* l = log(output) */
_Log(output, logBuf);
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
}
/* negate result n = negate(mul) */
_NegateMe(mulBuf);
_ReduceSum(mulBuf, loss, n); _Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
if(weight != NULL)
_MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
DelTensorBuf(mulBuf); if(padding != NULL)
DelTensorBuf(logBuf); _MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
} }
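Written out, the per-position loss that the two buffers assemble (with i ranging over the leading/class dimension n, j over the remaining positions, w_i = 1 when weight == NULL, and p_j the 0/1 padding mask applied at the end) is

    loss_j = -\, p_j \sum_i w_i \, gold_{i,j} \, \log(output_{i,j})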
/* /*
...@@ -109,19 +96,12 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -109,19 +96,12 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight, XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int order = output->order; int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n); int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order, CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!"); "Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(output, gold), CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!"); "The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize, CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
...@@ -133,6 +113,22 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -133,6 +113,22 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!"); "TODO!");
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i - 1)), "Unmatched tensors!");
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1; int blockNum = 1;
int blockSize = 1; int blockSize = 1;
int stride = 1; int stride = 1;
...@@ -148,31 +144,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -148,31 +144,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * lossData = (DTYPE*)loss->data; DTYPE * lossData = (DTYPE*)loss->data;
DTYPE tmpLoss; DTYPE tmpLoss;
int lossPos;
int goldPos;
if(weight == NULL) { if(weight == NULL) {
if(padding == NULL) { if(padding == NULL) {
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
tmpLoss = 0;
tmpLoss = 0; lossPos = i * stride + j;
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
tmpLoss += -(*(goldData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(DTYPE)log(*(outputData + beg + j)); tmpLoss += -(*(goldData + goldPos)) *
*(lossData + i) = tmpLoss; (DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + i) == 0) if(*(paddingData + lossPos) == 0)
*(lossData + i) = 0; *(lossData + lossPos) = 0;
else{ else {
tmpLoss = 0; tmpLoss = 0;
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
tmpLoss += -(*(goldData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(DTYPE)log(*(outputData + beg + j)); tmpLoss += -(*(goldData + goldPos)) *
*(lossData + i) = tmpLoss; (DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
} }
} }
} }
...@@ -181,30 +186,36 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -181,30 +186,36 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * weightData = (DTYPE*)weight->data; DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) { if(padding == NULL) {
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
tmpLoss = 0;
tmpLoss = 0; lossPos = i * stride + j;
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
tmpLoss += -(*(goldData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(DTYPE)log(*(outputData + beg + j)) * tmpLoss += -(*(goldData + goldPos)) *
(*(weightData + j)); (DTYPE)log(*(outputData + goldPos)) *
*(lossData + i) = tmpLoss; (*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + i) == 0) if(*(paddingData + lossPos) == 0)
*(lossData + i) = 0; *(lossData + lossPos) = 0;
else{ else {
tmpLoss = 0; tmpLoss = 0;
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
tmpLoss += -(*(goldData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(DTYPE)log(*(outputData + beg + j)) * tmpLoss += -(*(goldData + goldPos)) *
(*(weightData + j)); (DTYPE)log(*(outputData + goldPos)) *
*(lossData + i) = tmpLoss; (*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
} }
} }
} }
...@@ -212,26 +223,6 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -212,26 +223,6 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
} }
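The stride-aware addressing deserves a worked example. Suppose output has shape 2 x 4 x 3 with leading (class) dimension n = 1; then blockNum = 2, leadingDimSize = 4, stride = 3 and blockSize = leadingDimSize * stride = 12, so

    goldPos = i * blockSize + j + k * stride = 12i + j + 3k

visits exactly one class column for fixed (i, j). The old beg + j indexing was only correct when n was the last dimension (stride = 1).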
/* /*
get the dimSize after reduce operation
>> tensor - a tensor to be reduced
>> n - the reduce dimension
<< return - the pointer of dimSize
*/
int * reduceDimSize(const XTensor * tensor, int n)
{
int order = tensor->order;
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = tensor->dimSize[i];
else if(i > n)
dimSize[i - 1] = tensor->dimSize[i];
}
return dimSize;
}
/*
compute the cross entropy loss compute the cross entropy loss
loss = sum_{i} (-gold_i * log(output_i)) loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions where gold and output are distributions
...@@ -247,73 +238,45 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold, ...@@ -247,73 +238,45 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight, LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n]; int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold), CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!"); "The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!"); CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1, "The loss tensor and padding tensor must be same shape!"); CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"The padding tensor must have one order less than the output tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem); int * dimSize = new int[order - 1];
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem); for (int i = 0; i < order; i++) {
if(i < n)
/* l = log(output) */ dimSize[i] = output->dimSize[i];
_Log(output, logBuf); else if(i > n)
dimSize[i - 1] = output->dimSize[i];
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
} }
/* negate multiply result n = negate(mul) */ XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
_NegateMe(mulBuf); output->devID, output->mem);
int * dimSize;
dimSize = reduceDimSize(output, n);
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
/* reduce sum all classes */
_ReduceSum(mulBuf, lossInter, n);
DelTensorBuf(mulBuf); _CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
DelTensorBuf(logBuf);
DTYPE loss;
/* compute the total loss */ loss = _ReduceSumAll(lossBuf);
if(padding != NULL) {
XTensor * temp = NewTensor(lossInter);
_Multiply(lossInter, padding, temp);
loss = _ReduceSumAll(temp);
delete temp;
}
else
loss = _ReduceSumAll(lossInter);
if(reduceWay == REDUCE_MEAN) { if(reduceWay == REDUCE_MEAN) {
int nonZeroNum; int nonZeroNum;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = lossInter->unitNum; nonZeroNum = lossBuf->unitNum;
} }
else { else {
XTensor * tmp = NewTensor(padding); XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp); nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp; DelTensorBuf(tmp);
} }
loss = loss / (DTYPE)nonZeroNum; loss = loss / (DTYPE)nonZeroNum;
...@@ -326,7 +289,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold, ...@@ -326,7 +289,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
} }
delete[] dimSize; delete[] dimSize;
delete lossInter; DelTensorBuf(lossBuf);
return loss; return loss;
} }
...@@ -349,11 +312,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -349,11 +312,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight, LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
#ifdef USE_CUDA DTYPE loss = 0;
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
int order = output->order; int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
...@@ -370,6 +329,23 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -370,6 +329,23 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!"); "TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
int blockNum = 1; int blockNum = 1;
int blockSize = 1; int blockSize = 1;
int stride = 1; int stride = 1;
...@@ -383,63 +359,78 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -383,63 +359,78 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * outputData = (DTYPE*)output->data; DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data; DTYPE * goldData = (DTYPE*)gold->data;
DTYPE loss = 0; int paddingPos;
int goldPos;
int nonZeroNum = 0; int nonZeroNum = 0;
if(weight == NULL) { if(weight == NULL) {
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = blockNum; nonZeroNum = blockNum * stride;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++) for(int i = 0; i < blockNum; i++) {
loss += -(*(goldData + beg + j)) * for(int j = 0; j < stride; j++) {
(DTYPE)log(*(outputData + beg + j)); paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0) for(int j = 0; j < stride; j++) {
continue; paddingPos = i * stride + j;
else{ if(*(paddingData + paddingPos) == 0)
nonZeroNum += 1; continue;
else {
int beg = i * blockSize; nonZeroNum += 1;
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
loss += -(*(goldData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(DTYPE)log(*(outputData + beg + j)); loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
} }
} }
} }
} }
else { else {
DTYPE * weightData = (DTYPE*)weight->data; DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = blockNum; nonZeroNum = blockNum * stride;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
for(int j = 0; j < blockSize; j++) paddingPos = i * stride + j;
loss += -(*(goldData + beg + j)) * for(int k = 0; k < leadingDimSize; k++) {
(DTYPE)log(*(outputData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(*(weightData + j)); loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0) for(int j = 0; j < stride; j++) {
continue; paddingPos = i * stride + j;
else{ if(*(paddingData + paddingPos) == 0)
nonZeroNum += 1; continue;
else {
int beg = i * blockSize; nonZeroNum += 1;
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
loss += -(*(goldData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(DTYPE)log(*(outputData + beg + j)) * loss += -(*(goldData + goldPos)) *
(*(weightData + j)); (DTYPE)log(*(outputData + goldPos)) *
(*(weightData + j)); (*(weightData + k));
}
}
} }
} }
} }
} }
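
A note on the indexing introduced above: with the row-major decomposition blockSize = stride * leadingDimSize, element (i, k, j) of block i, class k and column j lives at i * blockSize + k * stride + j, while its padding flag lives at i * stride + j. A minimal standalone check of that flattening (hypothetical verification code, not part of the library):

#include <cassert>

int main()
{
    const int blockNum = 2, leadingDimSize = 3, stride = 4;
    const int blockSize = stride * leadingDimSize;
    bool seen[2 * 3 * 4] = {false};
    for(int i = 0; i < blockNum; i++) {
        for(int j = 0; j < stride; j++) {
            int paddingPos = i * stride + j;          /* one flag per (block, column) */
            assert(paddingPos >= 0 && paddingPos < blockNum * stride);
            for(int k = 0; k < leadingDimSize; k++) {
                int goldPos = i * blockSize + j + k * stride;
                assert(!seen[goldPos]);               /* each position hit exactly once */
                seen[goldPos] = true;
            }
        }
    }
    return 0;
}
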
...@@ -471,17 +462,10 @@ with respect to gold standard, and y is the model output ...@@ -471,17 +462,10 @@ with respect to gold standard, and y is the model output
>> padding - mark the positions that are ignored and do not contribute to the loss computation >> padding - mark the positions that are ignored and do not contribute to the loss computation
>> leadingDim - the leading dimension for the output >> leadingDim - the leading dimension for the output
*/ */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * weight, const XTensor * padding, const XTensor * gold, const XTensor * weight,
int leadingDim) XTensor * padding, int leadingDim)
{ {
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int order = output->order; int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n); int leadingDimSize = output->GetDim(n);
...@@ -497,7 +481,26 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor ...@@ -497,7 +481,26 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
"Wrong padding tensor!"); "Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!"); "TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1; int blockNum = 1;
int blockSize = 1; int blockSize = 1;
int stride = 1; int stride = 1;
...@@ -512,25 +515,35 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor ...@@ -512,25 +515,35 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
DTYPE * outputData = (DTYPE*)output->data; DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data; DTYPE * goldData = (DTYPE*)gold->data;
int paddingPos;
int goldPos;
if(weight == NULL) { if(weight == NULL) {
if(padding == NULL) { if(padding == NULL) {
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
*(dedyData + beg + j) = -(*(goldData + beg + j)) / goldPos = i * blockSize + j + k * stride;
(*(outputData + beg + j)); *(dedyData + goldPos) = -(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
if(*(paddingData + i) == 0) paddingPos = i * stride + j;
memset(dedyData + beg, 0, blockSize * unitSize); for(int k = 0; k < leadingDimSize; k++) {
else goldPos = i * blockSize + j + k * stride;
for(int j = 0; j < blockSize; j++) if(*(paddingData + paddingPos) == 0)
*(dedyData + beg + j) = -(*(goldData + beg + j)) / *(dedyData + goldPos) = 0;
(*(outputData + beg + j)); else
*(dedyData + goldPos) = -(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
} }
} }
} }
...@@ -538,39 +551,45 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor ...@@ -538,39 +551,45 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
DTYPE * weightData = (DTYPE*)weight->data; DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) { if(padding == NULL) {
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
*(dedyData + beg + j) = -(*(weightData + j)) * goldPos = i * blockSize + j + k * stride;
(*(goldData + beg + j)) / *(dedyData + goldPos) = -(*(weightData + k)) *
(*(outputData + beg + j)); (*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
if(*(paddingData + i) == 0) paddingPos = i * stride + j;
memset(dedyData + beg, 0, blockSize * unitSize); for(int k = 0; k < leadingDimSize; k++) {
else goldPos = i * blockSize + j + k * stride;
for(int j = 0; j < blockSize; j++) { if(*(paddingData + paddingPos) == 0)
*(dedyData + beg + j) = -(*(weightData + j)) * *(dedyData + goldPos) = 0;
(*(goldData + beg + j)) / else
(*(outputData + beg + j)); *(dedyData + goldPos) = -(*(weightData + k)) *
(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
} }
} }
} }
} }
if(padding != NULL) { //if(padding != NULL) {
XTensor * tmp = NewTensor(padding); // XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp); // _IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp); // int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); // _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp; // delete tmp;
} //}
else { //else {
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum); // _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
} //}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
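
For reference, the per-element gradient that the CPU loops above compute, restated as a hypothetical helper (not a library function); w is 1 when no weight tensor is given:

/* dE/dy at one position: -w * g / y when kept, 0 when the padding flag is zero */
static inline float CrossEntropyGradElem(float y, float g, float w, bool padded)
{
    return padded ? 0.0F : -w * g / y;
}
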
...@@ -26,80 +26,20 @@ ...@@ -26,80 +26,20 @@
#include "../XDevice.h" #include "../XDevice.h"
#include "CrossEntropy.cuh" #include "CrossEntropy.cuh"
#include "CrossEntropy.h" #include "CrossEntropy.h"
#include "../core/reduce/ReduceSumAll.h" #include "../core/arithmetic/Div.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h" #include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h" #include "../core/math/ScaleAndShift.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
#include "../core/shape/Transpose.h"
#include "../core/shape/Unsqueeze.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
compute the cross entropy loss (cuda kernel)
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> lossData - the data pointer of loss tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> stride - the size of a data block
*/
__global__
void KernelCrossEntropy(DTYPE * outputData, DTYPE * goldData,
DTYPE * lossData, DTYPE * weightData,
DTYPE * paddingData, int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
DTYPE tmpLoss = 0;
if(weightData == NULL) {
if(paddingData == NULL) {
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
}
}
else {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
}
}
}
/*
compute the cross entropy loss (cuda version) compute the cross entropy loss (cuda version)
loss = sum_{i} (-gold_i * log(output_i)) loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions where gold and output are distributions
...@@ -112,79 +52,27 @@ where gold and output are distributions ...@@ -112,79 +52,27 @@ where gold and output are distributions
>> leadingDim - the leading dimension for the output >> leadingDim - the leading dimension for the output
*/ */
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold, void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight, XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1,
"Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize; XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
blockNum = output->unitNum / blockSize; XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
int cudaGrids[3];
int cudaBlocks[3];
//GDevs.GetCudaThread2D(output->devID, blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]); _Log(output, interBuf1);
dim3 threads(cudaBlocks[0], cudaBlocks[1]); _Multiply(gold, interBuf1, interBuf2);
int devIDBackup; if(weight != NULL)
ProtectCudaDev(output->devID, devIDBackup); _MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
DTYPE * outputData = (DTYPE*)output->data; if(padding != NULL)
DTYPE * goldData = (DTYPE*)gold->data; _MultiplyMe(loss, padding);
DTYPE * lossData = (DTYPE*)loss->data;
if(weight == NULL) {
if(padding == NULL)
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
NULL, NULL,
blockNum, blockSize);
else
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
NULL, (DTYPE*)padding->data,
blockNum, blockSize);
}
else {
if(padding == NULL)
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
(DTYPE*)weight->data, NULL,
blockNum, blockSize);
else
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
(DTYPE*)weight->data, (DTYPE*)padding->data,
blockNum, blockSize);
}
BacktoCudaDev(output->devID, devIDBackup);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
} }
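
The rewritten CUDA path above composes existing tensor ops (_Log, _Multiply, _MultiplyDimMe, _NegateMe, _ReduceSum, _MultiplyMe) instead of a bespoke kernel. A plain C++ restatement of the same chain on raw arrays, sketched for a 2-D [leadingDimSize x stride] case with leadingDim = 0 (hypothetical helper, for checking the math only):

#include <cmath>

void CrossEntropyRef(const float * output, const float * gold,
                     const float * weight,   /* may be NULL */
                     const float * padding,  /* may be NULL */
                     int leadingDimSize, int stride, float * loss)
{
    for(int j = 0; j < stride; j++) {
        float sum = 0.0F;
        for(int k = 0; k < leadingDimSize; k++) {
            /* _Log + _Multiply (+ _MultiplyDimMe when a weight is given) */
            float v = gold[k * stride + j] * std::log(output[k * stride + j]);
            if(weight != NULL)
                v *= weight[k];
            sum += v;
        }
        loss[j] = -sum;                  /* _NegateMe + _ReduceSum along dim 0 */
        if(padding != NULL)
            loss[j] *= padding[j];       /* _MultiplyMe(loss, padding) */
    }
}
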
/* /*
...@@ -230,87 +118,38 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -230,87 +118,38 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i]; dimSize[i - 1] = output->dimSize[i];
} }
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem); XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
_CudaCrossEntropyFast(output, gold, lossInter, weight, padding, leadingDim); _CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossInter); loss = _ReduceSumAll(lossBuf);
if(reduceWay == REDUCE_MEAN) { if(reduceWay == REDUCE_MEAN) {
int nonZeroNum; int nonZeroNum;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = lossInter->unitNum; nonZeroNum = lossBuf->unitNum;
} }
else { else {
XTensor * tmp = NewTensor(padding); XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp); nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp; DelTensorBuf(tmp);
} }
loss = loss / (DTYPE)nonZeroNum; loss = loss / (DTYPE)nonZeroNum;
} }
else if(reduceWay == REDUCE_SUM) {
return loss; /* don't need to do anything */
}
/*
backward computation of cross entropy function (kernel version)
>> dedyData - the data pointer of dedy tensor
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> blockSize - the size of a data block
*/
__global__
void KernelCrossEntropyBackward(DTYPE * dedyData, DTYPE * outputData, DTYPE * goldData,
DTYPE * weightData, DTYPE * paddingData,
int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
if(weightData == NULL) {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else {
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
else
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
} }
else { else {
if(paddingData == NULL) { ShowNTErrors("TODO");
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else {
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
else
for(int j = 0; j < blockSize; j++) {
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
} }
delete[] dimSize;
DelTensorBuf(lossBuf);
return loss;
} }
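
When REDUCE_MEAN is requested, the code above divides by the number of unpadded positions rather than by the element count. Restated on raw arrays, assuming padding holds 0/1 flags and padded loss entries are already zero (a sketch, not library code):

float ReduceLossRef(const float * loss, const float * padding, int n, bool mean)
{
    float sum = 0.0F;
    int nonZero = 0;
    for(int i = 0; i < n; i++) {
        sum += loss[i];
        if(padding == NULL || padding[i] != 0)
            nonZero++;                   /* what _IsNonZero + _ReduceSumAll count */
    }
    return mean ? sum / (float)nonZero : sum;
}
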
/* /*
...@@ -330,85 +169,43 @@ with respect to gold standard, and y is the model output ...@@ -330,85 +169,43 @@ with respect to gold standard, and y is the model output
*/ */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight, const XTensor * gold, const XTensor * weight,
const XTensor * padding, int leadingDim) XTensor * padding, int leadingDim)
{ {
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(dedy, output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]); _Div(gold, output, dedy);
dim3 threads(cudaBlocks[0], cudaBlocks[1]); _NegateMe(dedy);
if(weight != NULL)
int devIDBackup; _MultiplyDimMe(dedy, weight, n);
ProtectCudaDev(output->devID, devIDBackup);
DTYPE * dedyData = (DTYPE*)dedy->data;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
if(weight == NULL) {
if(padding == NULL)
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
NULL, NULL,
blockNum, blockSize);
else
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
NULL, (DTYPE*)padding->data,
blockNum, blockSize);
}
else {
if(padding == NULL)
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
(DTYPE*)weight->data, NULL,
blockNum, blockSize);
else
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
(DTYPE*)weight->data, (DTYPE*)padding->data,
blockNum, blockSize);
}
if(padding != NULL) { if(padding != NULL) {
XTensor * tmp = NewTensor(padding); int paddingOrder = padding->order;
_IsNonZero(padding, tmp); int * paddingDims = new int[paddingOrder];
int nonZeroNum = (int)_ReduceSumAll(tmp); memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); padding->Reshape(padding->unitNum);
delete tmp;
} int order = dedy->order;
else { int * dims = new int[order];
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum); memcpy(dims, dedy->dimSize, dedy->order * sizeof(int));
dedy->Reshape(dedy->unitNum/dedy->GetDim(n), dedy->GetDim(n));
_MultiplyDimMe(dedy, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedy->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
} }
BacktoCudaDev(output->devID, devIDBackup); //if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
} }
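
The Reshape + _MultiplyDimMe pair above masks padded positions without a dedicated kernel: dedy is viewed as a [unitNum/leadDimSize x leadDimSize] matrix, padding as a flat vector of per-row flags, and multiplying along dim 0 zeroes every padded row before both tensors are reshaped back. On raw arrays the effect is simply (hypothetical helper):

/* scale each row of a [rows x classes] gradient by its padding flag;
   a zero flag wipes the whole row */
void MaskRowsByPadding(float * dedy, const float * padding, int rows, int classes)
{
    for(int r = 0; r < rows; r++)
        for(int c = 0; c < classes; c++)
            dedy[r * classes + c] *= padding[r];
}
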
......
...@@ -40,7 +40,7 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -40,7 +40,7 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
/* backward computation of cross entropy function */ /* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL, const XTensor * gold, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1); XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
const XTensor * padding = NULL, int leadingDim = -1); const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */ /* backward computation of cross entropy function */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * weight = NULL, const XTensor * padding = NULL, const XTensor * gold, const XTensor * weight = NULL,
int leadingDim = -1); XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -279,8 +279,8 @@ better numerical stability. ...@@ -279,8 +279,8 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction) >> leadDim - leading dimension (along which we perform reduction)
*/ */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!"); CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
...@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int leadDimRDI = y->order - leadDim - 1; int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA #ifdef USE_CUDA
if (gold->devID >= 0) { if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName); _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
return; return;
} }
#endif #endif
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "LogSoftmax.h" #include "LogSoftmax.h"
#include "LogSoftmax.cuh" #include "LogSoftmax.cuh"
#include "Loss.cuh" #include "Loss.cuh"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/reduce/ReduceSum.cuh" #include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh" #include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h" #include "../XDevice.h"
...@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx ...@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
>> lossName - name of the loss function >> lossName - name of the loss function
*/ */
__global__ __global__
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size, LOSS_FUNCTION_NAME lossName) void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
int size, LOSS_FUNCTION_NAME lossName)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -371,10 +373,12 @@ better numerical stability. ...@@ -371,10 +373,12 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction) >> leadDim - leading dimension (along which we perform reduction)
*/ */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
leadDim = leadDim < 0 ? y->order - 1 : leadDim;
CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs."); CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID && gold->devID == y->devID), CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
"Tensors used in log softmax are not on the same GPU."); "Tensors used in log softmax are not on the same GPU.");
...@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
dimensionSize * stride, lossName); dimensionSize * stride, lossName);
} }
} }
if(padding != NULL) {
int n = leadDim;
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -37,8 +37,8 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, ...@@ -37,8 +37,8 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
/* de/dx (Cuda version) */ /* de/dx (Cuda version) */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName); LOSS_FUNCTION_NAME lossName);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -38,8 +38,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim); ...@@ -38,8 +38,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */ /* de/dx */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName); LOSS_FUNCTION_NAME lossName);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y, ...@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
for (int j = 0; j < stride; j++) { for (int j = 0; j < stride; j++) {
for (int k = 0; k < tLen; k++) { for (int k = 0; k < tLen; k++) {
*(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = -(DTYPE)*(tp + i * stride * dimensionSize *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) =
+ j + stride * (tBeg + k)) / (DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k)); -(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) /
(DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
} }
} }
} }
......
...@@ -174,8 +174,8 @@ See more details in LogSoftmaxBackward(...) ...@@ -174,8 +174,8 @@ See more details in LogSoftmaxBackward(...)
>> leadDim - leading dimension (along which we perform reduction) >> leadDim - leading dimension (along which we perform reduction)
*/ */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!"); CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
...@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
#ifdef USE_CUDA #ifdef USE_CUDA
if(y->devID >= 0){ if(y->devID >= 0){
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName); _CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
return; return;
} }
#endif #endif
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "Loss.cuh" #include "Loss.cuh"
#include "../core/reduce/ReduceSum.h" #include "../core/reduce/ReduceSum.h"
#include "../core/arithmetic/Multiply.h" #include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/shape/Unsqueeze.h" #include "../core/shape/Unsqueeze.h"
#include "../core/arithmetic/Sum.h" #include "../core/arithmetic/Sum.h"
#include "../XDevice.h" #include "../XDevice.h"
...@@ -309,9 +310,11 @@ See more details in SoftmaxBackward ...@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
*/ */
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
int n = leadDim < 0 ? y->order - 1 : leadDim;
CheckNTErrors((x->devID >= 0), "Backward computation of softmax must be run on GPUs."); CheckNTErrors((x->devID >= 0), "Backward computation of softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID), "Matrices used in softmax are not on the same GPU."); CheckNTErrors((x->devID == y->devID), "Matrices used in softmax are not on the same GPU.");
CheckNTErrors((y->order >= 1), "Empty tensor!"); CheckNTErrors((y->order >= 1), "Empty tensor!");
...@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){ if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
_Sum(y, gold, dedx, -1.0F); _Sum(y, gold, dedx, -1.0F);
if(padding != NULL) {
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
} }
else if(lossName == ONEHOTERROR){ else if(lossName == ONEHOTERROR){
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
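
A quick check of the _Sum(y, gold, dedx, -1.0F) shortcut above, which rests on the standard softmax/cross-entropy identity (assuming the gold distribution sums to one):

E = -\sum_k g_k \log y_k, \qquad y_k = \frac{e^{x_k}}{\sum_j e^{x_j}}
\quad\Longrightarrow\quad
\frac{\partial E}{\partial x_k} = y_k - g_k
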
...@@ -37,8 +37,8 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s ...@@ -37,8 +37,8 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
/* de/dx (Cuda version) */ /* de/dx (Cuda version) */
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName); LOSS_FUNCTION_NAME lossName);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -35,8 +35,8 @@ XTensor Softmax(const XTensor &x, int leadDim); ...@@ -35,8 +35,8 @@ XTensor Softmax(const XTensor &x, int leadDim);
/* de/dx */ /* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName); LOSS_FUNCTION_NAME lossName);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -169,8 +169,8 @@ bool TestDropout2() ...@@ -169,8 +169,8 @@ bool TestDropout2()
_DropoutBackward(y, x, dedy, dedx, 1, dropProb); _DropoutBackward(y, x, dedy, dedx, 1, dropProb);
/* check result */ /* check result */
y->Dump(stderr, "y"); //y->Dump(stderr, "y");
dedx->Dump(stderr, "dedy"); //dedx->Dump(stderr, "dedy");
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -193,8 +193,8 @@ bool TestDropout2() ...@@ -193,8 +193,8 @@ bool TestDropout2()
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb); _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);
/* check result */ /* check result */
yGPU->Dump(stderr, "yGPU"); //yGPU->Dump(stderr, "yGPU");
dedxGPU->Dump(stderr, "dedyGPU"); //dedxGPU->Dump(stderr, "dedyGPU");
/* destroy variables */ /* destroy variables */
delete x; delete x;
......
...@@ -146,7 +146,7 @@ bool TestLogSoftmax2() ...@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
_LogSoftmax(x, y, 1); _LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */ /* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY); _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
/* check result */ /* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...@@ -174,7 +174,7 @@ bool TestLogSoftmax2() ...@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
_LogSoftmax(xGPU, yGPU, 1); _LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */ /* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY); _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
/* check result */ /* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F); gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
...@@ -250,7 +250,7 @@ bool TestLogSoftmax3() ...@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
_LogSoftmax(x, y, 1); _LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */ /* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR); _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
/* check result */ /* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...@@ -278,7 +278,7 @@ bool TestLogSoftmax3() ...@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
_LogSoftmax(xGPU, yGPU, 1); _LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */ /* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR); _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
/* check result */ /* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
...@@ -66,7 +66,9 @@ bool TestPower1() ...@@ -66,7 +66,9 @@ bool TestPower1()
bUser = Power(*a, 2.0F); bUser = Power(*a, 2.0F);
/* check results */ /* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F); cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -88,7 +90,9 @@ bool TestPower1() ...@@ -88,7 +90,9 @@ bool TestPower1()
bUserGPU = Power(*aGPU, 2.0F); bUserGPU = Power(*aGPU, 2.0F);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F); gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete a; delete a;
...@@ -153,7 +157,9 @@ bool TestPower2() ...@@ -153,7 +157,9 @@ bool TestPower2()
bUser = Power(*a, 1.0F); bUser = Power(*a, 1.0F);
/* check results */ /* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F); cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -175,7 +181,9 @@ bool TestPower2() ...@@ -175,7 +181,9 @@ bool TestPower2()
bUserGPU = Power(*aGPU, 1.0F); bUserGPU = Power(*aGPU, 1.0F);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F); gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete a; delete a;
...@@ -214,7 +222,7 @@ bool TestPower3() ...@@ -214,7 +222,7 @@ bool TestPower3()
for (int i = 0; i < aOrder; i++) for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i]; aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0F, 1.0F}, DTYPE aData[3][2] = { {1.0F, 1.0F},
{2.0F, 3.0F}, {2.0F, 3.0F},
{4.0F, 5.0F} }; {4.0F, 5.0F} };
DTYPE answer[3][2] = { {1.0F, 1.0F}, DTYPE answer[3][2] = { {1.0F, 1.0F},
...@@ -240,7 +248,9 @@ bool TestPower3() ...@@ -240,7 +248,9 @@ bool TestPower3()
bUser = Power(*a, 0.0F); bUser = Power(*a, 0.0F);
/* check results */ /* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F); cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -262,7 +272,9 @@ bool TestPower3() ...@@ -262,7 +272,9 @@ bool TestPower3()
bUserGPU = Power(*aGPU, 0.0F); bUserGPU = Power(*aGPU, 0.0F);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F); gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete a; delete a;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/ */
#include "TReduceSum.h" #include "TReduceSum.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -155,6 +156,457 @@ bool TestReduceSum1() ...@@ -155,6 +156,457 @@ bool TestReduceSum1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B >= 128
(50, 1000000) -> (50), dim = 1
*/
bool TestReduceSum2()
{
/* a tensor of size (50, 1000000) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 50;
sDimSize[1] = 1000000;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (50) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 50;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B < 128
(1000000, 50) -> (1000000), dim = 1
*/
bool TestReduceSum3()
{
/* a tensor of size (1000000, 50) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 1000000;
sDimSize[1] = 50;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (1000000) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 1000000;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A < 10, B is free
(5, 1000000) -> (5), dim = 1
*/
bool TestReduceSum4()
{
/* a tensor of size (5, 1000000) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 5;
sDimSize[1] = 1000000;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (5) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 5;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C > 4096
(500, 1000, 500) -> (500, 500), dim = 1
*/
bool TestReduceSum5()
{
/* a tensor of size (500, 1000, 500) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 500;
sDimSize[1] = 1000;
sDimSize[2] = 500;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (500, 500) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 500;
tDimSize[1] = 500;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C <= 4096
(50, 10000, 50) -> (50, 50), dim = 1
*/
bool TestReduceSum6()
{
/* a tensor of size (50, 10000, 50) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 50;
sDimSize[1] = 10000;
sDimSize[2] = 50;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (50, 50) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 50;
tDimSize[1] = 50;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
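
Judging from the shapes, the A, B and C in the case comments above appear to denote the number of reduction blocks, the size of the reduced dimension and the trailing stride, respectively, so the six cases exercise different dispatch regimes of the reduction kernel (an inference, not documented in the source). All cases fill the source with ones and reduce along dim 1, so every output element should equal the size of that dimension; a reference for the 2-D cases (hypothetical helper mirroring what CheckData verifies):

void ReduceSumDim1Ref(const float * s, float * t, int rows, int cols)
{
    for(int i = 0; i < rows; i++) {
        float sum = 0.0F;
        for(int j = 0; j < cols; j++)
            sum += s[i * cols + j];
        t[i] = sum;                      /* equals (float)cols when s is all ones */
    }
}
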
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -175,6 +627,51 @@ bool TestReduceSum() ...@@ -175,6 +627,51 @@ bool TestReduceSum()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceSum2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* case 3 test */
//caseFlag = TestReduceSum3();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 3 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestReduceSum4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
///* case 5 test */
//caseFlag = TestReduceSum5();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 5 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestReduceSum6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -146,7 +146,7 @@ bool TestSoftmax2() ...@@ -146,7 +146,7 @@ bool TestSoftmax2()
_Softmax(x, y, 1); _Softmax(x, y, 1);
/* call SoftmaxBackward function */ /* call SoftmaxBackward function */
_SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY); _SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
/* check result */ /* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...@@ -174,7 +174,7 @@ bool TestSoftmax2() ...@@ -174,7 +174,7 @@ bool TestSoftmax2()
_Softmax(xGPU, yGPU, 1); _Softmax(xGPU, yGPU, 1);
/* call SoftmaxBackward function */ /* call SoftmaxBackward function */
_SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY); _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
/* check result */ /* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
...@@ -20,8 +20,9 @@ ...@@ -20,8 +20,9 @@
*/ */
#include "TSumDim.h" #include "TSumDim.h"
#include "../core/arithmetic/SumDim.h"
#include "../XTensor.h" #include "../XTensor.h"
#include "../core/arithmetic/SumDim.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -251,6 +252,225 @@ bool TestSumDim2() ...@@ -251,6 +252,225 @@ bool TestSumDim2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 3: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
*/
bool TestSumDim3()
{
/* a tensor of size (20, 40, 4000) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 20;
aDimSize[1] = 40;
aDimSize[2] = 4000;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (40) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 40;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetZeroAll();
cMeGPU->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(1000000, 50) + (50) = (1000000, 50), dim = 1.
*/
bool TestSumDim4()
{
/* a tensor of size (1000000, 50) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 1000000;
aDimSize[1] = 50;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (50) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 50;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetZeroAll();
cMeGPU->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
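
Both new cases exercise broadcasting of b over dim 1 of a. The reference semantics on a row-major [d0 x d1 x d2] tensor, with beta defaulting to 1 in the calls above (a sketch; the 2-D case of TestSumDim4 is the d2 = 1 instance):

void SumDimRef(const float * a, const float * b, float * c,
               int d0, int d1, int d2, float beta)
{
    for(int i = 0; i < d0; i++)
        for(int j = 0; j < d1; j++)
            for(int k = 0; k < d2; k++) {
                int pos = (i * d1 + j) * d2 + k;
                c[pos] = a[pos] + b[j] * beta;   /* b broadcast along dims 0 and 2 */
            }
}
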
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -279,6 +499,24 @@ bool TestSumDim() ...@@ -279,6 +499,24 @@ bool TestSumDim()
} }
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSumDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
///* case 4 test */
//caseFlag = TestSumDim4();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 4 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */ /* other cases test */
/* /*
......