Commit 03a9836e by xuchen

1. add some base functions 2. better implementation for t2t

parent 52c0e35a
@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
     else if(operID == FUNC_LOGSOFTMAX){
         int leadDim = income.GetParamInt(0);
         CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
     }
     else if(operID == FUNC_RECTIFY)
         _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
     else if(operID == FUNC_SOFTMAX){
         int leadDim = income.GetParamInt(0);
         CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
+        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
     }
     else{
         ShowNTErrors("Wrong activation function type!");
...
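For reference, the extra NULL argument threads an optional padding tensor through the activation backward routines. For log-softmax followed by cross entropy the gradient with respect to the input has a simple closed form; a minimal sketch in plain C++ (independent of NiuTensor, illustrative names, assuming the gold distribution sums to 1):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    /* sketch: for y = logsoftmax(x) and E = -sum_i g[i] * y[i],
       dE/dx[i] = softmax(x)[i] - g[i]; a padding weight of 0
       simply zeroes the gradient of a padded position */
    std::vector<float> LogSoftmaxCrossEntropyBackward(const std::vector<float> &x,
                                                      const std::vector<float> &g,
                                                      float paddingWeight)
    {
        float maxv = *std::max_element(x.begin(), x.end());
        float sum = 0;
        for (float v : x)
            sum += std::exp(v - maxv);
        std::vector<float> dedx(x.size());
        for (size_t i = 0; i < x.size(); i++)
            dedx[i] = paddingWeight * (std::exp(x[i] - maxv) / sum - g[i]);
        return dedx;
    }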
@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
-                        XTensor * dedy, XTensor * dedx,
+                        XTensor * dedy, XTensor * dedx, XTensor * padding,
                         int funcID, void * params,
                         LOSS_FUNCTION_NAME lossName)
 {
@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
     }
     else if(funcID == FUNC_LOGSOFTMAX){
         int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
     }
     else if(funcID == FUNC_RECTIFY){
         _RectifyBackward(gold, y, x, dedy, dedx, lossName);
@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
         _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
     }else if(funcID == FUNC_SOFTMAX){
         int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
     }
     else{
         ShowNTErrors("wrong function found when call the backward process!");
@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
 >> lossName - name of the loss, e.g., cross entropy
 */
 void XLossGrad::Compute(XTensor * gold, XTensor * y,
-                        XTensor * dedy,
+                        XTensor * dedy, XTensor * padding,
                         LOSS_FUNCTION_NAME lossName)
 {
-    _LossBackward(dedy, gold, y, lossName);
+    //_LossBackward(dedy, gold, y, lossName);
+    if(lossName == CROSSENTROPY)
+        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
 }
 }
\ No newline at end of file
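The call above suggests what the new _CrossEntropyBackward base function computes per position; this is an assumption read off the call site, not taken from the library source. A plain sketch:

    #include <cstddef>

    /* sketch: for E = -sum_i g[i] * log(y[i]), dE/dy[i] = -g[i] / y[i];
       a padding weight of 0 removes a position from the loss entirely */
    void CrossEntropyBackwardSketch(float * dedy, const float * y, const float * g,
                                    std::size_t n, float paddingWeight)
    {
        for (std::size_t i = 0; i < n; i++)
            dedy[i] = -paddingWeight * g[i] / y[i];
    }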
@@ -36,13 +36,13 @@ class XLossGrad
 public:
     /* compute dE/dx for a given function y = f(x) */
     void Compute(XTensor * gold, XTensor * y, XTensor * x,
-                 XTensor * dedy, XTensor * dedx,
+                 XTensor * dedy, XTensor * dedx, XTensor * padding,
                  int funcID, void * params,
                  LOSS_FUNCTION_NAME lossName);
     /* compute dE/dy for variable y and error(loss) function E */
     void Compute(XTensor * gold, XTensor * y,
-                 XTensor * dedy,
+                 XTensor * dedy, XTensor * padding,
                  LOSS_FUNCTION_NAME lossName);
 };
...
@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
     DelTensorBuf(b);
     node->visitMark = NODE_FINISHED;
-    delete b;
 }
 /*
...
@@ -55,7 +55,7 @@ void XNetClearAll()
 XNet::XNet()
 {
     nodes.Clear();
-    isGradEfficient = true;
+    isGradEfficient = false;
 }
 /* de-constructor */
@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
     XList golds(1);
     golds.Add(&gold);
-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+    Backward(roots, golds, paddings, loss);
+}
+/*
+backward propagation to obtain gradient wrt. the loss/error function
+>> root - root node (output) of the network
+>> gold - gold standard for the output
+>> padding - specify a target value that is ignored and does not contribute to the loss computation
+>> loss - name of loss function
+*/
+void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
+{
+    XList roots(1);
+    roots.Add(&root);
+    XList golds(1);
+    golds.Add(&gold);
+    XList paddings(1);
+    paddings.Add(&padding);
+    Backward(roots, golds, paddings, loss);
 }
 /*
@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
     XList golds(1);
     golds.Add(NULL);
-    Backward(roots, golds, loss);
+    XList paddings(1);
+    paddings.Add(NULL);
+    Backward(roots, golds, paddings, loss);
 }
 /*
@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes
 >> root - a list of root nodes (output) of the network
 >> gold - a list of gold standard for the output
+>> padding - specify a target value that is ignored
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
 {
     Traverse(roots);
@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
     for(int i = 0; i < roots.count; i++){
         XTensor * root = (XTensor*)roots.Get(i);
         XTensor * gold = (XTensor*)golds.Get(i);
+        XTensor * padding = (XTensor*)paddings.Get(i);
         XLink &income = root->income;
         int funcID = income.typeID;
         void * params = income.params;
@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
            Note that we do not need to obtain dE/dy here because it is no use in the
            following process of back-propagation */
         if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
+            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
                 XTensor * x = income.tails[0];
                 XNoder::MakeGrad(x);
-                lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
+                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
                 root->visitMark = NODE_FINISHED;
             }
+            else {
+                XNoder::MakeGrad(root);
+                lossGrad.Compute(gold, root, root->grad, padding, loss);
+            }
+        }
         /* we compute dE/dy (y is the output) if no predefined activation function is used */
         else{
             XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, loss);
+            lossGrad.Compute(gold, root, root->grad, NULL, loss);
         }
     }
@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
 /*
 backward propagation to obtain gradient
 with a number of root nodes
->> root - a list of root nodes (output) of the network
+>> roots - a list of root nodes (output) of the network
 >> loss - name of loss function
 */
 void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
 {
     XList golds(roots.count);
-    for(int i = 0; i < roots.count; i++)
-        golds.Add(NULL);
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++) {
+        golds.Add(NULL);
+        paddings.Add(NULL);
+    }
+    Backward(roots, golds, paddings, loss);
+}
+/*
+backward propagation to obtain gradient
+with a number of root nodes
+>> roots - a list of root nodes (output) of the network
+>> golds - a list of gold standard for the output
+>> loss - name of loss function
+*/
+void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+{
+    XList paddings(roots.count);
+    for(int i = 0; i < roots.count; i++)
+        paddings.Add(NULL);
-    Backward(roots, golds, loss);
+    Backward(roots, golds, paddings, loss);
 }
 /*
...
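With the new overloads, a caller can hand the network a padding tensor so that padded positions contribute nothing to the loss. A hypothetical usage sketch, based only on the signatures above (tensor construction omitted):

    /* hypothetical helper, not part of the commit */
    void TrainStep(XNet &net, XTensor &output, XTensor &gold, XTensor &padding)
    {
        /* padding holds 1.0 for real tokens and 0.0 for padded positions */
        net.Backward(output, gold, padding, CROSSENTROPY);
    }

The old two-argument form still compiles and now simply forwards a NULL padding list.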
@@ -62,17 +62,24 @@ struct XNet
     /* backward propagation to obtain gradient wrt. the loss/error function */
     void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
+    /* backward propagation to obtain gradient wrt. the loss/error function */
+    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
     /* backward propagation to obtain gradient */
     void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
     /* backward propagation to obtain gradient wrt. the loss/error function
        with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
     /* backward propagation to obtain gradient
        with a number of root nodes */
     void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
+    /* backward propagation to obtain gradient
+       with a number of root nodes */
+    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
     /* backward computation for a given node */
     void BackwardNode(XTensor * node, bool isEfficent = false);
...
@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
         if(isEnd)
             break;
+        Test(testFN, outputFN, model);
     }
     double elapsed = GetClockSec() - startT;
@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
         /* for y = softmax(s), we get dE/ds
            where E is the error function (define by loss) */
-        _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
+        _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
         /* for s = x * w, we get
            dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
...
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
 */

#include <math.h>
#include "T2TDecoder.h"
#include "../../tensor/core/CHeader.h"

namespace transformer
{

/* constructor */
AttDecoder::AttDecoder()
{
    attentionsEnde = NULL;
    attEndeLayerNorms = NULL;
}

/* de-constructor */
AttDecoder::~AttDecoder()
{
    delete[] attentionsEnde;
    delete[] attEndeLayerNorms;
}

/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored,
                           int myDevID, XMem * myMem)
{
    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);

    attentionsEnde = new T2TAttention[nlayer];
    attEndeLayerNorms = new T2TLN[nlayer];

    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
        attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem);
        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
    }
}

/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
    XTensor x;

    x = embedder.Make(inputDec);

    /* dropout */
    if(isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);

    for(int i = 0; i < nlayer; i++){
        XTensor att;
        XTensor ende;
        XTensor ln;
        XTensor fnn;
        XTensor res;
        XTensor nothing;

        /******************/
        /* self attention */
        att = attentions[i].Make(x, x, x, mask, isTraining);

        /* dropout */
        if(isTraining && dropoutP > 0)
            att = Dropout(att, dropoutP);

        /* residual connection */
        res = Sum(att, x);

        /* layer normalization */
        x = attLayerNorms[i].Make(res);

        /*****************************/
        /* encoder-decoder attention */
        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining);

        /* dropout */
        if(isTraining && dropoutP > 0)
            ende = Dropout(ende, dropoutP);

        /* residual connection */
        res = Sum(ende, x);

        /* layer normalization */
        x = attEndeLayerNorms[i].Make(res);

        /*******/
        /* fnn */
        fnn = fnns[i].Make(x, isTraining);

        /* dropout */
        if(isTraining && dropoutP > 0)
            fnn = Dropout(fnn, dropoutP);

        /* residual connection */
        res = Sum(fnn, x);

        /* layer normalization */
        x = fnnLayerNorms[i].Make(res);
    }

    return x;
}

}
@@ -22,19 +22,33 @@
 #ifndef __T2TDECODER_H__
 #define __T2TDECODER_H__
+#include "T2TEncoder.h"
 namespace transformer
 {
-class T2TDecoder
+class AttDecoder : public AttEncoder
 {
 public:
-};
-class AttDecoder : T2TDecoder
-{
-public:
+    /* encoder-decoder attention model of each layer */
+    T2TAttention * attentionsEnde;
+    /* layer normalization for encoder-decoder attention */
+    T2TLN * attEndeLayerNorms;
+public:
+    /* constructor */
+    AttDecoder();
+    /* deconstructor */
+    ~AttDecoder();
     /* initialize the model */
-    void InitModel(int argc, char ** argv);
+    void InitModel(int argc, char ** argv,
+                   bool myIsMasked, int myIgnored,
+                   int myDevID = -1, XMem * myMem = NULL);
+    /* make the decoding network */
+    XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
 };
 }
...
@@ -61,16 +61,17 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
     DTYPE v = 1.0F/(float)sqrt((float)eSize);
-    w.SetDataRand(-v, v);
+    w.SetDataRandn(0, v);
     /* create the positional embedding matrix */
     MakePosEmbedding(eSize, d, maxLength);
 }
 /*
-make positional embeddings (of size eSize * length
-eSize - embedding size
-length - length of the sequenc
+make positional embeddings (of size eSize * length)
+>> eSize - embedding size
+>> d - dimension size of the hidden layers
+>> length - length of the sequence
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
@@ -114,15 +115,15 @@ make the network
 */
 XTensor T2TEmbedder::Make(XTensor &input)
 {
-    CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
+    //CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
     CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-    CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
+    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
     CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
     CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
     int dims[MAX_TENSOR_DIM_NUM];
     memcpy(dims, input.dimSize, input.order * sizeof(int));
-    dims[input.order - 1] = eSize;
+    dims[input.order] = eSize;
     XTensor wordEmbedding;
     XTensor posEmbedding;
@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
     /* we make positional embeddings first */
     //if(!match){
     if(true){
-        InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
+        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
         XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
         _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
     }
     /* then we make word embeddings */
-    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    //wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
+    wordEmbedding = Gather(w, input);
+    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
     /* we sum over the two embeddings */
     return wordEmbedding + posEmbedding;
...
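The switch from MMul over one-hot rows to Gather changes the lookup cost but not the result: gathering row t of w equals multiplying a one-hot vector (1 at position t) with w, and the sqrt(eSize) scaling is applied afterwards in both variants. A plain C++ sketch of the equivalence (illustrative names, independent of NiuTensor):

    #include <cstddef>
    #include <vector>

    /* sketch: row lookup in an embedding matrix w of shape vSize x eSize;
       O(eSize) per token instead of O(vSize * eSize) for the one-hot product */
    std::vector<float> GatherRow(const std::vector<float> &w, int eSize, int tokenId)
    {
        std::size_t begin = static_cast<std::size_t>(tokenId) * eSize;
        return std::vector<float>(w.begin() + begin, w.begin() + begin + eSize);
    }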
@@ -31,6 +31,10 @@ namespace transformer
 /* constructor */
 AttEncoder::AttEncoder()
 {
+    attentions = NULL;
+    fnns = NULL;
+    attLayerNorms = NULL;
+    fnnLayerNorms = NULL;
 }
 /* de-constructor */
...
@@ -59,10 +59,7 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     InitTensor1D(&w, d, X_FLOAT, devID, mem);
     InitTensor1D(&b, d, X_FLOAT, devID, mem);
-    float scale = 1.0F;
-    float finfout = (float)sqrt(6.0F * scale / d);
-    w.SetDataRand(-finfout, finfout);
+    w.SetDataRand(1.0F, 1.0F);
     b.SetZeroAll();
 }
...
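SetDataRand(1.0F, 1.0F) draws from the degenerate interval [1, 1], i.e. it sets the gain vector to all ones; together with b.SetZeroAll() this is the standard layer-normalization initialization (gain 1, bias 0). For reference, a sketch of the computation these parameters enter (the epsilon is an assumption, not taken from T2TLN):

    #include <cmath>

    /* sketch: y[i] = w[i] * (x[i] - mean) / sqrt(var + eps) + b[i] */
    void LayerNormSketch(const float * x, const float * w, const float * b,
                         float * y, int d, float eps)
    {
        float mean = 0, var = 0;
        for (int i = 0; i < d; i++)
            mean += x[i];
        mean /= d;
        for (int i = 0; i < d; i++)
            var += (x[i] - mean) * (x[i] - mean);
        var /= d;
        for (int i = 0; i < d; i++)
            y[i] = w[i] * (x[i] - mean) / std::sqrt(var + eps) + b[i];
    }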
@@ -57,8 +57,8 @@ void T2TModel::InitModel(int argc, char ** argv)
     LoadParamInt(argc, argv, "dev", &devID, -1);
     LoadParamBool(argc, argv, "mem", &useMem, useMem);
     LoadParamInt(argc, argv, "memsize", &memSize, 1024);
-    LoadParamBool(argc, argv, "lm", &isLM, true);
     LoadParamBool(argc, argv, "mt", &isMT, false);
+    LoadParamBool(argc, argv, "lm", &isLM, !isMT);
     LoadParamInt(argc, argv, "nhead", &nhead, 8);
     LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
@@ -71,6 +71,9 @@ void T2TModel::InitModel(int argc, char ** argv)
     encoder.InitModel(argc, argv, isLM, 0, devID, mem);
     outputLayer.InitModel(argc, argv, devID, mem);
+    if(isMT)
+        decoder.InitModel(argc, argv, true, 0, devID, mem);
     XList params(10);
     GetParams(params);
@@ -87,31 +90,52 @@ make the encoding network
 >> isTraining - indicates whether we are training the model
 << return - encoding result
 */
-XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool isTraining)
+XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
 {
     return encoder.Make(input, mask, isTraining);
 }
 /*
-make the entire network (with the output softmax layer)
+make the decoding network
+>> inputDec - input tensor of the decoder
+>> outputEnc - output tensor of the encoder
+>> output - output tensor (distribution)
+>> mask - the mask for positions that are/not involved in computation
+>> isTraining - indicates whether we are training the model
+<< return - decoding result
+*/
+XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
+{
+    return decoder.Make(inputDec, outputEnc, mask, isTraining);
+}
+/*
+make the network for language modeling (with the output softmax layer)
 >> input - input tensor
 >> output - output tensor (distribution)
 >> padding - padding of the sequences
 >> isTraining - indicates whether the model is for training
 */
-void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
+void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
 {
     XTensor encoding;
-    if(isLM){
     /* generate mask to see "previous" words only */
-    int len = input.GetDim(input.order - 2);
-    int * dims = new int[input.order + 1];
+    //int len = input.GetDim(input.order - 2);
+    //int * dims = new int[input.order + 1];
+    //for(int i = 0; i < input.order; i++)
+    //    dims[i + 1] = input.GetDim(i);
+    //dims[0] = nhead;
+    //dims[input.order] = len;
+    //XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
+    int len = input.GetDim(input.order - 1);
+    int * dims = new int[input.order + 2];
     for(int i = 0; i < input.order; i++)
         dims[i + 1] = input.GetDim(i);
     dims[0] = nhead;
-    dims[input.order] = len;
-    XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
+    dims[input.order + 1] = len;
+    XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
     /* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
        this matrix can be used to prevent the attention to current or following words in
@@ -132,29 +156,95 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTr
         dimsPadding[i + 1] = padding2->GetDim(i);
     dimsPadding[0] = nhead;
-    XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
-                                      padding.denseRatio, padding.devID, padding.mem);
+    //XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
+    //                                  padding.denseRatio, padding.devID, padding.mem);
+    //
+    ///* mask of the padding */
+    //_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
+    //_Unsqueeze(padding2, padding3, 0, nhead);
+    //
+    //_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+    //
+    ////_Sum(&mask, padding3, &mask);
+    encoding = MakeEncoder(input, mask, isTraining);
+    outputLayer.Make(encoding, output);
+    delete[] dims;
+    delete[] dimsPadding;
+    //DelTensorBuf(padding3);
+    DelTensorBuf(padding2);
+}
+/*
+make the network for machine translation (with the output softmax layer)
+>> inputEnc - input tensor of the encoder
+>> inputDec - input tensor of the decoder
+>> output - output tensor (distribution)
+>> paddingEnc - padding of the sequences (on the encoder side)
+>> isTraining - indicates whether the model is for training
+*/
+void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining)
+{
+    XTensor encoding;
+    XTensor decoding;
+    XTensor maskEnc;
+    XTensor maskDec;
+    /* generate mask to see "previous" words on the decoder side */
+    int len = inputDec.GetDim(inputDec.order - 2);
+    int * dims = new int[inputDec.order + 1];
+    for(int i = 0; i < inputDec.order; i++)
+        dims[i + 1] = inputDec.GetDim(i);
+    dims[0] = nhead;
+    dims[inputDec.order] = len;
+    InitTensor(&maskDec, inputDec.order + 1, dims, X_FLOAT, 1.0F, inputDec.devID, inputDec.mem);
+    /* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
+       this matrix can be used to prevent the attention to current or following words in
+       a given sequence. */
+    _SetDataLowTri(&maskDec, 1e9F, 0);
+    _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
+    /* padding on the source side */
+    int * dimsPadding = new int[paddingEnc.order + 2];
+    for (int i = 0; i < paddingEnc.order - 1; i++)
+        dimsPadding[i] = paddingEnc.GetDim(i);
+    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
+    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
+    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
+                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+    for (int i = 0; i < padding2->order; i++)
+        dimsPadding[i + 1] = padding2->GetDim(i);
+    dimsPadding[0] = nhead;
+    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
+                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
     /* mask of the padding */
-    _Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
+    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
     _Unsqueeze(padding2, padding3, 0, nhead);
     _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-    _Sum(&mask, padding3, &mask);
+    InitTensor(&maskEnc, padding3);
+    maskEnc.SetZeroAll();
-    encoding = MakeEncoding(input, mask, isTraining);
-    outputLayer.Make(encoding, output);
+    /* generate the mask on the source language side (for padding) */
+    _Sum(&maskEnc, padding3, &maskEnc);
+    encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
+    decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
+    outputLayer.Make(decoding, output);
     delete[] dims;
     delete[] dimsPadding;
-    DelTensorBuf(padding2);
     DelTensorBuf(padding3);
+    DelTensorBuf(padding2);
-    }
-    else{
-        ShowNTErrors("TODO!");
-    }
 }
 /*
@@ -182,6 +272,31 @@ void T2TModel::GetParams(XList &list)
     }
     list.Add(&encoder.embedder.w);
+    if(isMT){
+        for(int i = 0; i < decoder.nlayer; i++){
+            list.Add(&decoder.fnns[i].w1);
+            list.Add(&decoder.fnns[i].b1);
+            list.Add(&decoder.fnns[i].w2);
+            list.Add(&decoder.fnns[i].b2);
+            list.Add(&decoder.attentionsEnde[i].wk);
+            list.Add(&decoder.attentionsEnde[i].wq);
+            list.Add(&decoder.attentionsEnde[i].wv);
+            list.Add(&decoder.attentionsEnde[i].wa);
+            list.Add(&decoder.attEndeLayerNorms[i].w);
+            list.Add(&decoder.attEndeLayerNorms[i].b);
+            list.Add(&decoder.attentions[i].wk);
+            list.Add(&decoder.attentions[i].wq);
+            list.Add(&decoder.attentions[i].wv);
+            list.Add(&decoder.attentions[i].wa);
+            list.Add(&decoder.fnnLayerNorms[i].w);
+            list.Add(&decoder.fnnLayerNorms[i].b);
+            list.Add(&decoder.attLayerNorms[i].w);
+            list.Add(&decoder.attLayerNorms[i].b);
+        }
+        list.Add(&decoder.embedder.w);
+    }
 }
 /*
...
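The decoder mask built in MakeMT combines _SetDataLowTri(1e9) with a scale-and-shift of -1e9: cells on or below the diagonal become 0 and cells above become -1e9, so adding the mask to the attention logits before the softmax effectively forbids attending to current or future positions. A plain sketch of the same construction:

    #include <vector>

    /* sketch: len x len decoder mask; 0 on/below the diagonal, -1e9 above,
       to be added to the attention logits before softmax */
    std::vector<float> MakeCausalMask(int len)
    {
        std::vector<float> m(static_cast<std::size_t>(len) * len);
        for (int i = 0; i < len; i++)
            for (int j = 0; j < len; j++)
                m[i * len + j] = (j <= i) ? 0.0f : -1e9f;
        return m;
    }

The source-side padding mask is analogous: padding values in {0, 1} are mapped to {-1e9, 0} by _ScaleAndShiftMe(padding3, 1e9F, -1e9F).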
@@ -69,10 +69,16 @@ public:
     void InitModel(int argc, char ** argv);
     /* make the encoding network */
-    XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
+    XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
-    /* make the entire network (with the output softmax layer) */
-    void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+    /* make the decoding network */
+    XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining);
+    /* make the network for language modeling (with the output softmax layer) */
+    void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+    /* make the network for machine translation (with the output softmax layer) */
+    void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining);
     /* get parameter matrices */
     void GetParams(XList &list);
...
@@ -66,6 +66,9 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     float scale = 1.0F;
     float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
     w.SetDataRand(-finfout, finfout);
+    DTYPE v = 1.0F/(float)sqrt((float)hSize);
+    w.SetDataRandn(0, v);
 }
 /*
@@ -90,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
 {
     XTensor &x = input;
-    output = LogSoftmax(MMul(x, w), -1);
+    //output = LogSoftmax(MMul(x, w), -1);
+    output = Softmax(MMul(x, w), -1);
 }
 }
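The output layer now emits probabilities (Softmax) instead of log-probabilities; GetProb in the trainer compensates by applying _Log before multiplying with the gold distribution, so the scored quantity is unchanged, since log(softmax(z)) = logsoftmax(z). A tiny numeric check in plain C++:

    #include <cassert>
    #include <cmath>

    /* sketch: scoring with log(Softmax(z)) equals scoring with LogSoftmax(z) */
    int main()
    {
        double z[2] = {1.0, 2.0};
        double Z = std::exp(z[0]) + std::exp(z[1]);
        double logsoftmax0 = z[0] - std::log(Z);
        double softmax0 = std::exp(z[0]) / Z;
        assert(std::fabs(std::log(softmax0) - logsoftmax0) < 1e-12);
        return 0;
    }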
@@ -101,6 +101,7 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamInt(argc, argv, "d", &d, 512);
     LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
     LoadParamInt(argc, argv, "vsize", &vSize, 1);
+    LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
     LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
     LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
     LoadParamBool(argc, argv, "adam", &useAdam, false);
@@ -113,6 +114,7 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
     LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
     LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
+    LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, false);
     buf = new int[bufSize];
     buf2 = new int[bufSize];
@@ -122,6 +124,9 @@ void T2TTrainer::Init(int argc, char ** argv)
     adamBeta1T = 1.0F;
     adamBeta2T = 1.0F;
+    validStep = 0;
+    curEpoch = 0;
 }
 int tc = 0;
@@ -133,9 +138,10 @@ train the model
 >> modelFN - where we keep the model
 >> model - model to train
 */
-void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
+bool T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
 {
-    int epoch = 0;
+    curEpoch += 1;
     int step = 0;
     int wc = 0;
     int wordCount = 0;
@@ -147,7 +153,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     int nCheckpoint = 0;
     int nSkipped = 0;
     int gradStep = 0;
-    int validStep = 0;
+    //int validStep = 0;
     char * trainFN = new char[(int)strlen(fn) + 10];
     strcpy(trainFN, fn);
@@ -157,15 +163,15 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     sprintf(trainFN, "%s.random", fn);
 #endif
-    PrepareModel(model);
     int devID = model->devID;
     XMem * mem = model->mem;
     XNet net;
+    PrepareModel(model);
     double startT = GetClockSec();
-    for(epoch = 1; epoch <= nepoch; epoch++){
+    //for(epoch = 1; epoch <= nepoch; epoch++){
 #ifndef WIN32
     if(isShuffled)
         Shuffle(fn, trainFN);
@@ -177,11 +183,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     wordCount = 0;
     loss = 0;
-    /* batch of input sequences */
-    XTensor batch;
+    /* batch of sequences (on the encoder and decoder sides) */
+    XTensor batchEnc;
+    XTensor batchDec;
     /* padding */
-    XTensor padding;
+    XTensor paddingEnc;
+    XTensor paddingDec;
     /* gold standard */
     XTensor gold;
@@ -189,26 +197,40 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     /* label smoothed gold standard (if needed) */
     XTensor goldSmoothed;
-    while (LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)) {
+    while (LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold,
+                     NULL, vSize, vSizeTgt,
+                     sBatchSize, wBatchSize, isLenSorted, wc, devID, mem, true))
+    {
-        CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
+        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
+        //CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
         /* output probabilities */
         XTensor output;
         /* make the network */
-        model->Make(batch, output, padding, true);
+        if(model->isLM)
+            model->MakeLM(batchEnc, output, paddingEnc, true);
+        else if(model->isMT)
+            model->MakeMT(batchEnc, batchDec, output, paddingEnc, true);
+        else{
+            ShowNTErrors("Illegal model type!");
+        }
         /* back-propagation for obtaining gradients */
         if (labelSmoothingP > 0)
             LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
         /* make paddings for the output */
-        if (output.GetDim(0) > 1)
-            PadOutput(&output, &gold, &padding);
+        //if (output.GetDim(0) > 1)
+        //    PadOutput(&output, &gold, &paddingDec);
+        //output.Dump(tmpFILE, "output: ");
+        //fflush(tmpFILE);
         /* get probabilities */
         float prob = GetProb(&output, &gold, NULL);
         DTYPE lossLocal = -prob / wc;
         bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
@@ -217,17 +239,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
         if (doUpdate) {
             /* rescale the output for normalized loss */
-            RescaleOutput(&output, &g, &padding);
+            //RescaleOutput(&output, &g, &paddingDec);
             /* back-propagation */
-            net.Backward(output, g, CROSSENTROPY);
-            /*for(int i = 0; i < net.nodes.count; i++){
-                XTensor * node = (XTensor*)net.nodes.Get(i);
-                XLink::ShowNode(stderr, node);
-            }
-            exit(0);*/
+            net.Backward(output, g, paddingDec, CROSSENTROPY);
             gradStep += 1;
             loss += -prob;
@@ -255,10 +270,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
                 break;
             }
-            if (step % 1 == 0) {
+            if (step % 100 == 0) {
                 double elapsed = GetClockSec() - startT;
                 XPRINT8(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
-                        lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
+                        lr, elapsed, step, curEpoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
                 if (!doUpdate)
                     XPRINT(0, stderr, " (no update)");
                 XPRINT(0, stderr, "\n");
@@ -274,20 +289,20 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     fclose(file);
     if (isEnd)
-        break;
+        return false;
+    return true;
-    if(useEpochCheckpoint)
-        MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
-    }
-    double elapsed = GetClockSec() - startT;
-    epoch = MIN(epoch, nepoch);
-    XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
-            lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
-    XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
-            elapsed, step, nSkipped, epoch);
+    //if(useEpochCheckpoint)
+    //    MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
+    //}
+    //double elapsed = GetClockSec() - startT;
+    //
+    //epoch = MIN(epoch, nepoch);
+    //
+    //XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
+    //        lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
+    //XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
+    //        elapsed, step, nSkipped, epoch);
     delete[] trainFN;
 }
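Train now runs a single epoch (the epoch loop is commented out and curEpoch persists across calls) and returns false once training should stop. Presumably the caller drives the epoch loop; a hypothetical driver sketch based only on the new signature (names illustrative):

    /* hypothetical driver, not part of the commit */
    void RunTraining(int argc, char ** argv, int nepoch,
                     const char * trainFN, const char * validFN,
                     const char * modelFN, T2TModel * model)
    {
        T2TTrainer trainer;
        trainer.Init(argc, argv);
        /* one call per epoch; Train returns false when training should end */
        for (int epoch = 1; epoch <= nepoch; epoch++)
            if (!trainer.Train(trainFN, validFN, modelFN, model))
                break;
    }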
@@ -322,10 +337,12 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     wordCount = 0;
     /* batch of input sequences */
-    XTensor batch;
+    XTensor batchEnc;
+    XTensor batchDec;
     /* padding */
-    XTensor padding;
+    XTensor paddingEnc;
+    XTensor paddingDec;
     /* gold standard */
     XTensor gold;
@@ -335,18 +352,28 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     ClearBuf();
-    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, false, wc, devID, mem)){
+    while(LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold,
+                    seqs, vSize, vSizeTgt,
+                    1, 1, false, wc, devID, mem, false))
+    {
-        CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
+        //CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
+        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
         /* output probabilities */
         XTensor output;
         /* make the network */
-        model->Make(batch, output, padding, false);
+        if(model->isLM)
+            model->MakeLM(batchEnc, output, paddingEnc, false);
+        else if(model->isMT)
+            model->MakeMT(batchEnc, batchDec, output, paddingEnc, false);
+        else{
+            ShowNTErrors("Illegal model type!");
+        }
-        int bSize = batch.GetDim(0);
-        int length = batch.GetDim(1);
+        int bSize = output.GetDim(0);
+        int length = output.GetDim(1);
         /* prediction probabilities */
         XTensor probs;
@@ -511,6 +538,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
     /* sort the sequences by length */
     if (isSorted) {
+        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
         SampleNode * nodes = new SampleNode[seqCount];
         int count = 0;
         int offset = 0;
@@ -526,19 +554,18 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
             offset += node.size;
         }
-        qsort(nodes, seqCount, sizeof(SampleNode), CompareSampleNode);
+        qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);
         count = 0;
         offset = 0;
-        for(int i = 0; i < seqCount; i++){
+        for(int i = 0; i < seqCount; i += step){
             SampleNode &node = nodes[count];
-            //fprintf(stderr, "%d %d %d\n", node.size, node.id, node.value);
             memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
             for(int j = 0; j < step; j++){
-                seqLen2[count + j] = seqLen[node.id + j];
-                seqOffset[count + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
+                seqLen2[i + j] = seqLen[node.id + j];
+                seqOffset[i + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
             }
-            count += step;
+            count += 1;
             offset += node.size;
         }
@@ -546,6 +573,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
         buf = buf2;
         buf2 = tmp;
+        tmp = seqLen;
         seqLen = seqLen2;
         seqLen2 = tmp;
@@ -566,28 +594,75 @@ void T2TTrainer::ClearBuf()
 load a batch of sequences
 >> file - the handle to the data file
 >> isLM - indicates whether the data is used for training lms
->> batch - the batch of the input sequences
->> padding - padding of the input sequences
->> output - the batch of the output sequences
+>> batchEnc - the batch of the input sequences
+>> paddingEnc - padding of the input sequences
+>> batchDec - the batch of the output sequences
+>> paddingDec - padding of the output sequences
+>> gold - gold standard
 >> seqs - keep the sequences in an array
->> step - the step we go over when move to the next sequence
->> vs - vocabulary size
+>> vsEnc - size of the encoder vocabulary
+>> vsDec - size of the decoder vocabulary
 >> sBatch - batch size of sequences
 >> wBatch - batch size of words
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
 >> mem - memory pool
+>> isTraining - indicates whether we are training the model
 */
 int T2TTrainer::LoadBatch(FILE * file, bool isLM,
-                          XTensor * batch, XTensor * padding, XTensor * output,
+                          XTensor * batchEnc, XTensor * paddingEnc,
+                          XTensor * batchDec, XTensor * paddingDec,
+                          XTensor * gold,
                           int * seqs,
-                          int step, int vs, int sBatch, int wBatch,
+                          int vsEnc, int vsDec, int sBatch, int wBatch,
                           bool isSorted, int &wCount,
-                          int devID, XMem * mem)
+                          int devID, XMem * mem,
+                          bool isTraining)
+{
+    if(isLM){
+        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold,
+                           seqs, vsEnc, sBatch, wBatch,
+                           isSorted, wCount, devID, mem, isTraining);
+    }
+    else{
+        return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold,
+                           seqs, vsEnc, vsDec, sBatch, wBatch,
+                           isSorted, wCount, devID, mem, isTraining);
+    }
+}
+/*
+load a batch of sequences (for LM)
+>> file - the handle to the data file
+>> isLM - indicates whether the data is used for training lms
+>> batchEnc - the batch of the input sequences
+>> paddingEnc - padding of the input sequences
+>> batchDec - the batch of the output sequences
+>> paddingDec - padding of the output sequences
+>> gold - gold standard
+>> seqs - keep the sequences in an array
+>> vs - vocabulary size
+>> sBatch - batch size of sequences
+>> wBatch - batch size of words
+>> isSorted - indicates whether the sequences are sorted by length
+>> wCount - word count
+>> devID - device id
+>> mem - memory pool
+>> isTraining - indicates whether we are training the model
+*/
+int T2TTrainer::LoadBatchLM(FILE * file,
+                            XTensor * batchEnc, XTensor * paddingEnc,
+                            XTensor * batchDec, XTensor * paddingDec,
+                            XTensor * gold,
+                            int * seqs,
+                            int vs, int sBatch, int wBatch,
+                            bool isSorted, int &wCount,
+                            int devID, XMem * mem,
+                            bool isTraining)
 {
     if(nextSeq < 0 || nextSeq >= nseqBuf)
-        LoadBuf(file, isSorted, step);
+        LoadBuf(file, isSorted, 1);
     int seq = MAX(nextSeq, 0);
     int wc = 0;
@@ -604,7 +679,8 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
         if(max < wn)
             max = wn;
-        if(sc >= sBatch && wc >= wBatch)
+        int tc = isSmallBatch ? max * sc : wc;
+        if(sc >= sBatch && tc >= wBatch)
             break;
     }
@@ -614,37 +690,32 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
     if(sc <= 0)
         return 0;
-    if(isLM){
     int dims[MAX_TENSOR_DIM_NUM];
     dims[0] = sc;
     dims[1] = max;
     dims[2] = vs;
-    InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
-    InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
-    InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
-    if(batch->grad == NULL)
-        XNoder::MakeGrad(batch);
-    else
-        InitTensor(batch->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
-    if(padding->grad == NULL)
-        XNoder::MakeGrad(padding);
-    else
-        InitTensor2D(padding->grad, sc, max, X_FLOAT, devID, mem);
-    if(output->grad == NULL)
-        XNoder::MakeGrad(output);
-    else
-        InitTensor(output->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
-    batch->SetZeroAll();
-    padding->SetZeroAll();
-    output->SetZeroAll();
-    batch->grad->SetZeroAll();
-    padding->grad->SetZeroAll();
-    output->grad->SetZeroAll();
+    InitTensor(batchEnc, 2, dims, X_INT, 1.0F, -1);
+    //InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
+    InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
+    batchEnc->SetZeroAll();
+    paddingEnc->SetZeroAll();
+    gold->SetZeroAll();
+    paddingDec->SetZeroAll();
+    if(isTraining) {
+        //XNoder::MakeGrad(batchEnc);
+        XNoder::MakeGrad(paddingEnc);
+        XNoder::MakeGrad(gold);
+        XNoder::MakeGrad(paddingDec);
+        //batchEnc->grad->SetZeroAll();
+        paddingEnc->grad->SetZeroAll();
+        gold->grad->SetZeroAll();
+        paddingDec->grad->SetZeroAll();
+    }
     int seqSize = 0;
@@ -655,16 +726,20 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
         int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
         CheckNTErrors(len <= max, "Something is wrong!");
         for(int w = 0; w < len; w++){
-            batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
-            padding->Set2D(1.0F, s - seq, w);
-            if(w > 0)
-                output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
-            if(w == len - 1){
-                if(isDoubledEnd)
-                    output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
-                else
-                    output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
-            }
+            batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
+            //batchEnc->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+            paddingEnc->Set2D(1.0F, s - seq, w);
+            paddingDec->Set2D(1.0F, s - seq, w);
+            if (w > 0)
+                gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
+            if (w == len - 1) {
+                if (isDoubledEnd)
+                    gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+                else
+                    gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
+            }
             wCount++;
             /*fprintf(tf, "%d", buf[seqOffset[s] + w]);
             if(w < seqLen[s] - 1)
@@ -682,6 +757,138 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
     }
     fflush(tf);
+    return sc;
+}
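The gold tensor for the LM batch is the input shifted left by one position: gold[s][w-1] receives the token at input position w, and the final position receives either the doubled </s> or the next buffered token. A plain sketch of the same shift (illustrative names):

    #include <cstddef>
    #include <vector>

    /* sketch: LM targets are the inputs shifted by one time step */
    std::vector<int> MakeLMTargets(const std::vector<int> &tokens, int eosId)
    {
        if (tokens.empty())
            return {};
        std::vector<int> gold(tokens.size());
        for (std::size_t w = 1; w < tokens.size(); w++)
            gold[w - 1] = tokens[w];
        gold[tokens.size() - 1] = eosId;  /* last target: end-of-sequence */
        return gold;
    }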
+/*
+load a batch of sequences (for MT)
+>> file - the handle to the data file
+>> batchEnc - the batch of the input sequences
+>> paddingEnc - padding of the input sequences
+>> batchDec - the batch of the output sequences
+>> paddingDec - padding of the output sequences
+>> gold - gold standard
+>> seqs - keep the sequences in an array
+>> vsEnc - size of the encoder vocabulary
+>> vsDec - size of the decoder vocabulary
+>> sBatch - batch size of sequences
+>> wBatch - batch size of words
+>> isSorted - indicates whether the sequences are sorted by length
+>> wCount - word count
+>> devID - device id
+>> mem - memory pool
+>> isTraining - indicates whether we are training the model
+*/
+int T2TTrainer::LoadBatchMT(FILE * file,
+                            XTensor * batchEnc, XTensor * paddingEnc,
+                            XTensor * batchDec, XTensor * paddingDec,
+                            XTensor * gold,
+                            int * seqs,
+                            int vsEnc, int vsDec, int sBatch, int wBatch,
+                            bool isSorted, int &wCount,
+                            int devID, XMem * mem,
+                            bool isTraining)
+{
+    if(nextSeq < 0 || nextSeq >= nseqBuf)
+        LoadBuf(file, isSorted, 2);
+    int seq = MAX(nextSeq, 0);
+    int wcEnc = 0;
+    int wcDec = 0;
+    int wnEnc = 0;
+    int wnDec = 0;
+    int maxEnc = 0;
+    int maxDec = 0;
+    int sc = 0;
+    CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");
+    while(seq + sc < nseqBuf){
+        /* source-side sequence */
+        wnEnc = seqLen[seq + sc];
+        wcEnc += wnEnc;
+        sc += 1;
+        if(maxEnc < wnEnc)
+            maxEnc = wnEnc;
+        /* target-side sequence */
+        int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
+        wnDec = len;
+        wcDec += wnDec;
+        sc += 1;
+        if(maxDec < wnDec)
+            maxDec = wnDec;
+        int tc = isSmallBatch ? maxEnc * sc / 2 : wcEnc;
+        if(sc >= sBatch * 2 && tc >= wBatch)
+            break;
+    }
+    nextSeq = seq + sc;
+    if(sc <= 0)
+        return 0;
+    int sCount = sc/2;
+    int seqSize = 0;
+    int dimsEnc[3] = {sCount, maxEnc, vsEnc};
+    int dimsDec[3] = {sCount, maxDec, vsDec};
+    InitTensor(batchEnc, 3, dimsEnc, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
+    InitTensor(batchDec, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
+    InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
+    batchEnc->SetZeroAll();
+    paddingEnc->SetZeroAll();
+    batchDec->SetZeroAll();
+    paddingDec->SetZeroAll();
+    gold->SetZeroAll();
+    wCount = 0;
+    /* batch of the source-side sequences */
+    for(int s = seq; s < seq + sc; s += 2){
+        int len = seqLen[s];
+        int sent = (s - seq)/2;
+        for(int w = 0; w < len; w++){
+            batchEnc->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
+            paddingEnc->Set2D(1.0F, sent, w);
+            wCount++;
+        }
+    }
+    /* batch of the target-side sequences */
+    for(int s = seq + 1; s < seq + sc; s += 2){
+        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
+        CheckNTErrors(len <= maxDec, "Something is wrong!");
+        int sent = (s - seq - 1)/2;
+        for(int w = 0; w < len; w++){
+            paddingDec->Set2D(1.0F, sent, w);
+            batchDec->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
+            if(w > 0)
+                gold->Set3D(1.0F, sent, w - 1, buf[seqOffset[s] + w]);
+            if (w == len - 1) {
+                if(isDoubledEnd)
+                    gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
+                else
+                    gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w + 1]);
+            }
+            wCount++;
+            if(seqs != NULL)
+                seqs[seqSize++] = buf[seqOffset[s] + w];
+        }
+        if(seqs != NULL){
+            for(int w = len; w < maxDec; w++)
+                seqs[seqSize++] = -1;
+        }
+    }
 }
 return sc;
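LoadBatchMT assumes the buffer interleaves the two languages, which is why LoadBuf is called with step = 2, the sequence count is checked to be even, and both copy loops advance by 2. A small sketch of the pairing it relies on (illustrative helpers, not code from the diff):

    /* sketch: sentence pair k occupies buffer slot 2k (source side)
       and slot 2k+1 (target side) */
    inline int SourceSlot(int pairIndex) { return 2 * pairIndex; }
    inline int TargetSlot(int pairIndex) { return 2 * pairIndex + 1; }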
@@ -715,8 +922,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
     XTensor probs;
     InitTensor(&probs, output);
+    XTensor logOutput;
+    InitTensor(&logOutput, output);
+    _Log(output, &logOutput);
     /* probs[i,j] = output[i,j] * gold[i,j] */
-    _Multiply(output, gold, &probs);
+    _Multiply(&logOutput, gold, &probs);
     /* probability of each word */
     XTensor wprobs;
...@@ -886,17 +1097,12 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi ...@@ -886,17 +1097,12 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
CheckNTErrors(output->order == 3, "Wrong dimension number!"); CheckNTErrors(output->order == 3, "Wrong dimension number!");
CheckNTErrors(gold->order == 3, "Wrong dimension number!"); CheckNTErrors(gold->order == 3, "Wrong dimension number!");
int num = padding->GetDim(0); DTYPE count = _ReduceSumAll(padding);
XTensor * factor = NewTensorBuf(1, &num, padding->dataType, 1.0F, padding->devID, padding->mem);
_ReduceSum(padding, factor, padding->order - 1);
_ExpMe(output); _ExpMe(output);
_DivDim(output, factor, output, 0); _ScaleAndShiftMe(output, 1/count);
_LogMe(output); _LogMe(output);
_DivDim(gold, factor, gold, 0); _ScaleAndShiftMe(gold, 1/count);
DelTensorBuf(factor);
} }
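
A quick check of the new scaling (a sketch, not part of the commit): since output holds log-probabilities, the exp → scale by 1/count → log sequence amounts to subtracting log(count) elementwise:

#include <cassert>
#include <cmath>

int main()
{
    float x = -1.5F;        /* one log-probability entry */
    float count = 37.0F;    /* stands in for _ReduceSumAll(padding) */
    float rescaled = std::log(std::exp(x) * (1.0F / count));
    assert(std::fabs(rescaled - (x - std::log(count))) < 1e-6F);
    return 0;
}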
/* /*
......
...@@ -79,6 +79,9 @@ public: ...@@ -79,6 +79,9 @@ public:
/* vocabulary size of the source side */ /* vocabulary size of the source side */
int vSize; int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */ /* learning rate */
float lrate; float lrate;
...@@ -100,6 +103,10 @@ public: ...@@ -100,6 +103,10 @@ public:
/* indicates whether we use adam */ /* indicates whether we use adam */
bool useAdam; bool useAdam;
int validStep;
int curEpoch;
/* hyper parameters of adam*/ /* hyper parameters of adam*/
float adamBeta1; float adamBeta1;
float adamBeta2; float adamBeta2;
...@@ -128,9 +135,14 @@ public: ...@@ -128,9 +135,14 @@ public:
/* number of batches on which we do model update */ /* number of batches on which we do model update */
int updateStep; int updateStep;
/* indicates whether we double the </s> symble for the output of lms */ /* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd; bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
public: public:
/* constructor */ /* constructor */
T2TTrainer(); T2TTrainer();
...@@ -142,7 +154,7 @@ public: ...@@ -142,7 +154,7 @@ public:
void Init(int argc, char ** argv); void Init(int argc, char ** argv);
/* train the model */ /* train the model */
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model); bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */ /* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model); void Test(const char * fn, const char * ofn, T2TModel * model);
...@@ -158,11 +170,34 @@ public: ...@@ -158,11 +170,34 @@ public:
/* load a batch of sequences */ /* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM, int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output, XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int * seqs,
int step, int vs, int sBatch, int wBatch, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount, bool isSorted, int &wCount,
int devID, XMem * mem); int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */ /* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile); void Shuffle(const char * srcFile, const char * tgtFile);
......
...@@ -25,6 +25,8 @@ ...@@ -25,6 +25,8 @@
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TTrainer.h" #include "T2TTrainer.h"
#include "../../tensor/XDevice.h" #include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
namespace transformer namespace transformer
{ {
...@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv) ...@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, ""); LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, ""); LoadParamString(argc, args, "output", outputFN, "");
/* learn model parameters */
if(strcmp(trainFN, "")) {
double startT = GetClockSec();
T2TTrainer trainer; T2TTrainer trainer;
trainer.Init(argc, args); trainer.Init(argc, args);
char * fn = new char[MAX_LINE_LENGTH];
char * fn1 = new char[MAX_LINE_LENGTH];
char * fn2 = new char[MAX_LINE_LENGTH];
modelFN = strcmp(modelFN, "") ? modelFN : (char *)"checkpoint.model";
int epoch;
bool isTrain;
for(epoch = 1; epoch <= trainer.nepoch; epoch++) {
sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
if(epoch == 1) {
T2TModel model; T2TModel model;
model.InitModel(argc, args);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
else {
T2TModel model;
model.InitModel(argc, args); model.InitModel(argc, args);
model.Read(fn);
/* learn model parameters */ isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
if(strcmp(trainFN, "")) model.Dump(fn1);
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model); }
if(trainer.useEpochCheckpoint && strcmp(testFN, "")) {
T2TTrainer tester;
tester.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
model.Read(fn1);
tester.Test(testFN, fn2, &model);
}
if(!isTrain)
break;
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, trainer.nepoch);
XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);
delete[] fn;
delete[] fn1;
delete[] fn2;
}
/* don't dump the final model */
/* save the final model */ /* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, "")) //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN); // model.Dump(modelFN);
T2TModel model;
model.InitModel(argc, args);
/* load the model if necessary */ /* load the model if necessary */
if(strcmp(modelFN, "")) if(strcmp(modelFN, ""))
......
...@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation) ...@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
if(!myIsForComputation && devID >= 0 && cublasHandle != NULL) if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
cublasDestroy(cublasHandle); cublasDestroy(cublasHandle);
if(myIsForComputation) if(myIsForComputation)
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle."); CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
SetDevice(devIDBackup); SetDevice(devIDBackup);
#endif #endif
...@@ -1392,7 +1393,7 @@ void XMem::CreateBLASHandle() ...@@ -1392,7 +1393,7 @@ void XMem::CreateBLASHandle()
"Cannot destroy the cublas handle."); "Cannot destroy the cublas handle.");
} }
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle."); "Cannot create the cublas handle.");
#endif #endif
} }
......
...@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i) ...@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)
/* /*
set the value of a cell set the value of a cell
>> value - value to assign to the cell >> value - value we tend to set
>> index - index of the cell for each dimension >> index - index of the cell for each dimension
>> >> size - size of the index
*/ */
bool XTensor::Set(DTYPE value, int index[], int size) bool XTensor::Set(DTYPE value, int index[], int size)
{ {
...@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size) ...@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)
/* /*
set the value of a cell in a 1d tensor set the value of a cell in a 1d tensor
>> value - value to assign to the cell >> value - value we tend to set
>> i - item offset >> i - item offset
<< return - succeeded or not
*/ */
bool XTensor::Set1D(DTYPE value, int i) bool XTensor::Set1D(DTYPE value, int i)
{ {
...@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2) ...@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
return SetToDevice(devID, GetCell(dims, 3), value); return SetToDevice(devID, GetCell(dims, 3), value);
} }
/*
set the integer value of a cell
>> value - value we tend to set
>> index - index of the cell for each dimension
>> size - size of the index
<< return - succeeded or not
*/
bool XTensor::SetInt(int value, int index[], int size)
{
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
return SetToDeviceInt(devID, GetCell(index, size), value);
}
/*
set the integer value of a cell in a 1d tensor
>> value - value we tend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1DInt(int value, int i)
{
CheckNTErrors((order == 1), "Cannot set a 1d cell for a tensor whose order is not 1!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[1] = {i};
return SetToDeviceInt(devID, GetCell(dims, 1), value);
}
/*
set the integer value of a cell in a 2d tensor
>> value - value we tend to set
>> ni - row index
>> mi - column index
<< return - succeeded or not
*/
bool XTensor::Set2DInt(int value, int ni, int mi)
{
CheckNTErrors((order == 2), "Cannot set a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[2] = {ni, mi};
return SetToDeviceInt(devID, GetCell(dims, 2), value);
}
/*
set the integer value of a cell in a 3d tensor
>> value - value we tend to set
>> d0 - index of dimension 0
>> d1 - index of dimension 1
>> d2 - index of dimension 2
<< return - succeeded or not
*/
bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
{
CheckNTErrors(order == 3, "Cannot set a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[3] = {d0, d1, d2};
return SetToDeviceInt(devID, GetCell(dims, 3), value);
}
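
A usage sketch for the new integer setters (values and shapes are illustrative; assumes the usual NewTensor defaults with X_INT data):

int dims[2] = {3, 4};
XTensor * index = NewTensor(2, dims, X_INT);   /* a 3 x 4 integer tensor */

index->SetZeroAll();
index->Set2DInt(7, 0, 1);                      /* index[0][1] = 7 */

int pos[2] = {2, 3};
index->SetInt(42, pos, 2);                     /* index[2][3] = 42 */

delete index;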
/* /*
increase the value of a cell in a 2d tensor increase the value of a cell in a 2d tensor
>> value - value we tend to set >> value - value we tend to set
...@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize, ...@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem); XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
}
if(myMem != NULL) if(myMem != NULL)
tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize); tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
else else
......
...@@ -327,6 +327,18 @@ public: ...@@ -327,6 +327,18 @@ public:
/* set the value of a cell in a 3d tensor */ /* set the value of a cell in a 3d tensor */
bool Set3D(DTYPE value, int d0, int d1, int d2); bool Set3D(DTYPE value, int d0, int d1, int d2);
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i);
/* set the integer value of a cell in a 2d tensor */
bool Set2DInt(int value, int ni, int mi);
/* set the integer value of a cell in a 3d tensor */
bool Set3DInt(int value, int d0, int d1, int d2);
/* increase the value of a cell in a 2d */ /* increase the value of a cell in a 2d */
bool Add2D(DTYPE value, int ni, int mi); bool Add2D(DTYPE value, int ni, int mi);
......
...@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value) ...@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
return true; return true;
} }
/* assign an integer number to a variable that is kept on a specified device */
bool SetToDeviceInt(int devID, void * p, int value)
{
if(p == NULL)
return false;
if(devID < 0)
*(int*)p = value;
else{
XMemCopy(p, devID, &value, -1, sizeof(int));
}
return true;
}
/* get the next number with power of 2 */ /* get the next number with power of 2 */
unsigned int GetNextPower2(unsigned int n) unsigned int GetNextPower2(unsigned int n)
{ {
......
...@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p); ...@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
extern DTYPE ToCPU(int devID, void * value); extern DTYPE ToCPU(int devID, void * value);
extern int ToCPUInt(int devID, void * value); extern int ToCPUInt(int devID, void * value);
extern bool SetToDevice(int devID, void * p, DTYPE value); extern bool SetToDevice(int devID, void * p, DTYPE value);
extern bool SetToDeviceInt(int devID, void * p, int value);
extern unsigned int GetNextPower2(unsigned int n); extern unsigned int GetNextPower2(unsigned int n);
extern void XSleep(int sleepTime); extern void XSleep(int sleepTime);
extern double GetClock(); extern double GetClock();
......
...@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain) ...@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize; fanOut = numOutputFmaps * receptiveFieldSize;
} }
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut)); DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0) * std; tensor->SetDataRand(-finfout, finfout);
_SetDataRand(tensor, -a, a); //_SetDataRand(tensor, -finfout, finfout);
} }
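
The rewritten initialization keeps the same uniform Xavier bound; the constants simply fold together:

$$ a \;=\; \sqrt{3}\cdot g\sqrt{\tfrac{2}{f_{in}+f_{out}}} \;=\; g\sqrt{\tfrac{6}{f_{in}+f_{out}}} $$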
/* /*
...@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
if(tensor == NULL) if(tensor == NULL)
return; return;
/* GPU code */ /* CPU code */
if(tensor->devID < 0){ if(tensor->devID < 0){
DTYPE variance = upper - lower; DTYPE variance = upper - lower;
......
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
#include "Gather.h" #include "Gather.h"
#include "CopyIndexed.h" #include "CopyIndexed.h"
#include "../../XUtility.h"
#include "../shape/Reshape.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize) ...@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
return result; return result;
} }
/*
gather indexed sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor (2D)
>> index - the index tensor
<< return - the result of gathering the indexed sub-tensors
*/
XTensor Gather(const XTensor &s, const XTensor &index)
{
int indexSize = index.unitNum;
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int * srcIndex = new int[index.unitNum];
if(index.dataType == X_INT) {
XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * index.unitSize);
}
else if(index.dataType == X_FLOAT || index.dataType == X_DOUBLE) {
DTYPE * tmp = new DTYPE[indexSize];
XMemCopy(tmp, -1, index.data, index.devID, indexSize * index.unitSize);
for(int i = 0; i < indexSize; i++)
srcIndex[i] = (int)tmp[i];
delete[] tmp;
}
XTensor tensor;
tensor = Gather(s, 0, srcIndex, indexSize);
delete[] srcIndex;
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = tensor.GetDim(-1);
XTensor t;
t = Reshape(tensor, index.order + 1, dims);
delete[] dims;
return t;
}
else {
return tensor;
}
}
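
A usage sketch of the new overload, in the spirit of an embedding lookup (the tensor names are illustrative, not from the commit):

/* embedding: a [vocabSize, embSize] matrix; wordIds: a [batch, length] X_INT tensor */
XTensor wordEmb = Gather(embedding, wordIds);
/* wordEmb is [batch, length, embSize]: row wordIds[b][l] of `embedding` is copied
   to wordEmb[b][l], then the reshape above restores the index tensor's shape */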
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS ...@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize); XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, const XTensor &index);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __GATHER_H__ #endif // __GATHER_H__
\ No newline at end of file
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include <math.h> #include <math.h>
#include "ReduceSum.h" #include "ReduceSum.h"
......
...@@ -105,15 +105,15 @@ void KernelReduceSum(DTYPE * input, DTYPE * output, ...@@ -105,15 +105,15 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2]; __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = threadIdx.x * blockDim.y + threadIdx.y; int idx = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; unsigned int i = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y; unsigned int j = blockIdx.x*blockDim.x + threadIdx.x;
if(i >= stride * blockNum) if(i >= stride * blockNum)
return; return;
if(threadIdx.y == 0) if(threadIdx.x == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0; bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
__syncthreads(); __syncthreads();
...@@ -121,7 +121,7 @@ void KernelReduceSum(DTYPE * input, DTYPE * output, ...@@ -121,7 +121,7 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
int iOffset = i % stride; int iOffset = i % stride;
bool isValid = (i < stride * blockNum && j < strideNum); bool isValid = (i < stride * blockNum && j < strideNum);
DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.x] : 0; DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.y] : 0;
if(power != (DTYPE)1.0){ if(power != (DTYPE)1.0){
if(power == (DTYPE)2.0) if(power == (DTYPE)2.0)
...@@ -136,21 +136,20 @@ void KernelReduceSum(DTYPE * input, DTYPE * output, ...@@ -136,21 +136,20 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
value = exp(value); value = exp(value);
/* load data into the shared mem */ /* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value; iData[threadIdx.y * blockDim.x + threadIdx.x] = value;
__syncthreads(); __syncthreads();
/* do reduction in shared mem */ /* do reduction in shared mem */
for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){ for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
if (threadIdx.y < s) if (threadIdx.x < s)
iData[idx] += iData[idx + s]; iData[idx] += iData[idx + s];
__syncthreads(); __syncthreads();
} }
/* write result for this block to the output array */ /* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) if (threadIdx.x == 0 && blockIdx.x < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y]; output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = iData[threadIdx.y * blockDim.x];
} }
/* /*
...@@ -282,15 +281,15 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -282,15 +281,15 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
unsigned int tid = threadIdx.y; unsigned int tid = threadIdx.x;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y; unsigned int j = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; unsigned int i = blockIdx.y * blockDim.y + threadIdx.y;
if(i >= stride * blockNum) if(i >= stride * blockNum)
return; return;
if (threadIdx.y == 0) if (threadIdx.x == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0; bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
__syncthreads(); __syncthreads();
...@@ -299,17 +298,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -299,17 +298,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
int iOffset = i % stride; int iOffset = i % stride;
bool isValid = j < strideNum; bool isValid = j < strideNum;
bool isValid2 = j + blockDim.y < strideNum; bool isValid2 = j + blockDim.x < strideNum;
DTYPE * data = iData + threadIdx.x * blockDim.y; DTYPE * data = iData + threadIdx.y * blockDim.x;
DTYPE * inputData = input + k * blockSize; DTYPE * inputData = input + k * blockSize;
DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.x]: 0; DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.y]: 0;
DTYPE value2 = isValid2 ? inputData[(j + blockDim.y) * stride + iOffset] - bias[threadIdx.x]: 0; DTYPE value2 = isValid2 ? inputData[(j + blockDim.x) * stride + iOffset] - bias[threadIdx.y]: 0;
if(power != (DTYPE)1.0){ if(power != (DTYPE)1.0){
if(power == (DTYPE)2.0){ if(power == (DTYPE)2.0){
value = value * value; value = value * value;
value2 = value2 *value2; value2 = value2 * value2;
} }
else if(power == (DTYPE)0.5){ else if(power == (DTYPE)0.5){
value = sqrt(value); value = sqrt(value);
...@@ -329,17 +328,25 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -329,17 +328,25 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
} }
value = value + value2; value = value + value2;
__syncthreads(); __syncthreads();
value = shflDownReduceSum(value); value = shflDownReduceSum(value);
if ((tid & 0x1f) == 0) { data[tid / 32] = value; } if ((tid & 0x1f) == 0)
data[tid / 32] = value;
__syncthreads(); __syncthreads();
if (tid < 32){ if (tid < 32){
if (tid < blockDim.y / 32) if (tid < blockDim.x / 32)
value = data[tid]; value = data[tid];
else value = 0; else
value = 0;
value = shflDownReduceSum(value); value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value; if (tid == 0 && blockIdx.x < reducedStrideNum) {
output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = value;
}
} }
} }
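
For context, the warp-level trick the kernel relies on; a minimal standalone sketch (assumes CUDA 9+ for __shfl_down_sync; the codebase's shflDownReduceSum plays this role):

__device__ float WarpReduceSum(float value)
{
    /* each step halves the number of active lanes; after log2(32) = 5
       halving steps, lane 0 holds the sum over all 32 lanes of the warp */
    for (int offset = 16; offset > 0; offset >>= 1)
        value += __shfl_down_sync(0xffffffff, value, offset);
    return value;
}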
...@@ -568,7 +575,8 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output, ...@@ -568,7 +575,8 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output,
if (tid < 32){ if (tid < 32){
if (tid < blockDim.y / 32) if (tid < blockDim.y / 32)
threadSum = data[tid]; threadSum = data[tid];
else threadSum = 0; else
threadSum = 0;
threadSum = shflDownReduceSum(threadSum); threadSum = shflDownReduceSum(threadSum);
if (tid == 0 && blockIdx.y < reducedStrideNum) if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum; output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
...@@ -640,29 +648,28 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long ...@@ -640,29 +648,28 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
/* /*
in this situation we use block.x * grid.x to handle one vector for continuous reads in this situation we use block.x * grid.x to handle one vector for continuous reads
*/ */
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum) void discontinuousStorageNoShareMemThreadAllocation(dim3* grid, dim3* block, int stride, int blockNum)
{ {
block.x = 512; block->x = 512;
block.y = 1; block->y = 1;
if ((stride * blockNum) % 512 == 0) if ((stride * blockNum) % 512 == 0)
grid.x = (stride * blockNum) / 512; grid->x = (stride * blockNum) / 512;
else else
grid.x = (stride * blockNum) / 512 + 1; grid->x = (stride * blockNum) / 512 + 1;
grid.y = 1; grid->y = 1;
} }
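
The branch above is a ceiling division in disguise; an equivalent one-liner (a readability note, not a semantic change):

grid->x = (stride * blockNum + 511) / 512;   /* = ceil((stride * blockNum) / 512.0) */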
/* /*
adjust the threads.x number so that we can use warp optimization adjust the threads.x number so that we can use warp optimization
*/ */
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads) void adjustThreadForUseWarpOptimization(dim3* blocks, dim3* threads)
{ {
if (threads.x > 1){ if (threads->y > 1){
blocks.x *= threads.x; blocks->y *= threads->y;
threads.x = 1; threads->y = 1;
} }
if (threads.y < 32) if (threads->x < 32)
threads.y = 32; threads->x = 32;
} }
/* /*
...@@ -733,19 +740,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -733,19 +740,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks; dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum); continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128) if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp); KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else { else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1; if (blockNum % 4 != 0)
else blockNum = blockNum / 4; blockNum = (int)(blockNum / 4) + 1;
KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp); else
blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data,
strideNum, blockNum, sp, power, isExp);
} }
} }
else if (stride != 1 && stride * blockNum > 4096){ else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize); //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data; //unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride); //convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block; dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum); discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, blockNum,sp, power, isExp); strideNum, blockNum,sp, power, isExp);
} }
...@@ -769,50 +780,50 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -769,50 +780,50 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
/* unroll the reduction procedure. The code is messy but it is faster. */ /* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum <= 32) { if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
else if (strideNum < 128) { else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!"); CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads); adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
else if (strideNum < 256) { else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!"); CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads); adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
else if (strideNum < 512) { else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!"); CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads); adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
else { else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1) if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data; oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!"); CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads); adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp); blockSize, blockNum, sp, power, isExp);
} }
} }
......
...@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!) ...@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
>> source - the input tensor >> source - the input tensor
<< return - the total summation << return - the total summation
*/ */
DTYPE _ReduceSumAll(XTensor * source) DTYPE _ReduceSumAll(const XTensor * source)
{ {
int order = source->order; int order = source->order;
DTYPE summation; DTYPE summation;
XTensor * big = NewTensor(source); XTensor * big = NewTensor(source);
_CopyValues(source, big); _CopyValues(source, big);
for(int i = 0; i < order; i++) { for(int i = order - 1; i >= 0; i--) {
if(i == 0)
if(i == order - 1) big->Reshape(1, big->unitNum);
big->Reshape(big->unitNum, 1);
int leadingDim = big->order - 1;
int * dimSize; int * dimSize;
dimSize = getDimSize(big, 0); dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem); XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
_ReduceSum(big, little, 0); _ReduceSum(big, little, leadingDim);
delete big; delete big;
delete dimSize; delete dimSize;
...@@ -81,7 +82,7 @@ sum all the items of the tensor ...@@ -81,7 +82,7 @@ sum all the items of the tensor
>> source - the input tensor >> source - the input tensor
<< return - the total summation << return - the total summation
*/ */
DTYPE ReduceSumAll(XTensor & source) DTYPE ReduceSumAll(const XTensor & source)
{ {
return _ReduceSumAll(&source); return _ReduceSumAll(&source);
} }
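
As a reference point for testing, the routine is equivalent to a flat sum over the underlying buffer (a host-side sketch, assuming contiguous dense DTYPE data):

DTYPE NaiveSumAll(const DTYPE * data, int unitNum)
{
    DTYPE sum = 0;
    for(int i = 0; i < unitNum; i++)
        sum += data[i];
    return sum;
}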
......
...@@ -28,10 +28,10 @@ ...@@ -28,10 +28,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */ /* sum all the items of the tensor */
DTYPE _ReduceSumAll(XTensor * source); DTYPE _ReduceSumAll(const XTensor * source);
/* sum all the items of the tensor */ /* sum all the items of the tensor */
DTYPE ReduceSumAll(XTensor & source); DTYPE ReduceSumAll(const XTensor & source);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold, ...@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n]; int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold), CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!"); "The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!"); CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss), "The loss tensor and padding tensor must be of the same shape!"); CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be of the same shape!");
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!"); CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem); XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem); XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
/* l = log(output) */ _Log(output, interBuf1);
_Log(output, logBuf); _Multiply(gold, interBuf1, interBuf2);
if(weight != NULL){ if(weight != NULL)
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem); _MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */ _ReduceSum(interBuf2, loss, n);
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
}
/* negate result n = negate(mul) */ if(padding != NULL)
_NegateMe(mulBuf); _MultiplyMe(loss, padding);
_ReduceSum(mulBuf, loss, n); DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
DelTensorBuf(mulBuf);
DelTensorBuf(logBuf);
} }
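
Spelled out, the rewritten routine computes, for every position i along the non-class dimensions (with g the gold distribution, y the output, optional class weights w, and optional padding mask p):

$$ \mathrm{loss}_i \;=\; p_i \cdot \Big( -\sum_{k} w_k \, g_{i,k} \log y_{i,k} \Big) $$

with w_k = 1 when no weight tensor is given and p_i = 1 when no padding is given.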
/* /*
...@@ -109,19 +96,12 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -109,19 +96,12 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight, XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int order = output->order; int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n); int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order, CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!"); "Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(output, gold), CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!"); "The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize, CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
...@@ -133,6 +113,22 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -133,6 +113,22 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!"); "TODO!");
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i - 1)), "Unmatched tensors!");
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1; int blockNum = 1;
int blockSize = 1; int blockSize = 1;
int stride = 1; int stride = 1;
...@@ -148,31 +144,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -148,31 +144,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * lossData = (DTYPE*)loss->data; DTYPE * lossData = (DTYPE*)loss->data;
DTYPE tmpLoss; DTYPE tmpLoss;
int lossPos;
int goldPos;
if(weight == NULL) { if(weight == NULL) {
if(padding == NULL) { if(padding == NULL) {
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
tmpLoss = 0; tmpLoss = 0;
for(int j = 0; j < blockSize; j++) lossPos = i * stride + j;
tmpLoss += -(*(goldData + beg + j)) * for(int k = 0; k < leadingDimSize; k++) {
(DTYPE)log(*(outputData + beg + j)); goldPos = i * blockSize + j + k * stride;
*(lossData + i) = tmpLoss; tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + i) == 0) if(*(paddingData + lossPos) == 0)
*(lossData + i) = 0; *(lossData + lossPos) = 0;
else{ else {
tmpLoss = 0; tmpLoss = 0;
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
tmpLoss += -(*(goldData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(DTYPE)log(*(outputData + beg + j)); tmpLoss += -(*(goldData + goldPos)) *
*(lossData + i) = tmpLoss; (DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
} }
} }
} }
...@@ -181,54 +186,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -181,54 +186,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * weightData = (DTYPE*)weight->data; DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) { if(padding == NULL) {
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
tmpLoss = 0; tmpLoss = 0;
for(int j = 0; j < blockSize; j++) lossPos = i * stride + j;
tmpLoss += -(*(goldData + beg + j)) * for(int k = 0; k < leadingDimSize; k++) {
(DTYPE)log(*(outputData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(*(weightData + j)); tmpLoss += -(*(goldData + goldPos)) *
*(lossData + i) = tmpLoss; (DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + i) == 0) if(*(paddingData + lossPos) == 0)
*(lossData + i) = 0; *(lossData + lossPos) = 0;
else{ else {
tmpLoss = 0; tmpLoss = 0;
for(int j = 0; j < blockSize; j++) for(int k = 0; k < leadingDimSize; k++) {
tmpLoss += -(*(goldData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(DTYPE)log(*(outputData + beg + j)) * tmpLoss += -(*(goldData + goldPos)) *
(*(weightData + j)); (DTYPE)log(*(outputData + goldPos)) *
*(lossData + i) = tmpLoss; (*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
} }
} }
} }
} }
}
/*
get the dimSize after reduce operation
>> tensor - a tensor to be reduced
>> n - the reduce dimension
<< return - the pointer of dimSize
*/
int * reduceDimSize(const XTensor * tensor, int n)
{
int order = tensor->order;
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = tensor->dimSize[i];
else if(i > n)
dimSize[i - 1] = tensor->dimSize[i];
} }
return dimSize;
} }
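
The index arithmetic used throughout the rewritten CPU loops follows the row-major layout: with the class dimension n in the middle, blockSize = leadingDimSize * stride, and element (block i, class k, inner offset j) sits at i * blockSize + k * stride + j. A toy walk-through (plain C++; the shape is illustrative):

/* toy shape [2, 3, 4] with the class dimension n = 1:
   blockNum = 2, leadingDimSize = 3, stride = 4, blockSize = 12 */
int blockNum = 2, leadingDimSize = 3, stride = 4;
int blockSize = leadingDimSize * stride;

for (int i = 0; i < blockNum; i++)
    for (int j = 0; j < stride; j++)
        for (int k = 0; k < leadingDimSize; k++) {
            int goldPos = i * blockSize + j + k * stride;
            /* goldPos enumerates 0..23, visiting each element exactly once */
        }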
/* /*
...@@ -247,73 +238,45 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold, ...@@ -247,73 +238,45 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight, LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
int n = leadingDim < 0 ? output->order - 1 : leadingDim; DTYPE loss = 0;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int unitNum = output->dimSize[n]; int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold), CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!"); "The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!"); CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1, "The loss tensor and padding tensor must be of the same shape!"); CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem); int * dimSize = new int[order - 1];
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem); for (int i = 0; i < order; i++) {
if(i < n)
/* l = log(output) */ dimSize[i] = output->dimSize[i];
_Log(output, logBuf); else if(i > n)
dimSize[i - 1] = output->dimSize[i];
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
} }
/* negate multiply result n = negate(mul) */ XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
_NegateMe(mulBuf); output->devID, output->mem);
int * dimSize; _CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
dimSize = reduceDimSize(output, n);
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
/* reduce sum all classes */ loss = _ReduceSumAll(lossBuf);
_ReduceSum(mulBuf, lossInter, n);
DelTensorBuf(mulBuf);
DelTensorBuf(logBuf);
DTYPE loss;
/* compute the total loss */
if(padding != NULL) {
XTensor * temp = NewTensor(lossInter);
_Multiply(lossInter, padding, temp);
loss = _ReduceSumAll(temp);
delete temp;
}
else
loss = _ReduceSumAll(lossInter);
if(reduceWay == REDUCE_MEAN) { if(reduceWay == REDUCE_MEAN) {
int nonZeroNum; int nonZeroNum;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = lossInter->unitNum; nonZeroNum = lossBuf->unitNum;
} }
else { else {
XTensor * tmp = NewTensor(padding); XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp); nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp; DelTensorBuf(tmp);
} }
loss = loss / (DTYPE)nonZeroNum; loss = loss / (DTYPE)nonZeroNum;
...@@ -326,7 +289,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold, ...@@ -326,7 +289,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
} }
delete[] dimSize; delete[] dimSize;
delete lossInter; DelTensorBuf(lossBuf);
return loss; return loss;
} }
...@@ -349,11 +312,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -349,11 +312,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight, LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim) const XTensor * padding, int leadingDim)
{ {
#ifdef USE_CUDA DTYPE loss = 0;
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
int order = output->order; int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
...@@ -370,6 +329,23 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -370,6 +329,23 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!"); "TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
int blockNum = 1; int blockNum = 1;
int blockSize = 1; int blockSize = 1;
int stride = 1; int stride = 1;
...@@ -383,32 +359,40 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -383,32 +359,40 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * outputData = (DTYPE*)output->data; DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data; DTYPE * goldData = (DTYPE*)gold->data;
DTYPE loss = 0; int paddingPos;
int goldPos;
int nonZeroNum = 0; int nonZeroNum = 0;
if(weight == NULL) { if(weight == NULL) {
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = blockNum; nonZeroNum = blockNum * stride;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++) for(int i = 0; i < blockNum; i++) {
loss += -(*(goldData + beg + j)) * for(int j = 0; j < stride; j++) {
(DTYPE)log(*(outputData + beg + j)); paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0) for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
if(*(paddingData + paddingPos) == 0)
continue; continue;
else{ else {
nonZeroNum += 1; nonZeroNum += 1;
for(int k = 0; k < leadingDimSize; k++) {
int beg = i * blockSize; goldPos = i * blockSize + j + k * stride;
for(int j = 0; j < blockSize; j++) loss += -(*(goldData + goldPos)) *
loss += -(*(goldData + beg + j)) * (DTYPE)log(*(outputData + goldPos));
(DTYPE)log(*(outputData + beg + j)); }
}
} }
} }
} }
...@@ -416,32 +400,39 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -416,32 +400,39 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
else { else {
DTYPE * weightData = (DTYPE*)weight->data; DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = blockNum; nonZeroNum = blockNum * stride;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize; for(int j = 0; j < stride; j++) {
for(int j = 0; j < blockSize; j++) paddingPos = i * stride + j;
loss += -(*(goldData + beg + j)) * for(int k = 0; k < leadingDimSize; k++) {
(DTYPE)log(*(outputData + beg + j)) * goldPos = i * blockSize + j + k * stride;
(*(weightData + j)); loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
}
} }
} }
else { else {
DTYPE * paddingData = (DTYPE*)padding->data; DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) { for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0) for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
if(*(paddingData + paddingPos) == 0)
continue; continue;
else{ else {
nonZeroNum += 1; nonZeroNum += 1;
for(int k = 0; k < leadingDimSize; k++) {
int beg = i * blockSize; goldPos = i * blockSize + j + k * stride;
for(int j = 0; j < blockSize; j++) loss += -(*(goldData + goldPos)) *
loss += -(*(goldData + beg + j)) * (DTYPE)log(*(outputData + goldPos)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j)); (*(weightData + k));
} }
} }
} }
} }
}
}
if(reduceWay == REDUCE_MEAN) { if(reduceWay == REDUCE_MEAN) {
loss = loss / (DTYPE)nonZeroNum; loss = loss / (DTYPE)nonZeroNum;
...@@ -471,17 +462,10 @@ with respect to the gold standard, and y is the model output ...@@ -471,17 +462,10 @@ with respect to the gold standard, and y is the model output
>> padding - specify a target value that is ignored and does not contribute to the loss computation >> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output >> leadingDim - the leading dimension for the output
*/ */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold, void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * weight, const XTensor * padding, const XTensor * gold, const XTensor * weight,
int leadingDim) XTensor * padding, int leadingDim)
{ {
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int order = output->order; int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim; int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n); int leadingDimSize = output->GetDim(n);
...@@ -498,6 +482,25 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor ...@@ -498,6 +482,25 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!"); "TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1; int blockNum = 1;
int blockSize = 1; int blockSize = 1;
int stride = 1; int stride = 1;
...@@ -512,25 +515,35 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor ...@@ -512,25 +515,35 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
DTYPE * outputData = (DTYPE*)output->data; DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data; DTYPE * goldData = (DTYPE*)gold->data;
int paddingPos;
int goldPos;
    if(weight == NULL) {
        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
                for(int j = 0; j < stride; j++) {
                    for(int k = 0; k < leadingDimSize; k++) {
                        goldPos = i * blockSize + j + k * stride;
                        *(dedyData + goldPos) = -(*(goldData + goldPos)) /
                                                 (*(outputData + goldPos));
                    }
                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
                for(int j = 0; j < stride; j++) {
                    paddingPos = i * stride + j;
                    for(int k = 0; k < leadingDimSize; k++) {
                        goldPos = i * blockSize + j + k * stride;
                        if(*(paddingData + paddingPos) == 0)
                            *(dedyData + goldPos) = 0;
                        else
                            *(dedyData + goldPos) = -(*(goldData + goldPos)) /
                                                     (*(outputData + goldPos));
                    }
                }
            }
        }
    }
...@@ -538,39 +551,45 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
        DTYPE * weightData = (DTYPE*)weight->data;

        if(padding == NULL) {
            for(int i = 0; i < blockNum; i++) {
                for(int j = 0; j < stride; j++) {
                    for(int k = 0; k < leadingDimSize; k++) {
                        goldPos = i * blockSize + j + k * stride;
                        *(dedyData + goldPos) = -(*(weightData + k)) *
                                                 (*(goldData + goldPos)) /
                                                 (*(outputData + goldPos));
                    }
                }
            }
        }
        else {
            DTYPE * paddingData = (DTYPE*)padding->data;
            for(int i = 0; i < blockNum; i++) {
                for(int j = 0; j < stride; j++) {
                    paddingPos = i * stride + j;
                    for(int k = 0; k < leadingDimSize; k++) {
                        goldPos = i * blockSize + j + k * stride;
                        if(*(paddingData + paddingPos) == 0)
                            *(dedyData + goldPos) = 0;
                        else
                            *(dedyData + goldPos) = -(*(weightData + k)) *
                                                     (*(goldData + goldPos)) /
                                                     (*(outputData + goldPos));
                    }
                }
            }
        }
    }
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
}

} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
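For reference, a minimal standalone sketch of the index arithmetic the rewritten CPU loops use (not part of the commit; the shape constants below are invented for illustration): a tensor is processed as blockNum blocks of blockSize = leadingDimSize * stride entries, element (i, j, k) sits at goldPos = i * blockSize + j + k * stride, and the padding mask holds one value per (block, stride) pair at paddingPos = i * stride + j.

#include <cstdio>

int main()
{
    /* hypothetical shape: 2 blocks, leading dimension of 3, stride of 2 */
    const int blockNum = 2, leadingDimSize = 3, stride = 2;
    const int blockSize = leadingDimSize * stride;

    for(int i = 0; i < blockNum; i++)
        for(int j = 0; j < stride; j++)
            for(int k = 0; k < leadingDimSize; k++)
                printf("i=%d j=%d k=%d -> goldPos=%d paddingPos=%d\n",
                       i, j, k, i * blockSize + j + k * stride, i * stride + j);

    return 0;
}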
...@@ -26,80 +26,20 @@
#include "../XDevice.h"
#include "CrossEntropy.cuh"
#include "CrossEntropy.h"
#include "../core/arithmetic/Div.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
#include "../core/shape/Transpose.h"
#include "../core/shape/Unsqueeze.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)

/*
compute the cross entropy loss (cuda kernel)
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> lossData - the data pointer of loss tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> stride - the size of a data block
*/
__global__
void KernelCrossEntropy(DTYPE * outputData, DTYPE * goldData,
DTYPE * lossData, DTYPE * weightData,
DTYPE * paddingData, int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
DTYPE tmpLoss = 0;
if(weightData == NULL) {
if(paddingData == NULL) {
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
}
}
else {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
}
}
}
/*
compute the cross entropy loss (cuda version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
...@@ -115,76 +55,24 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
                           XTensor * loss, const XTensor * weight,
                           const XTensor * padding, int leadingDim)
{
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;

    XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
    XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);

    _Log(output, interBuf1);
    _Multiply(gold, interBuf1, interBuf2);

    if(weight != NULL)
        _MultiplyDimMe(interBuf2, weight, n);

    _NegateMe(interBuf2);
    _ReduceSum(interBuf2, loss, n);

    if(padding != NULL)
        _MultiplyMe(loss, padding);

    DelTensorBuf(interBuf2);
    DelTensorBuf(interBuf1);
}
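As a sanity check on the tensor-op pipeline above, here is a scalar reference (a sketch only; the helper name and shapes are invented) for what one position p of the loss tensor receives: loss[p] = padding[p] * sum_k ( -weight[k] * gold[p][k] * log(output[p][k]) ), with weight treated as all ones when NULL and padding as 1 when absent.

#include <cmath>
#include <cstdio>

/* hypothetical reference helper, for illustration only */
float RefCrossEntropyAt(const float * output, const float * gold,
                        const float * weight, float paddingVal, int classNum)
{
    float loss = 0.0F;
    for(int k = 0; k < classNum; k++)
        loss += -(weight != NULL ? weight[k] : 1.0F) * gold[k] * std::log(output[k]);
    return paddingVal * loss;  /* a zero padding value masks the position out */
}

int main()
{
    const float output[2] = {0.9F, 0.1F};
    const float gold[2]   = {1.0F, 0.0F};
    printf("loss = %f\n", RefCrossEntropyAt(output, gold, NULL, 1.0F, 2));
    return 0;
}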
/*
...@@ -230,87 +118,38 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
        dimSize[i - 1] = output->dimSize[i];
    }
    XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
                                     output->devID, output->mem);

    _CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);

    loss = _ReduceSumAll(lossBuf);

    if(reduceWay == REDUCE_MEAN) {
        int nonZeroNum;
        if(padding == NULL) {
            nonZeroNum = lossBuf->unitNum;
        }
        else {
            XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
            _IsNonZero(padding, tmp);
            nonZeroNum = (int)_ReduceSumAll(tmp);
            DelTensorBuf(tmp);
        }

        loss = loss / (DTYPE)nonZeroNum;
    }
    else if(reduceWay == REDUCE_SUM) {
        /* don't need to do anything */
    }
    else {
        ShowNTErrors("TODO");
    }

    delete[] dimSize;
    DelTensorBuf(lossBuf);

    return loss;
}

/*
backward computation of cross entropy function (kernel version)
>> dedyData - the data pointer of dedy tensor
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> blockSize - the size of a data block
*/
__global__
void KernelCrossEntropyBackward(DTYPE * dedyData, DTYPE * outputData, DTYPE * goldData,
                                DTYPE * weightData, DTYPE * paddingData,
                                int blockNum, int blockSize)
{
    /* block id */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i >= blockNum)
        return;

    int beg = i * blockSize;

    if(weightData == NULL) {
        if(paddingData == NULL) {
            for(int j = 0; j < blockSize; j++)
                *(dedyData + beg + j) = -(*(goldData + beg + j)) /
                                         (*(outputData + beg + j));
        }
        else {
            if(*(paddingData + i) == 0)
                memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
            else
                for(int j = 0; j < blockSize; j++)
                    *(dedyData + beg + j) = -(*(goldData + beg + j)) /
                                             (*(outputData + beg + j));
        }
    }
    else {
        if(paddingData == NULL) {
            for(int j = 0; j < blockSize; j++)
                *(dedyData + beg + j) = -(*(weightData + j)) *
                                         (*(goldData + beg + j)) /
                                         (*(outputData + beg + j));
        }
        else {
            if(*(paddingData + i) == 0)
                memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
            else
                for(int j = 0; j < blockSize; j++) {
                    *(dedyData + beg + j) = -(*(weightData + j)) *
                                             (*(goldData + beg + j)) /
                                             (*(outputData + beg + j));
                }
        }
    }
}
/*
...@@ -330,85 +169,43 @@ with respect to gold standard, and y is the model output
*/
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
                               const XTensor * gold, const XTensor * weight,
                               XTensor * padding, int leadingDim)
{
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;

    _Div(gold, output, dedy);
    _NegateMe(dedy);

    if(weight != NULL)
        _MultiplyDimMe(dedy, weight, n);

    if(padding != NULL) {
        int paddingOrder = padding->order;
        int * paddingDims = new int[paddingOrder];
        memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
        padding->Reshape(padding->unitNum);

        int order = dedy->order;
        int * dims = new int[order];
        memcpy(dims, dedy->dimSize, dedy->order * sizeof(int));
        dedy->Reshape(dedy->unitNum/dedy->GetDim(n), dedy->GetDim(n));

        _MultiplyDimMe(dedy, padding, 0);

        padding->Reshape(paddingOrder, paddingDims);
        dedy->Reshape(order, dims);

        delete[] paddingDims;
        delete[] dims;
    }
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
}
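The reshape trick above is worth spelling out, since the same pattern is reused in _CudaLogSoftmaxBackward and _CudaSoftmaxBackward below. With, say, dedy of shape (batch, len, vocab), n = 2 and padding of shape (batch, len) (shapes assumed for illustration), dedy is viewed as a (batch*len, vocab) matrix and padding as a flat vector of batch*len entries, so _MultiplyDimMe(dedy, padding, 0) broadcasts the 0/1 mask over the class dimension and zeroes all gradients of a padded position in one pass. A scalar sketch of the effect:

#include <cstdio>

int main()
{
    const int rows = 4, vocab = 3;            /* rows = batch * len (invented) */
    float dedy[rows][vocab] = {{1, 1, 1}, {2, 2, 2}, {3, 3, 3}, {4, 4, 4}};
    const float mask[rows] = {1, 0, 1, 0};    /* flattened padding */

    /* what _MultiplyDimMe(dedy, padding, 0) amounts to after the reshape */
    for(int r = 0; r < rows; r++)
        for(int c = 0; c < vocab; c++)
            dedy[r][c] *= mask[r];

    for(int r = 0; r < rows; r++)
        printf("%g %g %g\n", dedy[r][0], dedy[r][1], dedy[r][2]);

    return 0;
}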
......
...@@ -40,7 +40,7 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
/* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
                               const XTensor * gold, const XTensor * weight = NULL,
                               XTensor * padding = NULL, int leadingDim = -1);

} // namespace nts(NiuTrans.Tensor)
......
...@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
                        const XTensor * padding = NULL, int leadingDim = -1);

/* backward computation of cross entropy function */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
                           const XTensor * gold, const XTensor * weight = NULL,
                           XTensor * padding = NULL, int leadingDim = -1);

} // namespace nts(NiuTrans.Tensor)
......
...@@ -280,7 +280,7 @@ better numerical stability.
*/
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                         XTensor * dedy, XTensor * dedx,
                         XTensor * padding, int leadDim,
                         LOSS_FUNCTION_NAME lossName)
{
    CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
...@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
    if (gold->devID >= 0) {
        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
#endif
......
...@@ -22,6 +22,7 @@
#include "LogSoftmax.h"
#include "LogSoftmax.cuh"
#include "Loss.cuh"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
...@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
>> lossName - name of the loss function
*/
__global__
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
                                  int size, LOSS_FUNCTION_NAME lossName)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -372,9 +374,11 @@ better numerical stability.
*/
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                             XTensor * dedy, XTensor * dedx,
                             XTensor * padding, int leadDim,
                             LOSS_FUNCTION_NAME lossName)
{
    leadDim = leadDim < 0 ? y->order - 1 : leadDim;

    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
                  "Tensors used in log softmax are not on the same GPU.");
...@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                                  dimensionSize * stride, lossName);
        }
    }
if(padding != NULL) {
int n = leadDim;
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
    }
    else {
        ShowNTErrors("TODO!");
......
...@@ -38,7 +38,7 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
/* de/dx (Cuda version) */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                             XTensor * dedy, XTensor * dedx,
                             XTensor * padding, int leadDim,
                             LOSS_FUNCTION_NAME lossName);

#endif // USE_CUDA
......
...@@ -39,7 +39,7 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                         XTensor * dedy, XTensor * dedx,
                         XTensor * padding, int leadDim,
                         LOSS_FUNCTION_NAME lossName);

} // namespace nts(NiuTrans.Tensor)
......
...@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
    for (int i = 0; i < blockNum; i++) {
        for (int j = 0; j < stride; j++) {
            for (int k = 0; k < tLen; k++) {
                *(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) =
                       -(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) /
                        (DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
            }
        }
    }
......
...@@ -175,7 +175,7 @@ See more details in LogSoftmaxBackward(...)
*/
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                      XTensor * dedy, XTensor * dedx,
                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName)
{
    CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
...@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
#ifdef USE_CUDA
    if(y->devID >= 0){
        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }
#endif
......
...@@ -24,6 +24,7 @@
#include "Loss.cuh"
#include "../core/reduce/ReduceSum.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/shape/Unsqueeze.h"
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
...@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
*/
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                          XTensor * dedy, XTensor * dedx,
                          XTensor * padding, int leadDim,
                          LOSS_FUNCTION_NAME lossName)
{
    int n = leadDim < 0 ? y->order - 1 : leadDim;

    CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
    CheckNTErrors((x->devID == y->devID), "Matrices used in log softmax are not on the same GPU.");
    CheckNTErrors((y->order >= 1), "Empty tensor!");
...@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
        _Sum(y, gold, dedx, -1.0F);
if(padding != NULL) {
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
    }
    else if(lossName == ONEHOTERROR){
        ShowNTErrors("TODO!");
......
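For the record, the CROSSENTROPY/SQUAREDERROR branch above leans on the standard identity that softmax followed by cross entropy yields the input gradient dE/dx = y - gold; _Sum(y, gold, dedx, -1.0F) computes exactly that, since _Sum evaluates c = a + b * beta element-wise. A scalar sanity check (illustration only):

#include <cstdio>

int main()
{
    const float y = 0.7F, gold = 1.0F, beta = -1.0F;
    const float dedx = y + beta * gold;  /* per-element effect of _Sum(y, gold, dedx, -1.0F) */
    printf("dedx = %f\n", dedx);         /* y - gold = -0.3 */
    return 0;
}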
...@@ -38,7 +38,7 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
/* de/dx (Cuda version) */
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                          XTensor * dedy, XTensor * dedx,
                          XTensor * padding, int leadDim,
                          LOSS_FUNCTION_NAME lossName);

#endif // USE_CUDA
......
...@@ -36,7 +36,7 @@ XTensor Softmax(const XTensor &x, int leadDim);
/* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                      XTensor * dedy, XTensor * dedx,
                      XTensor * padding, int leadDim,
                      LOSS_FUNCTION_NAME lossName);

} // namespace nts(NiuTrans.Tensor)
......
...@@ -169,8 +169,8 @@ bool TestDropout2()
    _DropoutBackward(y, x, dedy, dedx, 1, dropProb);

    /* check result */
    //y->Dump(stderr, "y");
    //dedx->Dump(stderr, "dedy");

#ifdef USE_CUDA
    /* GPU test */
...@@ -193,8 +193,8 @@ bool TestDropout2()
    _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);

    /* check result */
    //yGPU->Dump(stderr, "yGPU");
    //dedxGPU->Dump(stderr, "dedyGPU");

    /* destroy variables */
    delete x;
......
...@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(x, y, 1);

    /* call LogSoftmaxBackward function */
    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);

    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
    _LogSoftmax(xGPU, yGPU, 1);

    /* call LogSoftmaxBackward function */
    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);

    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
...@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(x, y, 1);

    /* call LogSoftmaxBackward function */
    _LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);

    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
    _LogSoftmax(xGPU, yGPU, 1);

    /* call LogSoftmaxBackward function */
    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);

    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
...@@ -66,7 +66,9 @@ bool TestPower1()
    bUser = Power(*a, 2.0F);

    /* check results */
    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
              aMe->CheckData(answer, aUnitNum, 1e-4F) &&
              bUser.CheckData(answer, aUnitNum, 1e-4F);

#ifdef USE_CUDA
    /* GPU test */
...@@ -88,7 +90,9 @@ bool TestPower1()
    bUserGPU = Power(*aGPU, 2.0F);

    /* check results */
    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

    /* destroy variables */
    delete a;
...@@ -153,7 +157,9 @@ bool TestPower2()
    bUser = Power(*a, 1.0F);

    /* check results */
    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
              aMe->CheckData(answer, aUnitNum, 1e-4F) &&
              bUser.CheckData(answer, aUnitNum, 1e-4F);

#ifdef USE_CUDA
    /* GPU test */
...@@ -175,7 +181,9 @@ bool TestPower2()
    bUserGPU = Power(*aGPU, 1.0F);

    /* check results */
    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

    /* destroy variables */
    delete a;
...@@ -214,7 +222,7 @@ bool TestPower3()
    for (int i = 0; i < aOrder; i++)
        aUnitNum *= aDimSize[i];

    DTYPE aData[3][2] = { {1.0F, 1.0F},
                          {2.0F, 3.0F},
                          {4.0F, 5.0F} };
    DTYPE answer[3][2] = { {1.0F, 1.0F},
...@@ -240,7 +248,9 @@ bool TestPower3()
    bUser = Power(*a, 0.0F);

    /* check results */
    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
              aMe->CheckData(answer, aUnitNum, 1e-4F) &&
              bUser.CheckData(answer, aUnitNum, 1e-4F);

#ifdef USE_CUDA
    /* GPU test */
...@@ -262,7 +272,9 @@ bool TestPower3()
    bUserGPU = Power(*aGPU, 0.0F);

    /* check results */
    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

    /* destroy variables */
    delete a;
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
 */

#include "TReduceSum.h"
#include "../core/getandset/SetData.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

...@@ -155,6 +156,457 @@ bool TestReduceSum1()
#endif // USE_CUDA
}
/*
case 2: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B >= 128
(50, 1000000) -> (50), dim = 1
*/
bool TestReduceSum2()
{
/* a tensor of size (50, 1000000) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 50;
sDimSize[1] = 1000000;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (50) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 50;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B < 128
(1000000, 50) -> (1000000), dim = 1
*/
bool TestReduceSum3()
{
/* a tensor of size (1000000, 50) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 1000000;
sDimSize[1] = 50;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (1000000) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 1000000;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A < 10, B is free
(5, 1000000) -> (5), dim = 1
*/
bool TestReduceSum4()
{
/* a tensor of size (5, 1000000) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 5;
sDimSize[1] = 1000000;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (5) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 5;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C > 4096
(500, 1000, 500) -> (500, 500), dim = 1
*/
bool TestReduceSum5()
{
/* a tensor of size (500, 1000, 500) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 500;
sDimSize[1] = 1000;
sDimSize[2] = 500;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (500, 500) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
    tDimSize[0] = 500;
    tDimSize[1] = 500;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C <= 4096
(50, 10000, 50) -> (50, 50), dim = 1
*/
bool TestReduceSum6()
{
/* a tensor of size (50, 10000, 50) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 50;
sDimSize[1] = 10000;
sDimSize[2] = 50;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (50, 50) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 50;
tDimSize[1] = 50;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
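The expected answers in these cases all follow from one invariant: reducing a constant-1 tensor along dim 1 gives that dimension's size at every output position, which is why answer is filled with s->GetDim(1). A scalar sketch for a (2, 4) -> (2) reduction (shape invented for illustration):

#include <cstdio>

int main()
{
    float s[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}};
    float t[2] = {0, 0};

    for(int i = 0; i < 2; i++)
        for(int j = 0; j < 4; j++)
            t[i] += s[i][j];         /* t[i] ends up equal to GetDim(1) == 4 */

    printf("%g %g\n", t[0], t[1]);
    return 0;
}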
/* other cases */
/*
TODO!!

...@@ -175,6 +627,51 @@ bool TestReduceSum()
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceSum2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* case 3 test */
//caseFlag = TestReduceSum3();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 3 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestReduceSum4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
///* case 5 test */
//caseFlag = TestReduceSum5();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 5 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestReduceSum6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */
/*
TODO!!
......
...@@ -146,7 +146,7 @@ bool TestSoftmax2()
    _Softmax(x, y, 1);

    /* call SoftmaxBackward function */
    _SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);

    /* check result */
    cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
...@@ -174,7 +174,7 @@ bool TestSoftmax2()
    _Softmax(xGPU, yGPU, 1);

    /* call SoftmaxBackward function */
    _SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);

    /* check result */
    gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
...@@ -20,8 +20,9 @@
*/

#include "TSumDim.h"
#include "../XTensor.h"
#include "../core/arithmetic/SumDim.h"
#include "../core/getandset/SetData.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -251,6 +252,225 @@ bool TestSumDim2()
#endif // USE_CUDA
}
/*
case 3: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
*/
bool TestSumDim3()
{
/* a tensor of size (20, 40, 4000) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 20;
aDimSize[1] = 40;
aDimSize[2] = 4000;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (40) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 40;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetZeroAll();
    cMeGPU->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(1000000, 50) + (50) = (1000000, 50), dim = 1.
*/
bool TestSumDim4()
{
    /* a tensor of size (1000000, 50) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 1000000;
aDimSize[1] = 50;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
    /* a tensor of size (50) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 50;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetZeroAll();
    cMeGPU->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
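The broadcast these cases exercise can be stated compactly: c = a + b * \beta along dim n adds b[j] to every slice a[..., j, ...]. A minimal scalar sketch with a of shape (2, 3), b of shape (3) and beta = 1 (shapes invented for illustration):

#include <cstdio>

int main()
{
    const float a[2][3] = {{0, 0, 0}, {0, 0, 0}};
    const float b[3] = {1, 2, 3};
    float c[2][3];

    for(int i = 0; i < 2; i++)
        for(int j = 0; j < 3; j++)
            c[i][j] = a[i][j] + b[j];   /* b broadcast over dim 0, as in _SumDim(a, b, c, 1) */

    for(int i = 0; i < 2; i++)
        printf("%g %g %g\n", c[i][0], c[i][1], c[i][2]);

    return 0;
}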
/* other cases */
/*
TODO!!

...@@ -280,6 +500,24 @@ bool TestSumDim()
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSumDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
///* case 4 test */
//caseFlag = TestSumDim4();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 4 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
TODO!!
......