Commit 03a9836e by xuchen

1. add some base functions; 2. a better implementation for t2t

parent 52c0e35a
......@@ -49,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
......@@ -58,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
......
......@@ -42,7 +42,7 @@ compute dE/dx for a given function y = f(x)
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
......@@ -58,7 +58,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
......@@ -67,7 +67,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
......@@ -83,10 +83,12 @@ compute dE/dy for variable y and error(loss) function E
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
_LossBackward(dedy, gold, y, lossName);
//_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold, NULL, padding);
}
}
\ No newline at end of file
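For reference, the padding-aware backward that replaces _LossBackward here is expected to compute the cross-entropy gradient dE/dy_k = -g_k / y_k and to zero it at padded positions. A minimal scalar sketch of that semantics (an illustrative helper, not the library routine):

/* per-cell cross-entropy gradient with padding, where
   E = -sum_k g_k * log(y_k) and pad is 1 (valid) or 0 (padded) */
float CrossEntropyGradCell(float y, float g, float pad)
{
    return pad * (-g / y);
}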
......@@ -36,13 +36,13 @@ class XLossGrad
public:
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName);
};
......
......@@ -469,8 +469,6 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
......
......@@ -55,7 +55,7 @@ void XNetClearAll()
XNet::XNet()
{
nodes.Clear();
isGradEfficient = true;
isGradEfficient = false;
}
/* de-constructor */
......@@ -86,7 +86,31 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
XList golds(1);
golds.Add(&gold);
Backward(roots, golds, loss);
XList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> padding - marks the target positions that are ignored and do not contribute to the loss computation
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
roots.Add(&root);
XList golds(1);
golds.Add(&gold);
XList paddings(1);
paddings.Add(&padding);
Backward(roots, golds, paddings, loss);
}
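The trainer later in this commit uses exactly this overload, for example:

/* back-propagate a cross-entropy loss while ignoring padded target positions */
net.Backward(output, gold, paddingDec, CROSSENTROPY);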
/*
......@@ -102,7 +126,10 @@ void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
XList golds(1);
golds.Add(NULL);
Backward(roots, golds, loss);
XList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
......@@ -110,9 +137,10 @@ backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> gold - a list of gold standards for the output
>> padding - marks the target positions that are ignored
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
{
Traverse(roots);
......@@ -131,6 +159,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;
......@@ -139,15 +168,21 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
Note that we do not need to obtain dE/dy here because it is of no use in the
following process of back-propagation */
if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
root->visitMark = NODE_FINISHED;
if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
else {
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, padding, loss);
}
}
/* we compute dE/dy (y is the output) if no predefined activation function is used */
else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, loss);
lossGrad.Compute(gold, root, root->grad, NULL, loss);
}
}
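This special case is valid because the loss can be fused with the output function: for y = softmax(s) trained with cross entropy E = -sum_k g_k * log(y_k), the combined gradient collapses to

    dE/ds_k = y_k - g_k

(and to exp(y_k) - g_k when y is a log-softmax output), so dE/dy never has to be materialized; _LogSoftmaxBackward and _SoftmaxBackward can write dE/dx directly.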
......@@ -178,16 +213,35 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
/*
backward propagation to obtain gradient
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
{
XList golds(roots.count);
for(int i = 0; i < roots.count; i++)
XList paddings(roots.count);
for(int i = 0; i < roots.count; i++) {
golds.Add(NULL);
paddings.Add(NULL);
}
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standards for the output
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
{
XList paddings(roots.count);
for(int i = 0; i < roots.count; i++)
paddings.Add(NULL);
Backward(roots, golds, loss);
Backward(roots, golds, paddings, loss);
}
/*
......
......@@ -62,17 +62,24 @@ struct XNet
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......
......@@ -514,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
if(isEnd)
break;
Test(testFN, outputFN, model);
}
double elapsed = GetClockSec() - startT;
......@@ -890,7 +892,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* for y = softmax(s), we get dE/ds
where E is the error function (defined by the loss) */
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
/* for s = x * w, we get
dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
*/
#include <math.h>
#include "T2TDecoder.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
AttDecoder::AttDecoder()
{
attentionsEnde = NULL;
attEndeLayerNorms = NULL;
}
/* de-constructor */
AttDecoder::~AttDecoder()
{
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem);
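/* note: the encoder-decoder attention is created unmasked (false),
   since the decoder is presumably allowed to attend to every encoder position */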
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec);
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
XTensor ende;
XTensor ln;
XTensor fnn;
XTensor res;
XTensor nothing;
/******************/
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization */
x = attLayerNorms[i].Make(res);
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
res = Sum(ende, x);
/* layer normalization */
x = attEndeLayerNorms[i].Make(res);
/*******/
/* fnn */
fnn = fnns[i].Make(x, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
return x;
}
}
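A hedged usage sketch of the decoder, following how T2TModel wires it up in this commit (myIsMasked = true and myIgnored = 0, as in T2TModel::InitModel):

AttDecoder decoder;
decoder.InitModel(argc, argv, true, 0, devID, mem);
/* decode conditioned on the encoder output */
XTensor hidden = decoder.Make(inputDec, outputEnc, maskDec, isTraining);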
......@@ -22,19 +22,33 @@
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#include "T2TEncoder.h"
namespace transformer
{
class T2TDecoder
class AttDecoder : public AttEncoder
{
public:
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;
};
class AttDecoder : T2TDecoder
{
/* layer normalization for encoder-decoder attention */
T2TLN * attEndeLayerNorms;
public:
/* constructor */
AttDecoder();
/* de-constructor */
~AttDecoder();
/* initialize the model */
void InitModel(int argc, char ** argv);
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
};
}
......
......@@ -61,16 +61,17 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRand(-v, v);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
}
/*
make positional embeddings (of size eSize * length
eSize - embedding size
length - length of the sequenc
make positional embeddings (of size eSize * length)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
......@@ -114,15 +115,15 @@ make the network
*/
XTensor T2TEmbedder::Make(XTensor &input)
{
CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
//CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order - 1] = eSize;
dims[input.order] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
......@@ -138,7 +139,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
......@@ -148,7 +150,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* then we make word embeddings */
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
//wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
......
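The switch from MMul to Gather replaces a dense one-hot matrix product with a direct row lookup: for a word id t, the result row is w[t,:], which is exactly what onehot(t) * w would produce. A plain-array sketch of the equivalence (a hypothetical helper, for illustration only):

/* out = w[id, :] for a vSize x eSize embedding matrix w;
   identical to multiplying the one-hot vector of id with w */
void LookupRow(const float * w, int eSize, int id, float * out)
{
    for(int i = 0; i < eSize; i++)
        out[i] = w[id * eSize + i];
}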
......@@ -31,6 +31,10 @@ namespace transformer
/* constructor */
AttEncoder::AttEncoder()
{
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
}
/* de-constructor */
......
......@@ -59,10 +59,7 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
InitTensor1D(&w, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale / d);
w.SetDataRand(-finfout, finfout);
w.SetDataRand(1.0F, 1.0F);
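/* note: a uniform draw on [1.0, 1.0] fixes every element of w to 1,
   so the layer starts as an identity-style normalization */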
b.SetZeroAll();
}
......
......@@ -57,8 +57,8 @@ void T2TModel::InitModel(int argc, char ** argv)
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamInt(argc, argv, "memsize", &memSize, 1024);
LoadParamBool(argc, argv, "lm", &isLM, true);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamBool(argc, argv, "lm", &isLM, !isMT);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
......@@ -71,6 +71,9 @@ void T2TModel::InitModel(int argc, char ** argv)
encoder.InitModel(argc, argv, isLM, 0, devID, mem);
outputLayer.InitModel(argc, argv, devID, mem);
if(isMT)
decoder.InitModel(argc, argv, true, 0, devID, mem);
XList params(10);
GetParams(params);
......@@ -87,74 +90,161 @@ make the encoding network
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool isTraining)
XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
{
return encoder.Make(input, mask, isTraining);
}
/*
make the entire network (with the output softmax layer)
make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
>> output - output tensor (distribution)
>> mask - the mask for positions that are/not involved in computation
>> isTraining - indicates whether we are training the model
<< return - decoding result
*/
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
return decoder.Make(inputDec, outputEnc, mask, isTraining);
}
/*
make the network for language modeling (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
{
XTensor encoding;
if(isLM){
/* generate mask to see "previous" words only */
int len = input.GetDim(input.order - 2);
int * dims = new int[input.order + 1];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order] = len;
XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
/* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
/* generate mask to see "previous" words only */
//int len = input.GetDim(input.order - 2);
//int * dims = new int[input.order + 1];
//for(int i = 0; i < input.order; i++)
// dims[i + 1] = input.GetDim(i);
//dims[0] = nhead;
//dims[input.order] = len;
//XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
int len = input.GetDim(input.order - 1);
int * dims = new int[input.order + 2];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order + 1] = len;
XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
/* an upper triangular matrix whose upper-triangular cells are set to -1e9.
this matrix can be used to prevent attention to the words that follow the
current position in a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
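/* e.g., for len = 3 the (per-head) mask is
       0  -1e9  -1e9
       0     0  -1e9
       0     0     0
   so adding it to the attention scores suppresses, after softmax,
   every position to the right of the current word */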
/* mask of the padding */
_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
//XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
// padding.denseRatio, padding.devID, padding.mem);
//
///* mask of the padding */
//_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
//_Unsqueeze(padding2, padding3, 0, nhead);
//
//_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//
////_Sum(&mask, padding3, &mask);
encoding = MakeEncoder(input, mask, isTraining);
outputLayer.Make(encoding, output);
delete[] dims;
delete[] dimsPadding;
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
>> paddingEnc - padding of the sequences (on the encoder side)
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
/* generate mask to see "previous" words on the decoder side */
int len = inputDec.GetDim(inputDec.order - 2);
int * dims = new int[inputDec.order + 1];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order] = len;
InitTensor(&maskDec, inputDec.order + 1, dims, X_FLOAT, 1.0F, inputDec.devID, inputDec.mem);
_Sum(&mask, padding3, &mask);
/* an upper triangular matrix whose upper-triangular cells are set to -1e9.
this matrix can be used to prevent attention to the words that follow the
current position in a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
encoding = MakeEncoding(input, mask, isTraining);
outputLayer.Make(encoding, output);
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding2);
DelTensorBuf(padding3);
}
else{
ShowNTErrors("TODO!");
}
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
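/* padding3 is now 0 at valid positions and -1e9 at padded ones,
   so adding it to the attention scores masks out the padding */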
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
outputLayer.Make(decoding, output);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
......@@ -180,8 +270,33 @@ void T2TModel::GetParams(XList &list)
list.Add(&encoder.attLayerNorms[i].w);
list.Add(&encoder.attLayerNorms[i].b);
}
list.Add(&encoder.embedder.w);
if(isMT){
for(int i = 0; i < decoder.nlayer; i++){
list.Add(&decoder.fnns[i].w1);
list.Add(&decoder.fnns[i].b1);
list.Add(&decoder.fnns[i].w2);
list.Add(&decoder.fnns[i].b2);
list.Add(&decoder.attentionsEnde[i].wk);
list.Add(&decoder.attentionsEnde[i].wq);
list.Add(&decoder.attentionsEnde[i].wv);
list.Add(&decoder.attentionsEnde[i].wa);
list.Add(&decoder.attEndeLayerNorms[i].w);
list.Add(&decoder.attEndeLayerNorms[i].b);
list.Add(&decoder.attentions[i].wk);
list.Add(&decoder.attentions[i].wq);
list.Add(&decoder.attentions[i].wv);
list.Add(&decoder.attentions[i].wa);
list.Add(&decoder.fnnLayerNorms[i].w);
list.Add(&decoder.fnnLayerNorms[i].b);
list.Add(&decoder.attLayerNorms[i].w);
list.Add(&decoder.attLayerNorms[i].b);
}
list.Add(&decoder.embedder.w);
}
}
/*
......
......@@ -69,10 +69,16 @@ public:
void InitModel(int argc, char ** argv);
/* make the encoding network */
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
/* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the decoding network */
XTensor MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining);
/* get parameter matrics */
void GetParams(XList &list);
......
......@@ -66,6 +66,9 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
DTYPE v = 1.0F/(float)sqrt((float)hSize);
w.SetDataRandn(0, v);
}
/*
......@@ -90,7 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, w), -1);
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
}
}
......@@ -101,6 +101,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamInt(argc, argv, "d", &d, 512);
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
LoadParamBool(argc, argv, "adam", &useAdam, false);
......@@ -113,6 +114,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, false);
buf = new int[bufSize];
buf2 = new int[bufSize];
......@@ -122,6 +124,9 @@ void T2TTrainer::Init(int argc, char ** argv)
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
validStep = 0;
curEpoch = 0;
}
int tc = 0;
......@@ -133,9 +138,10 @@ train the model
>> modelFN - where we keep the model
>> model - model to train
*/
void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
bool T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
{
int epoch = 0;
curEpoch += 1;
int step = 0;
int wc = 0;
int wordCount = 0;
......@@ -147,7 +153,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
int nCheckpoint = 0;
int nSkipped = 0;
int gradStep = 0;
int validStep = 0;
//int validStep = 0;
char * trainFN = new char[(int)strlen(fn) + 10];
strcpy(trainFN, fn);
......@@ -157,18 +163,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
sprintf(trainFN, "%s.random", fn);
#endif
PrepareModel(model);
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
PrepareModel(model);
double startT = GetClockSec();
for(epoch = 1; epoch <= nepoch; epoch++){
//for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
if(isShuffled)
Shuffle(fn, trainFN);
if(isShuffled)
Shuffle(fn, trainFN);
#endif
FILE * file = fopen(trainFN, "rb");
......@@ -177,11 +183,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
wordCount = 0;
loss = 0;
/* batch of input sequences */
XTensor batch;
/* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc;
XTensor batchDec;
/* padding */
XTensor padding;
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
......@@ -189,26 +197,40 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
while (LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)) {
while (LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, wc, devID, mem, true))
{
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
//CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
/* make the network */
model->Make(batch, output, padding, true);
if(model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, true);
else if(model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, true);
else{
ShowNTErrors("Illegal model type!");
}
/* back-propagation for obtaining gradients */
if (labelSmoothingP > 0)
LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
/* make paddings for the output */
if (output.GetDim(0) > 1)
PadOutput(&output, &gold, &padding);
//if (output.GetDim(0) > 1)
// PadOutput(&output, &gold, &paddingDec);
//output.Dump(tmpFILE, "output: ");
//fflush(tmpFILE);
/* get probabilities */
float prob = GetProb(&output, &gold, NULL);
DTYPE lossLocal = -prob / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
......@@ -217,18 +239,11 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (doUpdate) {
/* rescale the output for normalized loss */
RescaleOutput(&output, &g, &padding);
//RescaleOutput(&output, &g, &paddingDec);
/* back-propagation */
net.Backward(output, g, CROSSENTROPY);
/*for(int i = 0; i < net.nodes.count; i++){
XTensor * node = (XTensor*)net.nodes.Get(i);
XLink::ShowNode(stderr, node);
}
exit(0);*/
net.Backward(output, g, paddingDec, CROSSENTROPY);
gradStep += 1;
loss += -prob;
wordCount += wc;
......@@ -255,10 +270,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
break;
}
if (step % 1 == 0) {
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
lr, elapsed, step, curEpoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
......@@ -274,20 +289,20 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
fclose(file);
if (isEnd)
break;
if(useEpochCheckpoint)
MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, nepoch);
XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
elapsed, step, nSkipped, epoch);
return false;
return true;
//if(useEpochCheckpoint)
// MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
//}
//double elapsed = GetClockSec() - startT;
//
//epoch = MIN(epoch, nepoch);
//
//XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
// lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
//XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
// elapsed, step, nSkipped, epoch);
delete[] trainFN;
}
......@@ -322,10 +337,12 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
wordCount = 0;
/* batch of input sequences */
XTensor batch;
XTensor batchEnc;
XTensor batchDec;
/* padding */
XTensor padding;
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
......@@ -335,18 +352,28 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
ClearBuf();
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, false, wc, devID, mem)){
while(LoadBatch(file, model->isLM, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold,
seqs, vSize, vSizeTgt,
1, 1, false, wc, devID, mem, false))
{
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
//CheckNTErrors(batchEnc.order == 3, "wrong tensor order of the sequence batch");
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
/* make the network */
model->Make(batch, output, padding, false);
if(model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, false);
else if(model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, false);
else{
ShowNTErrors("Illegal model type!");
}
int bSize = batch.GetDim(0);
int length = batch.GetDim(1);
int bSize = output.GetDim(0);
int length = output.GetDim(1);
/* prediction probabilities */
XTensor probs;
......@@ -391,7 +418,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed, wordCountTotal, exp(loss / wordCount));
}
......@@ -511,6 +538,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
/* sort the sequences by length */
if (isSorted) {
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
SampleNode * nodes = new SampleNode[seqCount];
int count = 0;
int offset = 0;
......@@ -526,19 +554,18 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
offset += node.size;
}
qsort(nodes, seqCount, sizeof(SampleNode), CompareSampleNode);
qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);
count = 0;
offset = 0;
for(int i = 0; i < seqCount; i++){
for(int i = 0; i < seqCount; i += step){
SampleNode &node = nodes[count];
//fprintf(stderr, "%d %d %d\n", node.size, node.id, node.value);
memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for(int j = 0; j < step; j++){
seqLen2[count + j] = seqLen[node.id + j];
seqOffset[count + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
seqLen2[i + j] = seqLen[node.id + j];
seqOffset[i + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
}
count += step;
count += 1;
offset += node.size;
}
......@@ -546,6 +573,7 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
buf = buf2;
buf2 = tmp;
tmp = seqLen;
seqLen = seqLen2;
seqLen2 = tmp;
......@@ -562,32 +590,79 @@ void T2TTrainer::ClearBuf()
nextSeq = -1;
}
/*
load a batch of sequences
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batch - the batch of the input sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> step - the step we go over when move to the next sequence
>> vs - vocabulary size
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TTrainer::LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int step, int vs, int sBatch, int wBatch,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
int devID, XMem * mem,
bool isTraining)
{
if(isLM){
return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold,
seqs, vsEnc, sBatch, wBatch,
isSorted, wCount, devID, mem, isTraining);
}
else{
return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold,
seqs, vsEnc, vsDec, sBatch, wBatch,
isSorted, wCount, devID, mem, isTraining);
}
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> vs - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TTrainer::LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, step);
LoadBuf(file, isSorted, 1);
int seq = MAX(nextSeq, 0);
int wc = 0;
......@@ -604,7 +679,8 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
if(max < wn)
max = wn;
if(sc >= sBatch && wc >= wBatch)
int tc = isSmallBatch ? max * sc : wc;
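/* effective batch size: tokens including padding (max * sc) if
   isSmallBatch is set, otherwise the real word count (wc) */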
if(sc >= sBatch && tc >= wBatch)
break;
}
......@@ -614,74 +690,205 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
if(sc <= 0)
return 0;
if(isLM){
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(batch->grad == NULL)
XNoder::MakeGrad(batch);
else
InitTensor(batch->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(padding->grad == NULL)
XNoder::MakeGrad(padding);
else
InitTensor2D(padding->grad, sc, max, X_FLOAT, devID, mem);
if(output->grad == NULL)
XNoder::MakeGrad(output);
else
InitTensor(output->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
batch->SetZeroAll();
padding->SetZeroAll();
output->SetZeroAll();
batch->grad->SetZeroAll();
padding->grad->SetZeroAll();
output->grad->SetZeroAll();
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
padding->Set2D(1.0F, s - seq, w);
if(w > 0)
output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if(w == len - 1){
if(isDoubledEnd)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor(batchEnc, 2, dims, X_INT, 1.0F, -1);
//InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
gold->SetZeroAll();
paddingDec->SetZeroAll();
if(isTraining) {
//XNoder::MakeGrad(batchEnc);
XNoder::MakeGrad(paddingEnc);
XNoder::MakeGrad(gold);
XNoder::MakeGrad(paddingDec);
//batchEnc->grad->SetZeroAll();
paddingEnc->grad->SetZeroAll();
gold->grad->SetZeroAll();
paddingDec->grad->SetZeroAll();
}
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
//batchEnc->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
paddingEnc->Set2D(1.0F, s - seq, w);
paddingDec->Set2D(1.0F, s - seq, w);
if (w > 0)
gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if (w == len - 1) {
if (isDoubledEnd)
gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
    gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
else
fprintf(tf, "\n");*/
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if(seqs != NULL){
for(int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
}
fflush(tf);
return sc;
}
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TTrainer::LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 2);
int seq = MAX(nextSeq, 0);
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");
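/* the buffer interleaves the bilingual data: even positions hold the
   source-side sequences, odd positions the paired target-side ones */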
while(seq + sc < nseqBuf){
/* source-side sequence */
wnEnc = seqLen[seq + sc];
wcEnc += wnEnc;
sc += 1;
if(maxEnc < wnEnc)
maxEnc = wnEnc;
/* target-side sequence */
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
wnDec = len;
wcDec += wnDec;
sc += 1;
if(maxDec < wnDec)
maxDec = wnDec;
int tc = isSmallBatch ? maxEnc * sc / 2 : wcEnc;
if(sc >= sBatch * 2 && tc >= wBatch)
break;
}
nextSeq = seq + sc;
if(sc <= 0)
return 0;
int sCount = sc/2;
int seqSize = 0;
int dimsEnc[3] = {sCount, maxEnc, vsEnc};
int dimsDec[3] = {sCount, maxDec, vsDec};
InitTensor(batchEnc, 3, dimsEnc, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
InitTensor(batchDec, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
batchDec->SetZeroAll();
paddingDec->SetZeroAll();
gold->SetZeroAll();
wCount = 0;
/* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq)/2;
for(int w = 0; w < len; w++){
batchEnc->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
paddingEnc->Set2D(1.0F, sent, w);
wCount++;
}
}
/* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= maxDec, "Something is wrong!");
int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){
paddingDec->Set2D(1.0F, sent, w);
batchDec->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
if(w > 0)
gold->Set3D(1.0F, sent, w - 1, buf[seqOffset[s] + w]);
if (w == len - 1) {
if(isDoubledEnd)
gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
else
gold->Set3D(1.0F, sent, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
fflush(tf);
if(seqs != NULL){
for(int w = len; w < maxDec; w++)
seqs[seqSize++] = -1;
}
}
return sc;
......@@ -715,8 +922,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
XTensor probs;
InitTensor(&probs, output);
XTensor logOutput;
InitTensor(&logOutput, output);
_Log(output, &logOutput);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(output, gold, &probs);
_Multiply(&logOutput, gold, &probs);
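/* probs now holds gold * log(output); since the output layer produces
   softmax probabilities, summing probs yields the total log-probability,
   i.e., the negative cross-entropy */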
/* probability of each word */
XTensor wprobs;
......@@ -730,7 +941,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
_CopyValues(&wprobs, wordProbs);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
dims[0] = 1;
dims[1] = probs.unitNum;
probs.Reshape(2, dims);
......@@ -885,18 +1096,13 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
{
CheckNTErrors(output->order == 3, "Wrong dimension number!");
CheckNTErrors(gold->order == 3, "Wrong dimension number!");
int num = padding->GetDim(0);
XTensor * factor = NewTensorBuf(1, &num, padding->dataType, 1.0F, padding->devID, padding->mem);
_ReduceSum(padding, factor, padding->order - 1);
DTYPE count = _ReduceSumAll(padding);
_ExpMe(output);
_DivDim(output, factor, output, 0);
_ScaleAndShiftMe(output, 1/count);
_LogMe(output);
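/* in the log domain this is x -> x - log(count),
   since log(exp(x) / count) = x - log(count) */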
_DivDim(gold, factor, gold, 0);
DelTensorBuf(factor);
_ScaleAndShiftMe(gold, 1/count);
}
/*
......
......@@ -79,6 +79,9 @@ public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */
float lrate;
......@@ -100,6 +103,10 @@ public:
/* indicates whether we use adam */
bool useAdam;
/* counter used for checkpoints on the validation data (reset in Init) */
int validStep;

/* current epoch; incremented on each call to Train */
int curEpoch;
/* hyper parameters of adam */
float adamBeta1;
float adamBeta2;
......@@ -128,8 +135,13 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we double the </s> symble for the output of lms */
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
public:
/* constructor */
......@@ -142,7 +154,7 @@ public:
void Init(int argc, char ** argv);
/* train the model */
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
bool Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
......@@ -158,11 +170,34 @@ public:
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int step, int vs, int sBatch, int wBatch,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
......
......@@ -25,6 +25,8 @@
#include "T2TUtility.h"
#include "T2TTrainer.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
namespace transformer
{
......@@ -56,20 +58,74 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
T2TTrainer trainer;
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
if(strcmp(trainFN, "")) {
double startT = GetClockSec();
T2TTrainer trainer;
trainer.Init(argc, args);
char * fn = new char[MAX_LINE_LENGTH];
char * fn1 = new char[MAX_LINE_LENGTH];
char * fn2 = new char[MAX_LINE_LENGTH];
modelFN = strcmp(modelFN, "") ? modelFN : (char *)"checkpoint.model";
int epoch;
bool isTrain;
for(epoch = 1; epoch <= trainer.nepoch; epoch++) {
sprintf(fn, "%s.%s.%03d", modelFN, "epoch", epoch - 1);
sprintf(fn1, "%s.%s.%03d", modelFN, "epoch", epoch);
sprintf(fn2, "%s.%s.%03d.output", modelFN, "epoch", epoch);
if(epoch == 1) {
T2TModel model;
model.InitModel(argc, args);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
else {
T2TModel model;
model.InitModel(argc, args);
model.Read(fn);
isTrain = trainer.Train(trainFN, testFN, modelFN, &model);
model.Dump(fn1);
}
if(trainer.useEpochCheckpoint && strcmp(testFN, "")) {
T2TTrainer tester;
tester.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
model.Read(fn1);
tester.Test(testFN, fn2, &model);
}
if(!isTrain)
break;
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, trainer.nepoch);
XPRINT2(0, stderr, "[INFO] training finished (took %.1fs and epoch=%d)\n", elapsed, epoch);
delete[] fn;
delete[] fn1;
delete[] fn2;
}
/* don't dump the final model */
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
// model.Dump(modelFN);
T2TModel model;
model.InitModel(argc, args);
/* load the model if necessary */
if(strcmp(modelFN, ""))
......
......@@ -446,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x");
CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] &&
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
blockSize[0] = bXSize;
blockSize[1] = bYSize;
......
......@@ -292,7 +292,8 @@ void XMem::SetComputationMode(bool myIsForComputation)
if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
cublasDestroy(cublasHandle);
if(myIsForComputation)
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, "Cannot create the cublas handle.");
CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
SetDevice(devIDBackup);
#endif
......@@ -1392,8 +1393,8 @@ void XMem::CreateBLASHandle()
"Cannot destroy the cublas handle.");
}
CheckNTErrors(cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
#endif
}
......
......@@ -1057,9 +1057,9 @@ int XTensor::GetKeyInSparse(int i)
/*
set the value of a cell
>> value - value to assign to the cell
>> value - the value we intend to set
>> index - index of the cell for each dimension
>>
>> size - size of the index
*/
bool XTensor::Set(DTYPE value, int index[], int size)
{
......@@ -1070,8 +1070,9 @@ bool XTensor::Set(DTYPE value, int index[], int size)
/*
set the value of a cell in a 1d tensor
>> value - value to assign to the cell
>> value - the value we intend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1D(DTYPE value, int i)
{
......@@ -1124,6 +1125,78 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
return SetToDevice(devID, GetCell(dims, 3), value);
}
/*
set the integer value of a cell
>> value - the value we intend to set
>> index - index of the cell for each dimension
>> size - size of the index
<< return - succeeded or not
*/
bool XTensor::SetInt(int value, int index[], int size)
{
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
return SetToDeviceInt(devID, GetCell(index, size), value);
}
/*
set the integer value of a cell in a 1d tensor
>> value - the value we intend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1DInt(int value, int i)
{
CheckNTErrors((order == 1), "Cannot set a 1d cell for a tensor whose order is not 1!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[1] = {i};
return SetToDeviceInt(devID, GetCell(dims, 1), value);
}
/*
set the integer value of a cell in a 2d tensor
>> value - the value we intend to set
>> ni - row index
>> mi - column index
<< return - succeeded or not
*/
bool XTensor::Set2DInt(int value, int ni, int mi)
{
CheckNTErrors((order == 2), "Cannot set a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[2] = {ni, mi};
return SetToDeviceInt(devID, GetCell(dims, 2), value);
}
/*
set the integer value of a cell in a 3d tensor
>> value - the value we intend to set
>> d0 - index of dimension 0
>> d1 - index of dimension 1
>> d2 - index of dimension 2
<< return - succeeded or not
*/
bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
{
CheckNTErrors(order == 3, "Cannot set a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[3] = {d0, d1, d2};
return SetToDeviceInt(devID, GetCell(dims, 3), value);
}
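These integer setters are what the new batch loader relies on; LoadBatchLM, for instance, fills the id batch with

batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);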
/*
increase the value of a cell in a 2d tensor
>> value - the value we intend to set
......@@ -1986,6 +2059,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
}
if(myMem != NULL)
tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
else
......
......@@ -326,6 +326,18 @@ public:
/* set the value of a cell in a 3d tensor */
bool Set3D(DTYPE value, int d0, int d1, int d2);
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i);
/* set the integer value of a cell in a 2d tensor */
bool Set2DInt(int value, int ni, int mi);
/* set the integer value of a cell in a 3d tensor */
bool Set3DInt(int value, int d0, int d1, int d2);
/* increase the value of a cell in a 2d */
bool Add2D(DTYPE value, int ni, int mi);
......
......@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
return true;
}
/* assign an integer number to a variable that is kept on a specified device */
bool SetToDeviceInt(int devID, void * p, int value)
{
if(p == NULL)
return false;
if(devID < 0)
*(int*)p = value;
else{
XMemCopy(p, devID, &value, -1, sizeof(int));
}
return true;
}
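XTensor::Set2DInt (earlier in this commit), for example, ends with

return SetToDeviceInt(devID, GetCell(dims, 2), value);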
/* get the next number with power of 2 */
unsigned int GetNextPower2(unsigned int n)
{
......
......@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
extern DTYPE ToCPU(int devID, void * value);
extern int ToCPUInt(int devID, void * value);
extern bool SetToDevice(int devID, void * p, DTYPE value);
extern bool SetToDeviceInt(int devID, void * p, int value);
extern unsigned int GetNextPower2(unsigned int n);
extern void XSleep(int sleepTime);
extern double GetClock();
......
......@@ -70,9 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize;
}
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0) * std;
_SetDataRand(tensor, -a, a);
DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
tensor->SetDataRand(-finfout, finfout);
//_SetDataRand(tensor, -finfout, finfout);
}
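The replacement above is the same Xavier/Glorot uniform initialization written directly: sqrt(3) * gain * sqrt(2/(fanIn + fanOut)) equals gain * sqrt(6/(fanIn + fanOut)). A standalone check (a sketch with made-up fan sizes):

#include <cassert>
#include <cmath>

int main()
{
    float gain = 1.0F;
    float fanIn = 512.0F, fanOut = 2048.0F;              /* hypothetical fan sizes */
    float viaStd = (float)sqrt(3.0) * gain * (float)sqrt(2.0 / (fanIn + fanOut));
    float direct = gain * (float)sqrt(6.0 / (fanIn + fanOut));
    assert(fabs(viaStd - direct) < 1e-6);                /* identical bounds */
    return 0;
}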
/*
......@@ -393,7 +393,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
if(tensor == NULL)
return;
/* GPU code */
/* CPU code */
if(tensor->devID < 0){
DTYPE variance = upper - lower;
......
......@@ -21,6 +21,8 @@
#include "Gather.h"
#include "CopyIndexed.h"
#include "../../XUtility.h"
#include "../shape/Reshape.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -75,4 +77,50 @@ XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
return result;
}
/*
gather indexed sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor (2d)
>> index - the index tensor
<< return - the result of copying indexed sub-tensors
*/
XTensor Gather(const XTensor &s, const XTensor &index)
{
int indexSize = index.unitNum;
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int * srcIndex = new int[index.unitNum];
if(index.dataType == X_INT) {
XMemCopy(srcIndex, -1, index.data, index.devID, indexSize * index.unitSize);
}
else if(index.dataType == X_FLOAT || index.dataType == X_DOUBLE) {
DTYPE * tmp = new DTYPE[indexSize];
XMemCopy(tmp, -1, index.data, index.devID, indexSize * index.unitSize);
for(int i = 0; i < indexSize; i++)
srcIndex[i] = (int)tmp[i];
delete[] tmp;
}
else {
    /* guard against leaving srcIndex uninitialized for unsupported types */
    ShowNTErrors("Unsupported index data type in Gather!");
}
XTensor tensor;
tensor = Gather(s, 0, srcIndex, indexSize);
delete[] srcIndex;
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = tensor.GetDim(-1);
XTensor t;
t = Reshape(tensor, index.order + 1, dims);
delete[] dims;
return t;
}
else {
return tensor;
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
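A hypothetical use of the new Gather overload together with the new Set2DInt setter (a sketch; the shapes are made up). A 2d index tensor selects rows of the table, and the result takes the index's shape plus the embedding dimension:

XTensor table;                        /* (vocab = 10, embDim = 4) */
InitTensor2D(&table, 10, 4, X_FLOAT);
table.SetDataRand(-0.1F, 0.1F);

XTensor index;                        /* (batch = 2, seqLen = 3) */
InitTensor2D(&index, 2, 3, X_INT);
for(int i = 0; i < 2; i++)
    for(int j = 0; j < 3; j++)
        index.Set2DInt((i * 3 + j) % 10, i, j);

XTensor emb = Gather(table, index);   /* result shape: (2, 3, 4) */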
......@@ -33,6 +33,10 @@ void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexS
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, const XTensor &index);
} // namespace nts(NiuTrans.Tensor)
#endif // __GATHER_H__
\ No newline at end of file
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <math.h>
#include "ReduceSum.h"
......
......@@ -105,15 +105,15 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = threadIdx.x * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
int idx = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int i = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int j = blockIdx.x*blockDim.x + threadIdx.x;
if(i >= stride * blockNum)
return;
if(threadIdx.y == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
if(threadIdx.x == 0)
bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
__syncthreads();
......@@ -121,7 +121,7 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
int iOffset = i % stride;
bool isValid = (i < stride * blockNum && j < strideNum);
DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.x] : 0;
DTYPE value = isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.y] : 0;
if(power != (DTYPE)1.0){
if(power == (DTYPE)2.0)
......@@ -136,21 +136,20 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
value = exp(value);
/* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
iData[threadIdx.y * blockDim.x + threadIdx.x] = value;
__syncthreads();
/* do reduction in shared mem */
for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){
if (threadIdx.y < s)
for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
if (threadIdx.x < s)
iData[idx] += iData[idx + s];
__syncthreads();
}
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
if (threadIdx.x == 0 && blockIdx.x < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = iData[threadIdx.y * blockDim.x];
}
/*
......@@ -282,15 +281,15 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int tid = threadIdx.x;
unsigned int j = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
unsigned int i = blockIdx.y * blockDim.y + threadIdx.y;
if(i >= stride * blockNum)
return;
if (threadIdx.y == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
if (threadIdx.x == 0)
bias[threadIdx.y] = shift != NULL ? shift[i] : 0;
__syncthreads();
......@@ -299,17 +298,17 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
int iOffset = i % stride;
bool isValid = j < strideNum;
bool isValid2 = j + blockDim.y < strideNum;
bool isValid2 = j + blockDim.x < strideNum;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * data = iData + threadIdx.y * blockDim.x;
DTYPE * inputData = input + k * blockSize;
DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.x]: 0;
DTYPE value2 = isValid2 ? inputData[(j + blockDim.y) * stride + iOffset] - bias[threadIdx.x]: 0;
DTYPE value = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.y]: 0;
DTYPE value2 = isValid2 ? inputData[(j + blockDim.x) * stride + iOffset] - bias[threadIdx.y]: 0;
if(power != (DTYPE)1.0){
if(power == (DTYPE)2.0){
value = value * value;
value2 = value2 *value2;
value2 = value2 * value2;
}
else if(power == (DTYPE)0.5){
value = sqrt(value);
......@@ -329,17 +328,25 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
}
value = value + value2;
__syncthreads();
value = shflDownReduceSum(value);
if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
if ((tid & 0x1f) == 0)
data[tid / 32] = value;
__syncthreads();
if (tid < 32){
if (tid < blockDim.y / 32)
if (tid < blockDim.x / 32)
value = data[tid];
else value = 0;
value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
else
value = 0;
value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.x < reducedStrideNum) {
output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = value;
}
}
}
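The x/y index swap in these kernels puts the reduction dimension on threadIdx.x, so the 32 lanes of a warp hold consecutive partial sums and shflDownReduceSum can combine them without touching shared memory. The commit only calls that helper; a plausible implementation looks like this (an assumption, not the repository's actual code; CUDA versions before 9.0 would use __shfl_down instead of __shfl_down_sync):

__device__ __forceinline__
DTYPE shflDownReduceSum(DTYPE value)
{
    /* tree reduction across the 32 lanes of a warp */
    for (int offset = 16; offset > 0; offset >>= 1)
        value += __shfl_down_sync(0xffffffff, value, offset);
    return value;
}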
......@@ -480,7 +487,7 @@ void KernelReduceSumFast(__half * input, __half * output,
if data storage is discontinuous, use this way to reduce
*/
__global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
int blockNum, DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......@@ -568,7 +575,8 @@ void KernelReduceSumOp(DTYPE * input, DTYPE * output,
if (tid < 32){
if (tid < blockDim.y / 32)
threadSum = data[tid];
else threadSum = 0;
else
threadSum = 0;
threadSum = shflDownReduceSum(threadSum);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
......@@ -640,29 +648,28 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
/*
in this situation we use block.x * grid.x to process one vector, which keeps reads contiguous
*/
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
void discontinuousStorageNoShareMemThreadAllocation(dim3* grid, dim3* block, int stride, int blockNum)
{
block.x = 512;
block.y = 1;
block->x = 512;
block->y = 1;
if ((stride * blockNum) % 512 == 0)
grid.x = (stride * blockNum) / 512;
grid->x = (stride * blockNum) / 512;
else
grid.x = (stride * blockNum) / 512 + 1;
grid.y = 1;
grid->x = (stride * blockNum) / 512 + 1;
grid->y = 1;
}
/*
adjust the thread count along x so that warp-level optimization can be used
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
void adjustThreadForUseWarpOptimization(dim3* blocks, dim3* threads)
{
if (threads.x > 1){
blocks.x *= threads.x;
threads.x = 1;
if (threads->y > 1){
blocks->y *= threads->y;
threads->y = 1;
}
if (threads.y < 32)
threads.y = 32;
if (threads->x < 32)
threads->x = 32;
}
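A worked example of the pointer-based allocation above (a sketch): starting from threads = (64, 4) and blocks = (10, 8), the y-threads are folded into blocks.y and threads.x already satisfies the warp-size floor:

dim3 blocks(10, 8), threads(64, 4);
adjustThreadForUseWarpOptimization(&blocks, &threads);
/* now threads = (64, 1) and blocks = (10, 32); with at least 32 threads
   along x, shflDownReduceSum always reduces over full warps */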
/*
......@@ -724,7 +731,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
......@@ -733,19 +740,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
if (blockNum % 4 != 0)
blockNum = (int)(blockNum / 4) + 1;
else
blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data,
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, blockNum, sp, power, isExp);
}
......@@ -769,50 +780,50 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
}
......
......@@ -44,23 +44,24 @@ sum all the items of the tensor (It should be optimized!)
>> source - the input tensor
<< return - the total summation
*/
DTYPE _ReduceSumAll(XTensor * source)
DTYPE _ReduceSumAll(const XTensor * source)
{
int order = source->order;
DTYPE summation;
XTensor * big = NewTensor(source);
_CopyValues(source, big);
for(int i = 0; i < order; i++) {
if(i == order - 1)
big->Reshape(big->unitNum, 1);
for(int i = order - 1; i >= 0; i--) {
if(i == 0)
big->Reshape(1, big->unitNum);
int leadingDim = big->order - 1;
int * dimSize;
dimSize = getDimSize(big, 0);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
_ReduceSum(big, little, 0);
_ReduceSum(big, little, leadingDim);
delete big;
delete[] dimSize;
......@@ -81,7 +82,7 @@ sum all the items of the tensor
>> source - the input tensor
<< return - the total summation
*/
DTYPE ReduceSumAll(XTensor & source)
DTYPE ReduceSumAll(const XTensor & source)
{
return _ReduceSumAll(&source);
}
......
......@@ -28,10 +28,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */
DTYPE _ReduceSumAll(XTensor * source);
DTYPE _ReduceSumAll(const XTensor * source);
/* sum all the items of the tensor */
DTYPE ReduceSumAll(XTensor & source);
DTYPE ReduceSumAll(const XTensor & source);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -50,46 +50,33 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
const XTensor * padding, int leadingDim)
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss), "The loss tensor and padding tensor must be same shape!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
/* l = log(output) */
_Log(output, logBuf);
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
}
/* negate result n = negate(mul) */
_NegateMe(mulBuf);
XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
_ReduceSum(mulBuf, loss, n);
/* loss = reduce_sum_n( -(weight *) gold * log(output) ) */
_Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
if(weight != NULL)
    _MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
DelTensorBuf(mulBuf);
DelTensorBuf(logBuf);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
}
/*
......@@ -109,19 +96,12 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
......@@ -133,6 +113,22 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i - 1)), "Unmatched tensors!");
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
......@@ -148,31 +144,40 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * lossData = (DTYPE*)loss->data;
DTYPE tmpLoss;
int lossPos;
int goldPos;
if(weight == NULL) {
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
for(int j = 0; j < stride; j++) {
tmpLoss = 0;
lossPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
*(lossData + i) = 0;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + lossPos) == 0)
*(lossData + lossPos) = 0;
else {
tmpLoss = 0;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
}
......@@ -181,30 +186,36 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
for(int j = 0; j < stride; j++) {
tmpLoss = 0;
lossPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
*(lossData + i) = 0;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + lossPos) == 0)
*(lossData + lossPos) = 0;
else {
tmpLoss = 0;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
}
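For reference, the index layout used by the rewritten loops: with leading (class) dimension n, stride is the product of the dimensions after n and blockSize = stride * leadingDimSize, so the element for (block i, inner offset j, class k) sits at

goldPos = i * blockSize + j + k * stride

while the per-position loss sits at lossPos = i * stride + j. The old loops accumulated a whole block into one loss value, which is only equivalent when n is the last dimension (stride == 1); the new loops produce one loss per non-class position.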
......@@ -212,26 +223,6 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
}
/*
get the dimSize after reduce operation
>> tensor - a tensor to be reduced
>> n - the reduce dimension
<< return - the pointer of dimSize
*/
int * reduceDimSize(const XTensor * tensor, int n)
{
int order = tensor->order;
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = tensor->dimSize[i];
else if(i > n)
dimSize[i - 1] = tensor->dimSize[i];
}
return dimSize;
}
/*
compute the cross entropy loss
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
......@@ -247,73 +238,45 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1, "The loss tensor and padding tensor must be same shape!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
/* l = log(output) */
_Log(output, logBuf);
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold with weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold with log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold with log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = output->dimSize[i];
else if(i > n)
dimSize[i - 1] = output->dimSize[i];
}
/* negate multiply result n = negate(mul) */
_NegateMe(mulBuf);
int * dimSize;
dimSize = reduceDimSize(output, n);
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
/* reduce sum all classes */
_ReduceSum(mulBuf, lossInter, n);
XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
DelTensorBuf(mulBuf);
DelTensorBuf(logBuf);
DTYPE loss;
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
/* compute the total loss */
if(padding != NULL) {
XTensor * temp = NewTensor(lossInter);
_Multiply(lossInter, padding, temp);
loss = _ReduceSumAll(temp);
delete temp;
}
else
loss = _ReduceSumAll(lossInter);
loss = _ReduceSumAll(lossBuf);
if(reduceWay == REDUCE_MEAN) {
int nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossInter->unitNum;
nonZeroNum = lossBuf->unitNum;
}
else {
XTensor * tmp = NewTensor(padding);
XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp;
DelTensorBuf(tmp);
}
loss = loss / (DTYPE)nonZeroNum;
......@@ -326,7 +289,7 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
}
delete[] dimSize;
delete lossInter;
DelTensorBuf(lossBuf);
return loss;
}
......@@ -349,11 +312,7 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
......@@ -370,6 +329,23 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
......@@ -383,63 +359,78 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
DTYPE loss = 0;
int paddingPos;
int goldPos;
int nonZeroNum = 0;
if(weight == NULL) {
if(padding == NULL) {
nonZeroNum = blockNum;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
nonZeroNum = blockNum * stride;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0)
continue;
else{
nonZeroNum += 1;
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
if(*(paddingData + paddingPos) == 0)
continue;
else {
nonZeroNum += 1;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
}
}
}
}
}
else {
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
nonZeroNum = blockNum;
nonZeroNum = blockNum * stride;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0)
continue;
else{
nonZeroNum += 1;
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
if(*(paddingData + paddingPos) == 0)
continue;
else {
nonZeroNum += 1;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
}
}
}
}
}
}
......@@ -471,17 +462,10 @@ with respect to gold standard, and y this the model output
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight, const XTensor * padding,
int leadingDim)
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight,
XTensor * padding, int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
......@@ -497,7 +481,26 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
......@@ -512,25 +515,35 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
int paddingPos;
int goldPos;
if(weight == NULL) {
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
*(dedyData + goldPos) = -(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * unitSize);
else
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
if(*(paddingData + paddingPos) == 0)
*(dedyData + goldPos) = 0;
else
*(dedyData + goldPos) = -(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
}
......@@ -538,39 +551,45 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
*(dedyData + goldPos) = -(*(weightData + k)) *
(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * unitSize);
else
for(int j = 0; j < blockSize; j++) {
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
if(*(paddingData + paddingPos) == 0)
*(dedyData + goldPos) = 0;
else
*(dedyData + goldPos) = -(*(weightData + k)) *
(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
}
}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
}
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
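For reference, the gradient implemented above follows directly from loss = sum_{i} (-gold_i * log(output_i)):

dedy_i = -gold_i / output_i   (with class weights: dedy_i = -weight_i * gold_i / output_i)

Padded positions are written as zero; the 1/N normalization that used to be applied here is commented out, so the backward pass now returns the unnormalized gradient.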
......@@ -26,80 +26,20 @@
#include "../XDevice.h"
#include "CrossEntropy.cuh"
#include "CrossEntropy.h"
#include "../core/reduce/ReduceSumAll.h"
#include "../core/arithmetic/Div.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
#include "../core/shape/Transpose.h"
#include "../core/shape/Unsqueeze.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
compute the cross entropy loss (cuda kernel)
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> lossData - the data pointer of loss tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> stride - the size of a data block
*/
__global__
void KernelCrossEntropy(DTYPE * outputData, DTYPE * goldData,
DTYPE * lossData, DTYPE * weightData,
DTYPE * paddingData, int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
DTYPE tmpLoss = 0;
if(weightData == NULL) {
if(paddingData == NULL) {
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
}
}
else {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
}
}
}
/*
/*
compute the cross entropy loss (cuda version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
......@@ -112,79 +52,27 @@ where gold and output are distributions
>> leadingDim - the leading dimension for the output
*/
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1,
"Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
//GDevs.GetCudaThread2D(output->devID, blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
_Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
int devIDBackup;
ProtectCudaDev(output->devID, devIDBackup);
if(weight != NULL)
_MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
DTYPE * lossData = (DTYPE*)loss->data;
if(weight == NULL) {
if(padding == NULL)
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
NULL, NULL,
blockNum, blockSize);
else
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
NULL, (DTYPE*)padding->data,
blockNum, blockSize);
}
else {
if(padding == NULL)
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
(DTYPE*)weight->data, NULL,
blockNum, blockSize);
else
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
(DTYPE*)weight->data, (DTYPE*)padding->data,
blockNum, blockSize);
}
BacktoCudaDev(output->devID, devIDBackup);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
}
/*
......@@ -230,87 +118,38 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
_CudaCrossEntropyFast(output, gold, lossInter, weight, padding, leadingDim);
_CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossInter);
loss = _ReduceSumAll(lossBuf);
if(reduceWay == REDUCE_MEAN) {
int nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossInter->unitNum;
nonZeroNum = lossBuf->unitNum;
}
else {
XTensor * tmp = NewTensor(padding);
XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
delete tmp;
DelTensorBuf(tmp);
}
loss = loss / (DTYPE)nonZeroNum;
}
return loss;
}
/*
backward computation of cross entropy function (kernel version)
>> dedyData - the data pointer of dedy tensor
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> blockSize - the size of a data block
*/
__global__
void KernelCrossEntropyBackward(DTYPE * dedyData, DTYPE * outputData, DTYPE * goldData,
DTYPE * weightData, DTYPE * paddingData,
int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
if(weightData == NULL) {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else {
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
else
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
}
else {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else {
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
else
for(int j = 0; j < blockSize; j++) {
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
ShowNTErrors("TODO");
}
delete[] dimSize;
DelTensorBuf(lossBuf);
return loss;
}
/*
......@@ -330,85 +169,43 @@ with respect to gold standard, and y this the model output
*/
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight,
const XTensor * padding, int leadingDim)
XTensor * padding, int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(dedy, output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(output->devID, devIDBackup);
DTYPE * dedyData = (DTYPE*)dedy->data;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
if(weight == NULL) {
if(padding == NULL)
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
NULL, NULL,
blockNum, blockSize);
else
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
NULL, (DTYPE*)padding->data,
blockNum, blockSize);
}
else {
if(padding == NULL)
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
(DTYPE*)weight->data, NULL,
blockNum, blockSize);
else
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
(DTYPE*)weight->data, (DTYPE*)padding->data,
blockNum, blockSize);
}
_Div(gold, output, dedy);
_NegateMe(dedy);
if(weight != NULL)
_MultiplyDimMe(dedy, weight, n);
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedy->order;
int * dims = new int[order];
memcpy(dims, dedy->dimSize, dedy->order * sizeof(int));
dedy->Reshape(dedy->unitNum/dedy->GetDim(n), dedy->GetDim(n));
_MultiplyDimMe(dedy, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedy->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
BacktoCudaDev(output->devID, devIDBackup);
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
}
......
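The padding mask here (and again in the log-softmax and softmax backward paths below) is applied with the same reshape-and-broadcast trick. As a reference, a hypothetical helper that factors it out (a sketch, not part of the commit; it uses only calls that appear in the diff):

void MaskByPadding(XTensor * grad, XTensor * padding, int leadDim)
{
    int n = leadDim < 0 ? grad->order - 1 : leadDim;

    /* remember the original shapes */
    int paddingOrder = padding->order;
    int * paddingDims = new int[paddingOrder];
    memcpy(paddingDims, padding->dimSize, paddingOrder * sizeof(int));
    int order = grad->order;
    int * dims = new int[order];
    memcpy(dims, grad->dimSize, order * sizeof(int));

    /* flatten: padding -> (positions), grad -> (positions, classes) */
    padding->Reshape(padding->unitNum);
    grad->Reshape(grad->unitNum / grad->GetDim(n), grad->GetDim(n));

    /* scale each row of grad by its 0/1 padding value */
    _MultiplyDimMe(grad, padding, 0);

    /* restore the original shapes */
    padding->Reshape(paddingOrder, paddingDims);
    grad->Reshape(order, dims);

    delete[] paddingDims;
    delete[] dims;
}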
......@@ -40,7 +40,7 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
/* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -52,9 +52,9 @@ DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight = NULL, const XTensor * padding = NULL,
int leadingDim = -1);
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -279,8 +279,8 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
......@@ -292,7 +292,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
return;
}
#endif
......
......@@ -22,6 +22,7 @@
#include "LogSoftmax.h"
#include "LogSoftmax.cuh"
#include "Loss.cuh"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
......@@ -232,7 +233,8 @@ dE/dx = dE/dy * dy/dx
>> lossName - name of the loss function
*/
__global__
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size, LOSS_FUNCTION_NAME lossName)
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
int size, LOSS_FUNCTION_NAME lossName)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -371,10 +373,12 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
leadDim = leadDim < 0 ? y->order - 1 : leadDim;
CheckNTErrors((x->devID >= 0), "Backward computation of log softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
"Tensors used in log softmax are not on the same GPU.");
......@@ -441,6 +445,26 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
dimensionSize * stride, lossName);
}
}
if(padding != NULL) {
int n = leadDim;
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
}
else {
ShowNTErrors("TODO!");
......
......@@ -37,8 +37,8 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
/* de/dx (Cuda version) */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
#endif // USE_CUDA
......
......@@ -38,8 +38,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -486,8 +486,9 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
for (int i = 0; i < blockNum; i++) {
for (int j = 0; j < stride; j++) {
for (int k = 0; k < tLen; k++) {
*(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) = -(DTYPE)*(tp + i * stride * dimensionSize
+ j + stride * (tBeg + k)) / (DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
*(dedyp + i * stride * dimensionSize + j + stride * (yBeg + k)) =
-(DTYPE)*(tp + i * stride * dimensionSize + j + stride * (tBeg + k)) /
(DTYPE)*(yp + i * stride * dimensionSize + j + stride * (yBeg + k));
}
}
}
......
......@@ -174,8 +174,8 @@ See more details in LogSoftmaxBackward(...)
>> leadDim - leading dimension (along which we perform reduction)
*/
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
......@@ -188,7 +188,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
#ifdef USE_CUDA
if(y->devID >= 0){
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
return;
}
#endif
......
......@@ -24,6 +24,7 @@
#include "Loss.cuh"
#include "../core/reduce/ReduceSum.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/shape/Unsqueeze.h"
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
......@@ -309,9 +310,11 @@ See more details in SoftmaxBackward
*/
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
int n = leadDim < 0 ? y->order - 1 : leadDim;
CheckNTErrors((x->devID >= 0), "Backward computation of softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID), "Matrices used in softmax are not on the same GPU.");
CheckNTErrors((y->order >= 1), "Empty tensor!");
......@@ -329,6 +332,24 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(lossName == CROSSENTROPY || lossName == SQUAREDERROR){
_Sum(y, gold, dedx, -1.0F);
if(padding != NULL) {
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum/dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
}
else if(lossName == ONEHOTERROR){
ShowNTErrors("TODO!");
......
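For reference, _Sum(y, gold, dedx, -1.0F) above is the standard shortcut for softmax followed by cross entropy: with y = softmax(x) and E = sum_{i} (-gold_i * log(y_i)),

dE/dx_k = y_k * sum_{i} gold_i - gold_k = y_k - gold_k

when gold sums to one, so no division by y is needed. The same form is reused for SQUAREDERROR in this branch; the padding mask is then applied with the reshape-and-broadcast trick shown in the cross-entropy backward.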
......@@ -37,8 +37,8 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
/* de/dx (Cuda version) */
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
#endif // USE_CUDA
......
......@@ -35,8 +35,8 @@ XTensor Softmax(const XTensor &x, int leadDim);
/* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -169,8 +169,8 @@ bool TestDropout2()
_DropoutBackward(y, x, dedy, dedx, 1, dropProb);
/* check result */
y->Dump(stderr, "y");
dedx->Dump(stderr, "dedy");
//y->Dump(stderr, "y");
//dedx->Dump(stderr, "dedy");
#ifdef USE_CUDA
/* GPU test */
......@@ -193,8 +193,8 @@ bool TestDropout2()
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);
/* check result */
yGPU->Dump(stderr, "yGPU");
dedxGPU->Dump(stderr, "dedyGPU");
//yGPU->Dump(stderr, "yGPU");
//dedxGPU->Dump(stderr, "dedyGPU");
/* destroy variables */
delete x;
......
......@@ -146,7 +146,7 @@ bool TestLogSoftmax2()
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
_LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -174,7 +174,7 @@ bool TestLogSoftmax2()
_LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
......@@ -250,7 +250,7 @@ bool TestLogSoftmax3()
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
_LogSoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, SQUAREDERROR);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -278,7 +278,7 @@ bool TestLogSoftmax3()
_LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
_LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, SQUAREDERROR);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
......@@ -66,7 +66,9 @@ bool TestPower1()
bUser = Power(*a, 2.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -88,7 +90,9 @@ bool TestPower1()
bUserGPU = Power(*aGPU, 2.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......@@ -153,7 +157,9 @@ bool TestPower2()
bUser = Power(*a, 1.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -175,7 +181,9 @@ bool TestPower2()
bUserGPU = Power(*aGPU, 1.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......@@ -214,7 +222,7 @@ bool TestPower3()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0F, 1.0F},
DTYPE aData[3][2] = { {1.0F, 1.0F},
{2.0F, 3.0F},
{4.0F, 5.0F} };
DTYPE answer[3][2] = { {1.0F, 1.0F},
......@@ -240,7 +248,9 @@ bool TestPower3()
bUser = Power(*a, 0.0F);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -262,7 +272,9 @@ bool TestPower3()
bUserGPU = Power(*aGPU, 0.0F);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
......
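An aside on the TestPower3 data change above (0.0F replaced by 1.0F in aData): the test raises a to the power 0.0F, so the old data exercised 0^0. A conforming host library already pins that corner down, as the standalone check below shows, but edge-case agreement between host and device math libraries is not something a CPU-versus-GPU comparison test should lean on. Reading that as the motive is an inference from the diff, not something the commit states.
#include <cmath>
#include <cstdio>

int main()
{
    /* C99 (and IEEE 754 pow) define pow(x, +/-0) = 1 for every x, 0 included */
    std::printf("pow(0, 0) = %f\n", std::pow(0.0, 0.0)); /* prints 1.000000 */
    return 0;
}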
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/
#include "TReduceSum.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -155,6 +156,457 @@ bool TestReduceSum1()
#endif // USE_CUDA
}
/*
case 2: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B >= 128
(50, 1000000) -> (50), dim = 1
*/
bool TestReduceSum2()
{
/* a tensor of size (50, 1000000) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 50;
sDimSize[1] = 1000000;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (50) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 50;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A >= 10, B < 128
(1000000, 50) -> (1000000), dim = 1
*/
bool TestReduceSum3()
{
/* a tensor of size (1000000, 50) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 1000000;
sDimSize[1] = 50;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (1000000) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 1000000;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C = 1, A < 10, B is free
(5, 1000000) -> (5), dim = 1
*/
bool TestReduceSum4()
{
/* a tensor of size (5, 1000000) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 5;
sDimSize[1] = 1000000;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (5) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 5;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C > 4096
(500, 1000, 500) -> (500, 500), dim = 1
*/
bool TestReduceSum5()
{
/* a tensor of size (500, 1000, 500) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 500;
sDimSize[1] = 1000;
sDimSize[2] = 500;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (500, 500) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 500;
tDimSize[1] = 500;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 6: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
C != 1, A*C <= 4096
(50, 10000, 50) -> (50, 50), dim = 1
*/
bool TestReduceSum6()
{
/* a tensor of size (50, 10000, 50) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 50;
sDimSize[1] = 10000;
sDimSize[2] = 50;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a tensor of size (50, 50) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 50;
tDimSize[1] = 50;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * answer = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
tUser = ReduceSum(*s, 1);
/* check results */
cpuTest = t->CheckData(answer->data, tUnitNum) && tUser.CheckData(answer->data, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
tUserGPU = ReduceSum(*sGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer->data, tUnitNum) && tUserGPU.CheckData(answer->data, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete answer;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete answer;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
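The case comments above factor the input shape into the kept dimensions A (leading) and C (trailing) and the reduced dimension B; that split only selects the kernel strategy, and the computed result is the same in every case. A minimal reference loop, assuming a row-major (A, B, C) layout and written independently of the library, pins down what _ReduceSum(s, t, 1) produces:
/* reference semantics only, not the library kernel; row-major layout assumed */
void ReduceSumRef(const float * s, float * t, int A, int B, int C)
{
    for (int a = 0; a < A; a++) {
        for (int c = 0; c < C; c++) {
            float sum = 0.0F;
            for (int b = 0; b < B; b++)
                sum += s[(a * B + b) * C + c]; /* s[a][b][c] */
            t[a * C + c] = sum; /* t[a][c] */
        }
    }
}
With every input element set to 1.0F, as in the tests, each output element equals B, which is why the expected answer is s->GetDim(1).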
/* other cases */
/*
TODO!!
......@@ -175,6 +627,51 @@ bool TestReduceSum()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceSum2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* case 3 test */
//caseFlag = TestReduceSum3();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 3 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestReduceSum4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
///* case 5 test */
//caseFlag = TestReduceSum5();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 5 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestReduceSum6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -146,7 +146,7 @@ bool TestSoftmax2()
_Softmax(x, y, 1);
/* call SoftmaxBackward function */
_SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
_SoftmaxBackward(g, y, x, dedy, dedx, NULL, 1, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......@@ -174,7 +174,7 @@ bool TestSoftmax2()
_Softmax(xGPU, yGPU, 1);
/* call SoftmaxBackward function */
_SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
_SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NULL, 1, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
......@@ -20,8 +20,9 @@
*/
#include "TSumDim.h"
#include "../core/arithmetic/SumDim.h"
#include "../XTensor.h"
#include "../core/arithmetic/SumDim.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -251,6 +252,225 @@ bool TestSumDim2()
#endif // USE_CUDA
}
/*
case 3: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(20, 40, 4000) + (40) = (20, 40, 4000), dim = 1.
*/
bool TestSumDim3()
{
/* a tensor of size (20, 40, 4000) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 20;
aDimSize[1] = 40;
aDimSize[2] = 4000;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (40) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 40;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* initialize variables */
aGPU->SetZeroAll();
cMeGPU->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
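For concreteness, the same broadcast on a tiny (2, 3) tensor, reusing the helpers the tests already use (a sketch, not part of the commit):
/* c[i][j] = a[i][j] + b[j] * beta for dim = 1; beta is taken to default to 1 */
int dimsA[2] = {2, 3};
int dimsB[1] = {3};
XTensor * a = NewTensor(2, dimsA);
XTensor * b = NewTensor(1, dimsB);
XTensor * c = NewTensor(2, dimsA);
a->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SumDim(a, b, c, 1); /* every entry of c becomes 0 + 1 = 1 */
delete a;
delete b;
delete c;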
/*
case 4: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting.
In this case,
(1000000, 50) + (50) = (1000000, 50), dim = 1.
*/
bool TestSumDim4()
{
/* a tensor of size (1000000, 50) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 1000000;
aDimSize[1] = 50;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (50) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 50;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor * answer = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
/* call SumDim function */
_SumDim(a, b, c, 1);
_SumDim(cMe, b, 1);
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer->data, aUnitNum) &&
cMe->CheckData(answer->data, aUnitNum) &&
cUser.CheckData(answer->data, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* initialize variables */
aGPU->SetZeroAll();
cMeGPU->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
_SumDim(cMeGPU, bGPU, 1);
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer->data, aUnitNum) &&
cMeGPU->CheckData(answer->data, aUnitNum) &&
cUserGPU.CheckData(answer->data, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete answer;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -279,6 +499,24 @@ bool TestSumDim()
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSumDim3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
///* case 4 test */
//caseFlag = TestSumDim4();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 4 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 4 passed!\n");
/* other cases test */
/*
......