Commit 3501c0fa by ltb

merge reducemax(float16) reducesum(float16) div(float16) negate(float16)…

merge reducemax(float16) reducesum(float16) div(float16) negate(float16) multiplydidm(float16) matrixmul(all,float16/int8) sum(float16/int/int8) sign/sub/sumdim/subdim( float16)
parent fc5a630a
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "XNoder.h"
#include "XBackwardData.h"
#include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
namespace nts{
/* compute dE/dx of a node */
void XDataGrad::MakeGrad(XTensor * node, bool isEfficent)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == GETANDSET_CONVERTDATATYPE)
GradConvertDataType(node, isEfficent);
else if(operID == GETANDSET_INDEXTOONEHOT)
GradIndexToOnehot(node, isEfficent);
else if(operID == GETANDSET_ONEHOTTOINDEX)
GradOnehotToIndex(node, isEfficent);
else{
ShowNTErrors("TODO!");
}
}
/* indicates whether the node is for a data operation */
bool XDataGrad::IsDataOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & DATA_BASE) != 0;
}
/*
gradient computation for convert datatype
for
b = convertdatatype(a)
we have
dE/da = convertdatatype(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBuf(input->grad, input->devID, input->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for OnehotToIndex
for
b = OnehotToIndex(a)
we have
dE/da = IndexToOnehot(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for IndexToOnehot
for
b = IndexToOnehot(a)
we have
dE/da = OnehotToIndex(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
} // namespace nts(NiuTrans.Tensor)
@@ -52,7 +52,15 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
     XTensor * dedy = output->grad;
 
     if (income.tailNum == 1) {
-        _SetDataFixed(dedy, 1.0F);
+        if(dedy->dataType == X_FLOAT)
+            _SetDataFixedFloat(dedy, 1.0F);
+        else if(dedy->dataType == X_DOUBLE)
+            _SetDataFixedDouble(dedy, 1.0);
+        else if(dedy->dataType == X_INT)
+            _SetDataFixedInt(dedy, 1);
+        else
+            ShowNTErrors("TODO");
         return;
     }
@@ -136,7 +144,15 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
                         LOSS_FUNCTION_NAME lossName)
 {
     if(gold == NULL){
-        _SetDataFixed(dedy, 1.0F);
+        if(dedy->dataType == X_FLOAT)
+            _SetDataFixedFloat(dedy, 1.0F);
+        else if(dedy->dataType == X_DOUBLE)
+            _SetDataFixedDouble(dedy, 1.0);
+        else if(dedy->dataType == X_INT)
+            _SetDataFixedInt(dedy, 1);
+        else{
+            ShowNTErrors("TODO");
+        }
         return;
     }
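The same four-way dtype dispatch appears in both hunks above; a sketch of factoring it into a single helper, using only calls that already appear in this commit (the helper itself is not part of the library):

/* a sketch only -- not library code */
static void SetGradToOnes(XTensor * dedy)
{
    if (dedy->dataType == X_FLOAT)
        _SetDataFixedFloat(dedy, 1.0F);
    else if (dedy->dataType == X_DOUBLE)
        _SetDataFixedDouble(dedy, 1.0);
    else if (dedy->dataType == X_INT)
        _SetDataFixedInt(dedy, 1);
    else
        ShowNTErrors("TODO");
}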
......
@@ -735,7 +735,7 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
     if (!isEfficient || b->isGrad) {
         XNoder::MakeGrad(b);
-        _Multiply(node->grad, a, b->grad, 1.0F);;
+        _Multiply(node->grad, a, b->grad, 1.0F);
     }
 
     node->visitMark = NODE_FINISHED;
@@ -855,7 +855,6 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    DTYPE beta = income.GetParam(0);
 
     XNoder::MakeGrad(a);
     _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
@@ -1319,7 +1318,7 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    DTYPE beta = income.GetParam(0);
+    //DTYPE beta = income.GetParam(0);
 
     XNoder::MakeGrad(a);
     _Sum(a->grad, node->grad, a->grad);
......
@@ -68,7 +68,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
 bool XShapeGrad::IsShapeOP(XTensor * node)
 {
     XLink &income = node->income;
-    return (income.typeID & SHAPE_BASE) != 0;
+    return (income.typeID & DATA_BASE) != 0;
 }
 
 /* post processing of a node */
@@ -271,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
     CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
 
     XTensor * last = NULL;
-    XList smalls(income.tailNum);
-    XList smallsGrad(income.tailNum);
+    TensorList smalls(income.tailNum);
+    TensorList smallsGrad(income.tailNum);
     bool mergeOnly = true;
 
     for(int i = 0; i < income.tailNum; i++){
         XTensor * tail = income.tails[i];
@@ -440,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
     /* we compute the gradient for current node, rather than for
        child node, i.e., we use the outgoing edge here */
     XLink &outgo = node->outgo;
-    XList splits(outgo.tailNum);
+    TensorList splits(outgo.tailNum);
     int whereToSplit = -1;
     int splitNum = 0;
......
@@ -54,7 +54,7 @@ private:
     static
     void GradGather(XTensor * node, bool isEfficent);
 
-    /* gradient computation for dropout with indexs */
+    /* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
     static
     void GradDropoutWithIndex(XTensor * node, bool isEfficent);
......
@@ -37,16 +37,16 @@ struct XNet
     unsigned int id;
 
     /* tensor nodes of the network (in order) */
-    XList nodes;
+    TensorList nodes;
 
     /* tensor nodes to keep gradient for output (e.g., SGD) */
-    XList gradNodes;
+    TensorList gradNodes;
 
     /* output nodes of the network */
-    XList outputs;
+    TensorList outputs;
 
     /* input nodes of the network */
-    XList inputs;
+    TensorList inputs;
 
     /* indicates whether the network just keeps the gradient for parameter tensors */
     bool isGradEfficient;
@@ -71,15 +71,15 @@ struct XNet
     /* backward propagation to obtain gradient
        with a number of root nodes */
-    void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
 
     /* backward propagation to obtain gradient
        with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
 
     /* backward propagation to obtain gradient wrt. the loss/error function
        with a number of root nodes */
-    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
 
     /* backward computation for a given node */
     void BackwardNode(XTensor * node, bool isEfficent = false);
@@ -93,10 +93,10 @@ struct XNet
     /* traverse the net and find the topological order by
        depth-first search (Tarjan's algorithm) */
-    void Traverse(XList &roots);
+    void Traverse(TensorList &roots);
 
     /* depth-first search given a node (Tarjan's algorithm for topological ordering) */
-    void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
+    void TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code);
 
     /* dump network information */
     void Dump(FILE * file);
......
@@ -247,13 +247,13 @@ void Check(FNNModel &model)
 /* make a hard copy of the fnn model */
 void Copy(FNNModel &tgt, FNNModel &src)
 {
-    InitTensor(&tgt.embeddingW, &src.embeddingW);
+    InitTensorV2(&tgt.embeddingW, &src.embeddingW);
     for(int i = 0; i < MAX_HIDDEN_NUM; i++){
-        InitTensor(&tgt.hiddenW[i], &src.hiddenW[i]);
-        InitTensor(&tgt.hiddenB[i], &src.hiddenB[i]);
+        InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
+        InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
     }
-    InitTensor(&tgt.outputW, &src.outputW);
-    InitTensor(&tgt.outputB, &src.outputB);
+    InitTensorV2(&tgt.outputW, &src.outputW);
+    InitTensorV2(&tgt.outputB, &src.outputB);
 
     tgt.n = src.n;
     tgt.eSize = src.eSize;
@@ -310,7 +310,7 @@ initialize a 1d tensor using the fnn model setting
 */
 void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
 {
-    InitTensor1D(&tensor, num, X_FLOAT, model.devID, model.mem);
+    InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
 }
 
 /*
@@ -322,7 +322,7 @@ initialize a 2d tensor using the fnn model setting
 */
 void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
 {
-    InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, model.devID, model.mem);
+    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
 }
@@ -449,6 +449,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
         /* the gold standard */
         XTensor gold;
 
+        /* the loss tensor */
+        XTensor lossTensor;
+
         /* make the input tensor for position i */
         for(int i = 0; i < model.n - 1; i++)
             MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
@@ -466,6 +469,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
             /* forward computation */
             Forward(inputs, output, model, net);
 
             /* backward computation to obtain gradients */
             Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
@@ -481,11 +486,13 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
             /* this is implemented by gather function */
             ForwardAutoDiff(ngrams, ngramNum, output, model);
 
-            ///* this is implemented by multiply function */
+            /* this is implemented by multiply function */
             //ForwardAutoDiff(inputs, output, model);
 
+            lossTensor = CrossEntropy(output, gold);
+
             /* automatic differentiation */
-            autoDiffer.Backward(output, gold, CROSSENTROPY);
+            autoDiffer.Backward(lossTensor);
+            //autoDiffer.Backward(output, gold, CROSSENTROPY);
 
             /* update model parameters */
             Update(model, grad, learningRate, true);
@@ -494,7 +501,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
             /* get probabilities */
             float prob = GetProb(output, gold);
 
-            loss += -prob;
+            prob = ReduceSumAll(lossTensor);
+            loss += prob;
 
             wordCount += ngramNum;
             wordCountTotal += ngramNum;
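Taken together, the hunks above switch the FNNLM auto-diff path from a LogSoftmax output with an implicit loss to a Softmax output plus an explicit loss tensor. In outline (a condensed sketch of the new flow assembled from this commit, not a verbatim excerpt):

/* new training flow, condensed from the hunks above */
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
lossTensor = CrossEntropy(output, gold);
autoDiffer.Backward(lossTensor);     /* backward now starts from the loss node */
prob = ReduceSumAll(lossTensor);     /* scalar loss, replacing the GetProb bookkeeping */
loss += prob;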
@@ -537,8 +546,8 @@ update the model parameters using the delta rule
 */
 void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
 {
-    XList paraList(10);
-    XList gradList(10);
+    TensorList paraList(10);
+    TensorList gradList(10);
 
     paraList.Add(&model.outputW);
     paraList.Add(&model.outputB);
@@ -595,14 +604,14 @@ get prediction probabilities of the gold words
 float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
 {
     XTensor probs;
-    InitTensor(&probs, &output);
+    InitTensorV2(&probs, &output);
 
     /* probs[i,j] = output[i,j] * gold[i,j] */
     _Multiply(&output, &gold, &probs);
 
     /* probability of each word */
     XTensor wprobs;
-    InitTensor1D(&wprobs, output.GetDim(0), output.dataType, output.devID, output.mem);
+    InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
     _ReduceSum(&probs, &wprobs, 1);
 
     if(wordProbs != NULL)
         _CopyValues(&wprobs, wordProbs);
@@ -616,7 +625,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
     /* probability for the batch */
     XTensor result;
-    InitTensor1D(&result, 1, X_FLOAT, output.devID, output.mem);
+    InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
     _ReduceSum(&probs, &result, 1);
 
     return result.Get1D(0);
@@ -718,7 +727,7 @@ The indexed cell is set to 1, and 0 otherwise.
 void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
                          int itemNum, int devID, XMem * mem)
 {
-    InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
+    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
 
     tensor.SetZeroAll();
@@ -765,7 +774,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
     int batchSize = -1;
     int n = model.n;
     int depth = model.hDepth;
-    XList eList(n - 1);
+    TensorList eList(n - 1);
 
     /* previous n - 1 words */
     for(int i = 0; i < n - 1; i++){
@@ -811,7 +820,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
     /* make a 2d tensor for the bias term */
     XTensor b2D;
-    InitTensor(&b2D, &s);
+    InitTensorV2(&b2D, &s);
     _Unsqueeze(&b, &b2D, 0, batchSize);
 
     /* introduce bias term:
@@ -843,7 +852,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
     _MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
 
     XTensor b2D;
-    InitTensor(&b2D, &s);
+    InitTensorV2(&b2D, &s);
     _Unsqueeze(&b, &b2D, 0, batchSize);
 
     _Sum(&s, &b2D, &s);
@@ -908,8 +917,8 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
     XTensor dedsHidden;
     XTensor dedxBottom;
     if (depth > 0)
-        InitTensor(&dedsHidden, &dedx);
-    InitTensor(&dedxBottom, &net.embeddingCat);
+        InitTensorV2(&dedsHidden, &dedx);
+    InitTensorV2(&dedxBottom, &net.embeddingCat);
 
     /* back-propagation from top to bottom in the stack of hidden layers
        for each layer, h = f(s) */
@@ -943,11 +952,11 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
         _CopyValues(&dedx, &gradPassed);
     }
 
-    XList eList(n - 1);
+    TensorList eList(n - 1);
 
     /* back-propagation for the embedding layer */
     for (int i = 0; i < n - 1; i++) {
-        XTensor * dedy = NewTensor2D(batchSize, model.eSize, X_FLOAT, model.devID, model.mem);
+        XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
         eList.Add(dedy);
     }
@@ -999,7 +1008,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
         }
     }
 
-    InitTensor1D(&words, size, X_INT, model.devID, model.mem);
+    InitTensor1DV2(&words, size, X_INT, model.devID);
     words.SetData(index, size);
     embeddingBig = Gather(model.embeddingW, words);
@@ -1017,7 +1026,8 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
         hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
 
     /* output layer */
-    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+    //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
 }
 /*
@@ -1036,7 +1046,7 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
     XTensor hidden;
     XTensor b;
 
-    XList inputList(n - 1);
+    TensorList inputList(n - 1);
     for(int i = 0; i < n - 1; i++)
         inputList.Add(inputs + i);
@@ -1177,7 +1187,7 @@ void Test(const char * test, const char * result, FNNModel &model)
         /* prediction probabilities */
         XTensor probs;
-        InitTensor1D(&probs, ngramNum);
+        InitTensor1DV2(&probs, ngramNum);
 
         /* get probabilities */
         float prob = GetProb(output, gold, &probs);
@@ -1200,7 +1210,6 @@ void Test(const char * test, const char * result, FNNModel &model)
     }
 
     fclose(file);
-    fclose(ofile);
 
     double elapsed = GetClockSec() - startT;
......
@@ -127,7 +127,7 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
     XTensor q2;
     XTensor v2;
     XTensor kqv2;
-    XList split;
+    TensorList split;
 
     kqv2 = MMul(kqv, wbig);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but i'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 16
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
class T2TBatchLoader
{
public:
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization */
void Init(int argc, char ** argv);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
};
}
#endif
\ No newline at end of file
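BatchNode carries a key precisely so that buffered batches can be reordered before loading. A minimal sketch of how such a key could drive sorting (the actual comparator inside T2TBatchLoader is not shown in this commit; this one is a hypothetical illustration):

#include <stdlib.h>

/* hypothetical comparator -- an assumption, not the library's code */
int CompareBatchNode(const void * a, const void * b)
{
    return ((BatchNode*)a)->key - ((BatchNode*)b)->key;
}

/* possible usage inside the loader:
   qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode); */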
@@ -31,6 +31,10 @@ namespace transformer
 /* constructor */
 AttDecoder::AttDecoder()
 {
+    attentions = NULL;
+    fnns = NULL;
+    attLayerNorms = NULL;
+    fnnLayerNorms = NULL;
     attentionsEnde = NULL;
     attEndeLayerNorms = NULL;
 }
@@ -38,6 +42,10 @@ AttDecoder::AttDecoder()
 /* de-constructor */
 AttDecoder::~AttDecoder()
 {
+    delete[] attentions;
+    delete[] fnns;
+    delete[] attLayerNorms;
+    delete[] fnnLayerNorms;
     delete[] attentionsEnde;
     delete[] attEndeLayerNorms;
 }
@@ -68,7 +76,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
     LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
 
     CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
-    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
+    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
 
     /* embedding model */
     embedder.InitModel(argc, argv, devID, mem, false);
@@ -160,6 +168,8 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
         x = fnnLayerNorms[i].Make(res);
     }
 
+    x.SetName(DECODING_NAME);
+
     return x;
 }
......
@@ -27,6 +27,9 @@
 namespace transformer
 {
 
+#define DECODING_NAME "decoding"
+#define DECODING_INPUT_NAME "decoding_input"
+
 class AttDecoder
 {
 public:
......
@@ -140,6 +140,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
         x = fnnLayerNorms[i].Make(res);
     }
 
+    x.SetName(ENCODING_NAME);
+    input.SetName(ENCODING_INPUT_NAME);
+
     return x;
 }
......
@@ -33,6 +33,9 @@ using namespace nts;
 namespace transformer
 {
 
+#define ENCODING_NAME "encoding"
+#define ENCODING_INPUT_NAME "encoding_input"
+
 /*
 base class of the encoder
 */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> alpha - the parameter that controls the length preference
<< return - length penalty of the sequence (for each entry)
*/
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
{
XTensor base;
XTensor lp;
base = (length + 5)/(1 + 5);
lp = Power(base, alpha);
return lp;
}
}
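For intuition, the same formula in scalar form: with alpha = 0.6, a 10-word hypothesis gets lp = ((5 + 10)/6)^0.6, roughly 1.73, so longer hypotheses are divided by a larger penalty when scores are compared (assuming, as in GNMT, that the model score is probPath / lp; the division itself is not shown in this file):

#include <math.h>

/* scalar sketch of T2TLengthPenalizer::GNMT, for a single length n */
float GNMTScalar(int n, float alpha)
{
    return powf((5.0F + n) / (5.0F + 1.0F), alpha);
}

/* GNMTScalar(10, 0.6F) == powf(2.5F, 0.6F), approximately 1.73 */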
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
  * All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,45 +16,33 @@
  */
 
 /*
- * backward computation for data operation
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
+ * Start of a new week - I just finished several documents.
+ * Writing document is harder than writing code :)
  */
 
-#include "../tensor/XTensor.h"
-#include "../tensor/function/FHeader.h"
-
-#ifndef __XBACKWARDDATA_H__
-#define __XBACKWARDDATA_H__
-
-namespace nts{
-
-/* this class computes the gradient for tensor data operation given a node */
-class XDataGrad
-{
-public:
-    /* compute dE/dx of a node */
-    static
-    void MakeGrad(XTensor * node, bool isEfficent);
-
-    /* indicates whether the node is for a shaping operation */
-    static
-    bool IsDataOP(XTensor * node);
-
-private:
-    /* gradient computation for ConverDataType: b = converdatatype(a, datatype) */
-    static
-    void GradConvertDataType(XTensor * node, bool isEfficent);
-
-    /* gradient computation for IndexToOnehot: b = indextoonehot(a, num) */
-    static
-    void GradIndexToOnehot(XTensor * node, bool isEfficent);
-
-    /* gradient computation for OnehotToIndex: b = onehottoindex(a, num) */
-    static
-    void GradOnehotToIndex(XTensor * node, bool isEfficent);
-};
-
-} // namespace nts(NiuTrans.Tensor)
+#ifndef __T2TLENGTHPENALTY_H__
+#define __T2TLENGTHPENALTY_H__
+
+#include "../../tensor/XTensor.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* We intend to penalize short sequences because they have higher score
+   in product of a sequence of probability-like terms and have more chances
+   to beat others in search. */
+class T2TLengthPenalizer
+{
+public:
+    /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
+       where n = length of the sequence */
+    static
+    XTensor GNMT(const XTensor & length, float alpha);
+};
+
+}
 
 #endif
@@ -23,6 +23,7 @@
 #include "T2TModel.h"
 #include "T2TUtility.h"
 #include "../../tensor/core/CHeader.h"
+#include "../../tensor/XUtility.h"
 
 namespace transformer
 {
@@ -44,10 +45,13 @@ T2TModel::T2TModel()
 /* de-constructor */
 T2TModel::~T2TModel()
 {
-    delete mem;
     delete encoder;
     delete decoder;
     delete outputLayer;
+
+    /* we delete "mem" at the end because other members are using it and we must
+       remove the memory space after all tensors are destroyed. */
+    delete mem;
 }
 
 /*
@@ -81,7 +85,7 @@ void T2TModel::InitModel(int argc, char ** argv)
     if(isMT)
         decoder->InitModel(argc, argv, true, 0, devID, mem);
 
-    XList params(10);
+    TensorList params(10);
     GetParams(params);
 
     for(int i = 0; i < params.count; i++){
@@ -359,12 +363,17 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
     dims[inputDec.order + 1] = len;
     InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
 
-    /* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
-       this matrix can be used to prevent the attention to current or following words in
+    /* An upper triangular matrix where the cells of the upper triangular are set to -1e9.
+       This matrix can be used to block the attention to current or following words in
        a given sequence. */
     _SetDataLowTri(&maskDec, 1e9F, 0);
+
+    //maskDec.Dump(stderr, "mask: ");
+
     _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
 
+    //maskDec.Dump(stderr, "mask: ");
+
     /* encoder-decoder mask that prevents the attention to padding dummy words */
     dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
     InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
@@ -374,9 +383,18 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
     XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
 
     _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
+
+    //paddingEnc.Dump(stderr, "paddingenc:");
+    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
+
     _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
+
+    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
+
     _Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
+
+    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
 
     DelTensorBuf(maskEncDecTMPDec);
     DelTensorBuf(maskEncDecTMPEnc);
 
     delete[] dims;
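To see what the two calls on maskDec produce, here is a CPU-side illustration in plain C++ (not library code): _SetDataLowTri writes 1e9 on and below the diagonal, and the scale-and-shift maps that to 0 on/below the diagonal and -1e9 above it, i.e., an additive mask that blocks attention to future positions:

#include <stdio.h>

int main()
{
    const int n = 3;
    float m[3][3];
    /* _SetDataLowTri(&maskDec, 1e9F, 0): 1e9 on and below the diagonal */
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            m[i][j] = (j <= i) ? 1e9F : 0.0F;
    /* _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F): x -> x * 1.0 - 1e9 */
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            m[i][j] = m[i][j] * 1.0F - 1e9F;
    /* prints 0 on/below the diagonal and -1e9 above it */
    for (int i = 0; i < n; i++, printf("\n"))
        for (int j = 0; j < n; j++)
            printf("%12.0f", m[i][j]);
    return 0;
}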
@@ -385,7 +403,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
 get parameter matrices
 >> list - the list that keeps the parameter matrices
 */
-void T2TModel::GetParams(XList &list)
+void T2TModel::GetParams(TensorList &list)
 {
     list.Clear();
     list.Add(&outputLayer->w);
@@ -442,10 +460,12 @@ dump the parameters
 */
 void T2TModel::Dump(const char * fn)
 {
+    double startT = GetClockSec();
+
     FILE * file = fopen(fn, "wb");
     CheckNTErrors(file, "Cannot open the model file");
 
-    XList params(100);
+    TensorList params(100);
     GetParams(params);
 
@@ -456,16 +476,20 @@ void T2TModel::Dump(const char * fn)
     fclose(file);
 
-    XPRINT(0, stderr, "[INFO] model saved\n");
+    double elapsed = GetClockSec() - startT;
+    XPRINT1(0, stderr, "[INFO] model saved (took %.1fs)\n", elapsed);
 }
 
 /* read the parameters */
 void T2TModel::Read(const char * fn)
 {
+    double startT = GetClockSec();
+
     FILE * file = fopen(fn, "rb");
     CheckNTErrors(file, "Cannot open the model file");
 
-    XList params(100);
+    TensorList params(100);
     GetParams(params);
 
@@ -476,7 +500,9 @@ void T2TModel::Read(const char * fn)
     fclose(file);
 
-    XPRINT(0, stderr, "[INFO] model loaded\n");
+    double elapsed = GetClockSec() - startT;
+    XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
 }
 
 }
@@ -98,7 +98,7 @@ public:
                        XTensor &maskDec, XTensor &maskEncDec);
 
     /* get parameter matrices */
-    void GetParams(XList &list);
+    void GetParams(TensorList &list);
 
     /* dump the parameters */
     void Dump(const char * fn);
......
@@ -95,6 +95,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
     //output = LogSoftmax(MMul(x, w), -1);
     output = Softmax(MMul(x, w), -1);
+    output.SetName(OUTPUT_NAME);
 }
 
 }
@@ -29,6 +29,8 @@ using namespace nts;
 namespace transformer
 {
 
+#define OUTPUT_NAME "output"
+
 /* output layer */
 class T2TOutput
 {
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TStateBundle::T2TStateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if(states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if(states != NULL)
delete[] states;
states = new T2TState[num];
for(int i = 0; i < num; i++){
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = -1;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
{
state->layersEnc.Clear();
state->layersDec.Clear();
XTensor * encoding = XLink::SearchNode(top, ENCODING_NAME);
CheckNTErrors(encoding != NULL, "No encoding layers found!");
state->layersEnc.Add(encoding);
state->layersDec.Add(NULL);
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensor(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
InitTensor(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
InitTensor(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID, input->mem);
state->probPath.SetZeroAll();
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states (assuming that the current state has been read)
>> encoding - encoder output
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder
*/
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor * inputEnc, XTensor * paddingEnc)
{
int dims[MAX_TENSOR_DIM_NUM];
next->layersEnc.Clear();
next->layersDec.Clear();
AttDecoder &decoder = *m->decoder;
/* word indices of previous positions */
XTensor * inputLast = (XTensor*)s->layersDec.GetItem(0);
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for(int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID, inputEnc->mem);
_SetDataFixedInt(&first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) {
inputDec = Identity(first);
}
else{
inputDec = GeneratePaths(s);
inputDec.SetDevice(inputEnc->devID, inputEnc->mem);
inputDec = Concatenate(first, inputDec, inputDec.order - 1);
}
/* prediction probabilities */
XTensor &output = next->prob;
XTensor decoding;
XTensor decodingStep;
for(int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID, paddingEnc->mem);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
XTensor selectSrc;
XTensor selectTgt;
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
int stride = decoding.GetDim(decoding.order - 2);
InitTensor1D(&selectSrc, 1, X_INT);
InitTensor1D(&selectTgt, 1, X_INT);
selectSrc.SetInt(stride - 1, 0);
selectTgt.SetInt(0, 0);
selectSrc.SetDevice(decoding.devID, decoding.mem);
selectTgt.SetDevice(decoding.devID, decoding.mem);
/* the decoder output of the last position */
decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
/* generate the output probabilities */
m->outputLayer->Make(decodingStep, output);
_LogMe(&output);
next->layersEnc.AddList(&s->layersEnc);
next->layersDec.Add(&inputDec);
next->layersDec.Add(&output);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
nsteps++;
cur = cur->last;
}
if(nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2D(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
}
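GeneratePaths above recovers token sequences by walking the "last" pointers. A self-contained sketch of the same back-pointer unwinding, with a hypothetical State type standing in for T2TState (the real code right-aligns shorter paths with zero padding in exactly this way):

#include <vector>

struct State { int prediction; State * last; };

/* returns a stateNum x distance matrix; shorter chains are right-aligned */
std::vector<std::vector<int> > Unwind(State * states, int stateNum)
{
    /* first pass: the longest chain fixes the matrix width */
    int distance = 0;
    for (int i = 0; i < stateNum; i++) {
        int n = 0;
        for (State * c = states + i; c != NULL; c = c->last)
            n++;
        if (n > distance)
            distance = n;
    }
    /* second pass: fill predictions from the end of each chain backwards */
    std::vector<std::vector<int> > path(stateNum, std::vector<int>(distance, 0));
    for (int i = 0; i < stateNum; i++) {
        int n = 0;
        for (State * c = states + i; c != NULL; c = c->last) {
            n++;
            path[i][distance - n] = c->prediction;
        }
    }
    return path;
}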
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
etc. It can be regarded as a hypothesis in translation. */
class T2TState
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we go over so far */
int nstep;
/* pointer to the previous state */
T2TState * last;
};
/* a bundle of states */
class T2TStateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
XTensor nstep;
/* layers on the encoder side. We actually use the encoder output instead
of all hidden layers. */
TensorList layersEnc;
/* layers on the decoder side */
TensorList layersDec;
/* list of states */
T2TState * states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
T2TStateBundle();
/* de-constructor */
~T2TStateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel * m;
/* current state */
T2TStateBundle * s;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TPredictor();
/* de-constructor */
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel * model, T2TStateBundle * state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "T2TModel.h"
#include "T2TPredictor.h"
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos;
/* array of the end symbols */
int * endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TSearch();
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char ** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding,
XTensor * output, XTensor * score);
/* preparation */
void Prepare(int myBatchSize,int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
/* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle * beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output, XTensor * score);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle * beam);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
batchLoader.Init(argc, argv);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
float loss = 0;
/* data files */
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
double startT = GetClockSec();
wordCount = 0;
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
batchLoader.SetRandomBatch(false);
batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, mem, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
CheckNTErrors(!model->isLM, "Only MT model is supported!");
XTensor output;
XTensor score;
seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
Dump(ofile, &output);
float prob = 0;
loss += -prob;
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
elapsed, wordCountTotal, sentCount, exp(loss/wordCount));
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
{
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
fprintf(file, "%d ", w);
if (w < 0)
break;
}
fprintf(file, "\n");
}
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#include "T2TSearch.h"
#include "T2TBatchLoader.h"
namespace transformer
{
/* This class translates test sentences with a trained model. */
class T2TTester
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* for batching */
T2TBatchLoader batchLoader;
/* decoder for inference */
T2TSearch searcher;
public:
/* constructor */
T2TTester();
/* de-constructor */
~T2TTester();
/* initialize the tester */
void Init(int argc, char ** argv);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* dump the result into the file */
void Dump(FILE * file, XTensor * output);
};
}
#endif
\ No newline at end of file
@@ -23,35 +23,14 @@
 #define __T2TTRAINER_H__
 #include "T2TModel.h"
+#include "T2TBatchLoader.h"
 #include "../../tensor/function/FHeader.h"
-#define MAX_SEQUENCE_LENGTH 1024 * 4
 using namespace nts;
 namespace transformer
 {
-/* node to keep batch information */
-struct BatchNode
-{
-/* beginning position */
-int beg;
-/* end position */
-int end;
-/* maximum word number on the encoder side */
-int maxEnc;
-/* maximum word number on the decoder side */
-int maxDec;
-/* a key for sorting */
-int key;
-};
 /* trainer of the T2T model */
 class T2TTrainer
 {
@@ -62,42 +41,6 @@ public:
 /* parameter array */
 char ** argArray;
-/* buffer for loading words */
-int * buf;
-/* another buffer */
-int * buf2;
-/* batch buf */
-BatchNode * bufBatch;
-/* buffer size */
-int bufSize;
-/* size of batch buffer */
-int bufBatchSize;
-/* length of each sequence */
-int * seqLen;
-/* another array */
-int * seqLen2;
-/* offset of the first word for each sequence */
-int * seqOffset;
-/* number of sequences in the buffer */
-int nseqBuf;
-/* offset for next sequence in the buffer */
-int nextSeq;
-/* offset for next batch */
-int nextBatch;
-/* indicates whether the sequence is sorted by length */
-bool isLenSorted;
 /* dimension size of each inner layer */
 int d;
@@ -139,10 +82,10 @@ public:
 float adamBeta2T;
 /* list of the moment of the parameter matrices */
-XList moments;
+TensorList moments;
 /* list of the 2nd order moment of the parameter matrices */
-XList moments2nd;
+TensorList moments2nd;
 /* indicates whether the data file is shuffled for training */
 bool isShuffled;
@@ -159,25 +102,14 @@ public:
 /* number of batches on which we do model update */
 int updateStep;
-/* indicates whether we double the </s> symbol for the output of lms */
-bool isDoubledEnd;
-/* indicates whether we use batchsize = max * sc
-rather than batchsize = word-number, where max is the maximum
-length and sc is the sentence number */
-bool isSmallBatch;
-/* counterpart of "isSmallBatch" */
-bool isBigBatch;
-/* randomize batches */
-bool isRandomBatch;
 /* indicates whether we intend to debug the net */
 bool isDebugged;
-/* bucket size */
-int bucketSize;
+/* indicates whether the sequence is sorted by length */
+bool isLenSorted;
+/* for batching */
+T2TBatchLoader batchLoader;
 public:
 /* constructor */
@@ -198,46 +130,6 @@ public:
 /* make a checkpoint */
 void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
-/* load data to buffer */
-int LoadBuf(FILE * file, bool isSorted, int step);
-/* clear data buffer */
-void ClearBuf();
-/* load a batch of sequences */
-int LoadBatch(FILE * file, bool isLM,
-XTensor * batchEnc, XTensor * paddingEnc,
-XTensor * batchDec, XTensor * paddingDec,
-XTensor * gold, XTensor * label,
-int * seqs,
-int vsEnc, int vsDec, int sBatch, int wBatch,
-bool isSorted, int &ws, int &wCount,
-int devID, XMem * mem,
-bool isTraining);
-/* load a batch of sequences (for language modeling) */
-int LoadBatchLM(FILE * file,
-XTensor * batchEnc, XTensor * paddingEnc,
-XTensor * batchDec, XTensor * paddingDec,
-XTensor * gold, XTensor * label,
-int * seqs, int vs, int sBatch, int wBatch,
-bool isSorted, int &wCount,
-int devID, XMem * mem,
-bool isTraining);
-/* load a batch of sequences (for machine translation) */
-int LoadBatchMT(FILE * file,
-XTensor * batchEnc, XTensor * paddingEnc,
-XTensor * batchDec, XTensor * paddingDec,
-XTensor * gold, XTensor * label,
-int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
-bool isSorted, int &ws, int &wCount,
-int devID, XMem * mem,
-bool isTraining);
-/* shuffle the data file */
-void Shuffle(const char * srcFile, const char * tgtFile);
 /* get word probabilities for a batch of sequences */
 float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
...
@@ -25,6 +25,8 @@
 #include "T2TModel.h"
 #include "T2TUtility.h"
 #include "T2TTrainer.h"
+#include "T2TPredictor.h"
+#include "T2TTester.h"
 #include "../../tensor/XDevice.h"
 #include "../../tensor/XUtility.h"
 #include "../../tensor/XGlobal.h"
@@ -47,6 +49,7 @@ int TransformerMain(int argc, const char ** argv)
 ShowParams(argc, args);
+bool isBeamSearch = false;
 char * trainFN = new char[MAX_LINE_LENGTH];
 char * modelFN = new char[MAX_LINE_LENGTH];
 char * testFN = new char[MAX_LINE_LENGTH];
@@ -56,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
 LoadParamString(argc, args, "model", modelFN, "");
 LoadParamString(argc, args, "test", testFN, "");
 LoadParamString(argc, args, "output", outputFN, "");
+LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
 srand((unsigned int)time(NULL));
@@ -65,27 +69,34 @@ int TransformerMain(int argc, const char ** argv)
 T2TModel model;
 model.InitModel(argc, args);
-//if(strcmp(modelFN, ""))
-//    model.Read(modelFN);
 /* learn model parameters */
 if(strcmp(trainFN, ""))
 trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
 /* save the final model */
-//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-//model.Dump(modelFN);
+if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+model.Dump(modelFN);
 /* load the model if necessary */
-//if(strcmp(modelFN, ""))
-//model.Read(modelFN);
+if(strcmp(modelFN, ""))
+model.Read(modelFN);
+/* test the model on the new data */
+if(strcmp(testFN, "") && strcmp(outputFN, "")){
+/* beam search */
+if(isBeamSearch){
+T2TTester searcher;
+searcher.Init(argc, args);
+searcher.Test(testFN, outputFN, &model);
+}
+/* forced decoding */
+else{
 T2TTrainer tester;
 tester.Init(argc, args);
-/* test the model on the new data */
-if(strcmp(testFN, "") && strcmp(outputFN, ""))
 tester.Test(testFN, outputFN, &model);
+}
+}
 delete[] trainFN;
 delete[] modelFN;
...
@@ -201,7 +201,8 @@ void XDevice::SetGPUDevice(int devID)
 cudaError_t error = cudaSetDevice(devID);
 if (error != cudaSuccess){
-fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
+fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n",
+devID, error, cudaGetErrorString(error));
 exit(1);
 }
 #else
@@ -216,7 +217,7 @@ void XDevice::SetGPUDeviceFast(int devID)
 SetFastFlags();
 }
-/* switch to a get current dev */
+/* get the id of the current GPU device */
 int XDevice::GetGPUDevice()
 {
 #ifdef USE_CUDA
@@ -224,7 +225,8 @@ int XDevice::GetGPUDevice()
 cudaError_t error = cudaGetDevice(&devID);
 if (error != cudaSuccess){
-fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
+fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n",
+devID, error, cudaGetErrorString(error));
 exit(1);
 }
@@ -248,7 +250,7 @@ void XDevice::SetFastFlags()
 #endif
 }
-/* reset cuda flag for more efficient cuda execution (all devices) */
+/* reset the cuda flag for more efficient cuda execution (all devices) */
 void XDevice::SetFastFlagsAllDevices()
 {
 #ifdef USE_CUDA
@@ -266,10 +268,6 @@ XDevManager::XDevManager()
 {
 Clear();
 Init();
-#ifndef USE_CPP11
-fprintf(stderr, "Warning!!! c++ 11 is RECOMMENDED for compilation.\n");
-#endif
 }
 /* de-constructor */
@@ -278,7 +276,7 @@ XDevManager::~XDevManager()
 }
-/* initialize it and get the CPU and GPU information */
+/* initialization */
 void XDevManager::Init()
 {
 srand((unsigned int)time(NULL));
@@ -322,7 +320,7 @@ void XDevManager::Clear()
 #ifdef USE_CUDA
-/* get the handle of GPU */
+/* get the handle of a given GPU */
 cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
 {
 CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
@@ -330,7 +328,7 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
 return GPUs[devID].GetCublasHandle();
 }
-/* get the stream of cuda */
+/* get the stream of a given GPU */
 cudaStream_t * XDevManager::GetCudaStream(const int devID)
 {
 CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
@@ -478,7 +476,7 @@ split a string
 >> items - splitting result
 << return - how many items are there
 */
-int SplitALine(char * inputString, const char * seperator, XList * items)
+int SplitALine(char * inputString, const char * seperator, StrList* items)
 {
 items->Clear();
@@ -527,12 +525,12 @@ get device ids for the given device information
 devInfo = "0:CPU-1 1:GPU-0 2:CPU-1"
 means that the first device is CPU, the second device
 is GPU-0, the third device is CPU.
->> devIDs - device sequence specified by devInfo
+>> devIDs - device IDs specified by devInfo
 << return - number of devices
 */
 int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
 {
-XList * terms = new XList(1);
+StrList* terms = new StrList(1);
 SplitALine(devInfo, " ", terms);
 for(int i = 0; i < terms->count; i++){
@@ -569,7 +567,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
 return devCount;
 }
-/* show id sequence */
+/* show device IDs */
 void XDevManager::ShowDeviceIDs(char * devInfo, char * msg)
 {
 msg[0] = 0;
...
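A minimal usage sketch for GetDeviceIDs (illustrative, not part of this commit; it assumes the global GDevs manager declared below and that CPU entries map to the id -1):

char devInfo[] = "0:CPU-1 1:GPU-0 2:CPU-1";
int devIDs[16];
int n = GDevs.GetDeviceIDs(devInfo, devIDs);
/* expected: n == 3 and devIDs == {-1, 0, -1} */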
@@ -236,6 +236,18 @@ extern XDevManager GDevs;
 cudaSetDevice(devIDBackup); \
 } \
+#define CheckDev(a, b) \
+{ \
+if((a < 0 && b >= 0) || (a >= 0 && b < 0)){ \
+fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, a, b); \
+exit(1); \
+} \
+else if (a >= 0 && b >= 0 && a != b) { \
+fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, a, b); \
+exit(1); \
+} \
+} \
 } /* end of the nts (NiuTrans.Tensor) namespace */
 #endif
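A minimal usage sketch for the new CheckDev macro (illustrative, not part of this commit): it guards a binary operation so that all operands live on the same device; a, b and c are hypothetical tensors.

void SumSketch(const XTensor * a, const XTensor * b, XTensor * c)
{
    /* abort with an error message if the device ids do not match */
    CheckDev(a->devID, b->devID);
    CheckDev(a->devID, c->devID);
    /* ... the actual element-wise sum would go here ... */
}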
@@ -32,8 +32,6 @@
 #ifndef WIN32
 #include <sys/time.h>
 #include <unistd.h>
-#include <stdint.h>
-typedef int8_t __int8;
 #endif
 // the CUDA stuff
@@ -45,10 +43,6 @@ typedef int8_t __int8;
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts {
-#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
-#define USE_CPP11
-#endif
 #define _XINLINE_
 //#define DOUBELPRICSION
@@ -159,7 +153,9 @@ extern bool useCUDA;
 #define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
 #define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}
-#define B2I(V) V==0?false:true
+#define B2I(V) V == 0 ? false : true
+#define MODX(a, b) int(b == 0 ? a : a - floor(double(a)/b) * b)
 /* BLAS interfaces */
 #ifdef DOUBELPRICSION
...
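A quick check of the new MODX macro (illustrative, not part of this commit; it assumes math.h is available for floor): it implements a floored modulo, so the result follows the sign of b, and b == 0 simply returns a.

int r1 = MODX(7, 3);   /* 1 */
int r2 = MODX(-3, 5);  /* 2, unlike the C operator %, for which -3 % 5 == -3 */
int r3 = MODX(4, 0);   /* 4, by the b == 0 guard */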
@@ -31,15 +31,15 @@ namespace nts{
 /* constructor */
 template<HeapType hType, typename T>
+XHeap<hType, T>::XHeap()
+{
+}
+/* constructor */
+template<HeapType hType, typename T>
 XHeap<hType, T>::XHeap(int mySize, XMem * myMem)
 {
-mem = myMem;
-size = mySize;
-count = 0;
-if (mem == NULL)
-items = new HeapNode<T>[mySize];
-else
-mem->Alloc(mem->devID, mySize * sizeof(T));
+Init(mySize, myMem);
 }
 /* deconstructor */
@@ -50,6 +50,19 @@ XHeap<hType, T>::~XHeap()
 }
 template<HeapType hType, typename T>
+void XHeap<hType, T>::Init(int mySize, XMem * myMem)
+{
+mem = myMem;
+size = mySize;
+count = 0;
+if (mem == NULL)
+items = new HeapNode<T>[mySize];
+else
+mem->Alloc(mem->devID, mySize * sizeof(T));
+}
+template<HeapType hType, typename T>
 void XHeap<hType, T>::Clear(T initValue)
 {
 count = 0;
@@ -89,10 +102,24 @@ _XINLINE_ HeapNode<T> XHeap<hType, T>::End()
 template<HeapType hType, typename T>
 _XINLINE_ void XHeap<hType, T>::Push(HeapNode<T> node)
 {
-//CheckNTErrors((count < size), "Heap is full!");
+if (count < size) {
 items[count] = node;
 Up(count);
 count++;
+}
+else if(count == size){
+HeapNode<T> & item0 = items[0];
+if (hType == MIN_HEAP && item0.value >= node.value)
+return;
+else if (hType == MAX_HEAP && item0.value <= node.value)
+return;
+items[0] = node;
+Down(0);
+}
+else {
+ShowNTErrors("Overflow of the heap!");
+}
 }
 /* replace the top-most item and update the heap */
@@ -107,7 +134,7 @@ _XINLINE_ void XHeap<hType, T>::ReplaceTop(HeapNode<T> node)
 template<HeapType hType, typename T>
 _XINLINE_ HeapNode<T> XHeap<hType, T>::Pop()
 {
-//CheckNTErrors((size > 0), "Empty heap!");
+CheckNTErrors(count > 0, "Empty heap!");
 HeapNode<T> node = items[0];
 items[0] = items[count - 1];
 count--;
...
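A minimal sketch of what the bounded Push enables (illustrative, not part of this commit): a MIN_HEAP of capacity k keeps the k largest values pushed so far, since the heap top is the smallest kept value and is evicted by any larger incoming node.

XHeap<MIN_HEAP, float> heap(4);
for (int i = 0; i < 100; i++)
    heap.Push(HeapNode<float>(i, (float)i));
/* the heap now holds the values 96..99; Pop() returns 96 first */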
@@ -39,7 +39,7 @@ template <typename T>
 struct HeapNode
 {
 /* node index */
-int index;
+long long index;
 /* value of the node */
 T value;
@@ -52,9 +52,16 @@ struct HeapNode
 HeapNode(int i, T v)
 {
-index = i;
+index = (long long)i;
 value = v;
 };
+HeapNode(void * i, T v)
+{
+index = (long long)i;
+value = v;
+}
 };
 /* a heap that keeps a data array of T */
@@ -76,11 +83,17 @@ public:
 public:
 /* constructor */
+XHeap();
+/* constructor */
 XHeap(int mySize, XMem * myMem = NULL);
 /* deconstructor */
 ~XHeap();
+/* initialization */
+void Init(int mySize, XMem * myMem = NULL);
 /* clear the data */
 void Clear(T initValue);
@@ -107,6 +120,9 @@ public:
 /* move item k up the tree */
 void Up(int k);
+/* how many items are kept in the heap */
+inline int Count() { return count; };
 };
 } /* end of the nts (NiuTrans.Tensor) namespace */
...
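The new default constructor plus Init() allows delayed initialization, e.g. for member objects or arrays (a sketch, not part of this commit):

XHeap<MAX_HEAP, float> heaps[8]; /* constructed empty */
for (int i = 0; i < 8; i++)
    heaps[i].Init(16);           /* sized later, when the capacity is known */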
@@ -300,9 +300,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id)
 if(h == NULL)
 return;
-XList list(2);
-list.Add(t1);
-list.Add(t2);
+TensorList list(2);
+list.Add((XTensor*)t1);
+list.Add((XTensor*)t2);
 MakeLink(&list, h, id);
 }
@@ -320,10 +320,10 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
 if (h == NULL)
 return;
-XList list(3);
-list.Add(t1);
-list.Add(t2);
-list.Add(t3);
+TensorList list(3);
+list.Add((XTensor*)t1);
+list.Add((XTensor*)t2);
+list.Add((XTensor*)t3);
 MakeLink(&list, h, id);
 }
@@ -334,7 +334,7 @@ create a hyper edge with a list of tensors and an output tensor
 >> h - head tensor
 >> id - id of the edge type
 */
-void XLink::MakeLink(const XList * list, XTensor * h, int id)
+void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
 {
 /* forward */
 XLink &income = h->income;
@@ -368,7 +368,7 @@ create a hyper edge with an input tensor and a list of output tensors
 >> list - a list of output tensors
 >> id - id of the edge type
 */
-void XLink::MakeLink(XTensor * t, XList * list, int id)
+void XLink::MakeLink(XTensor * t, TensorList * list, int id)
 {
 /* forward */
 for(int i = 0; i < list->count; i++){
@@ -624,7 +624,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
 ClearIncoming(target);
 int tailNum = reference->income.tailNum;
-XList tails(tailNum);
+TensorList tails(tailNum);
 for(int i = 0; i < tailNum; i++){
 XTensor * tail = (XTensor*)reference->income.tails[i];
 tails.Add(tail);
@@ -743,7 +743,7 @@ search for a node in a top-down manner by its name
 >> top - the top most node
 << return - the node we found
 */
-/*XTensor * XLink::SearchNode(XTensor * top, const char * name)
+XTensor * XLink::SearchNode(XTensor * top, const char * name)
 {
 if(!strcmp(top->name, name))
 return top;
@@ -758,7 +758,7 @@ search for a node in a top-down manner by its name
 }
 return NULL;
-}*/
+}
 } // namespace nts(NiuTrans.Tensor)
...
@@ -138,17 +138,17 @@ struct XLink
 static
 void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
-/* create a hyper edge with two input tensors and an output tensor */
+/* create a hyper edge with three input tensors and an output tensor */
 static
 void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
 /* create a hyper edge with a list of input tensors and an output tensor */
 static
-void MakeLink(const XList * list, XTensor * h, int id);
+void MakeLink(const TensorList * list, XTensor * h, int id);
 /* create a hyper edge with an input tensor and a list of output tensors */
 static
-void MakeLink(XTensor * h, XList * list, int id);
+void MakeLink(XTensor * h, TensorList * list, int id);
 /* add a parameter */
 static
@@ -191,8 +191,8 @@ struct XLink
 void ShowNode(FILE * file, XTensor * node);
 /* search a node in a top-down manner by its name */
-//static
-//XTensor * SearchNode(XTensor * top, const char * name);
+static
+XTensor * SearchNode(XTensor * top, const char * name);
 };
 } // namespace nts(NiuTrans.Tensor)
...
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,32 +15,31 @@
 * limitations under the License.
 */
 /*
 *
- * Implementation of list that keeps data items
+ * Implementation of template list that keeps data items
 *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
- * The first coding job this year!
+ * $Created by: HU Chi (huchinlp@foxmail.com)
 *
 */
-#ifndef __XLIST_H__
-#define __XLIST_H__
 #include "XMem.h"
 #include "XGlobal.h"
-/* the nts (NiuTrans.Tensor) namespace */
-namespace nts{
-typedef int (* ListCompare)(const void * item1, const void * item2);
+#ifndef __TensorList_H__
+#define __TensorList_H__
+/* the nts (NiuTrans.Tensor) namespace */
+namespace nts {
-/* the XList class */
-class XList
-{
+/* the TensorListBase class */
+template <typename T>
+struct TensorListBase {
 public:
 /* data items */
-void ** items;
+T *items;
 /* number of items */
 int count;
@@ -49,56 +48,88 @@ public:
 int maxNum;
 /* the memory pool for data array allocation */
-XMem * mem;
-/* indicates whether data items are integers */
-bool isIntList;
+XMem* mem;
 public:
 /* constructor */
-XList();
+TensorListBase();
 /* constructor */
-XList(int myMaxNum, bool isIntListOrNot = false);
+TensorListBase(int myMaxNum);
 /* constructor */
-XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false);
+TensorListBase(int myMaxNum, XMem* myMem);
 /* de-constructor */
-~XList();
+~TensorListBase();
-/* utilities */
-void Create(int myMaxNum, XMem * myMem);
-void Add(const void * item);
-void Add(void ** inputItems, int inputItemCount);
-void AddList(XList * l);
-void AddInt(int i);
-void Insert(int pos, void * item);
-void * GetItem(int i) const;
-int GetItemInt(int i);
-void SetItem(int i, void * item);
-void SetItemInt(int i, int item);
-int FindFirst(void * item);
+/* add an item into the list */
+void Add(T&& item);
+/* add an item into the list */
+void Add(const T& item);
+/* add a number of items into the list */
+void Add(T* inputItems, int inputItemCount);
+/* append a list to the current list */
+void AddList(TensorListBase* l);
+/* insert an item to the given position of the list */
+void Insert(int pos, const T& item);
+/* insert an item to the given position of the list */
+void Insert(int pos, T&& item);
+/* get the item at position i */
+T& GetItem(int i) const;
+/* set the item at position i */
+void SetItem(int i, const T& item);
+/* set the item at position i */
+void SetItem(int i, T&& item);
+/* find the position of the first matched item */
+int FindFirst(const T& item);
+/* clear the data array */
 void Clear();
-void ClearStringList();
-void Sort(int itemSize, ListCompare comp);
+/* sort the list */
+void Sort(int itemSize);
+/* reverse the list */
 void Reverse();
+/* remove the item at position i */
 void Remove(int i);
-XList * Copy(XMem * myMem);
+/* copy the list */
+TensorListBase* Copy(XMem* myMem);
+/* shuffle the list */
 void Shuffle(int nround = 10, int beg = -1, int len = 0);
 /* short */
-_XINLINE_ void * Get(int i) {return GetItem(i);};
-_XINLINE_ int GetInt(int i) {return GetItemInt(i);};
-_XINLINE_ void Set(int i, void * item) {SetItem(i, item);};
-_XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);};
+T& operator[] (int i) {
+return GetItem(i);
+};
+T& Get(int i) { return GetItem(i); };
+void Set(int i, T item) { SetItem(i, item); };
 };
-extern XList NULLList;
+struct XTensor;
+typedef TensorListBase<int> IntList;
+typedef TensorListBase<char> CharList;
+typedef TensorListBase<char*> StrList;
+typedef TensorListBase<long> LongList;
+typedef TensorListBase<float> FloatList;
+typedef TensorListBase<short> ShortList;
+typedef TensorListBase<void*> XList;
+typedef TensorListBase<XTensor*> TensorList;
-}
-/* end of the nts (NiuTrans.Tensor) namespace */
+} /* end of the nts (NiuTrans.Tensor) namespace */
-#endif
+#endif // __TensorList_H__
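A minimal usage sketch of the new template list (illustrative, not part of this commit): element access is now typed, in contrast to the old void**-based XList.

IntList lens(8);
lens.Add(3);
lens.Add(5);
int first = lens[0];            /* 3, via the new operator[] */
TensorList tensors(2);
tensors.Add(NewTensor2D(2, 2)); /* assumes XTensor.h is included */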
@@ -34,6 +34,11 @@ namespace nts{
 int testxmemid = 0;
 void * recordp = NULL;
+/*
+for managing the memories
+*/
+XMemManager GMems;
 XMem * GMem;
 /* constructor */
@@ -48,6 +53,7 @@ XMem::XMem()
 strcpy(name, "xmem");
 signature = 0;
 mergeFreeOTF = true;
+isInitialized = false;
 }
 /*
@@ -58,7 +64,7 @@ constructor
 >> myMode - mode of running the memory pool
 UNI_FREE: free all the space at the end of using the memory pool
 FREE_ON_THE_FLY: normal "malloc" and "free" mode
->> myBlockSize - size of memory block
+>> myBlockSize - size of a memory block
 >> myBlockNum - number of memory blocks
 >> myBufSize - size of buffer
 */
@@ -103,7 +109,7 @@ initialize it
 >> myMode - mode of running the memory pool
 UNI_FREE: free all the space at the end of using the memory pool
 FREE_ON_THE_FLY: normal "malloc" and "free" mode
->> myBlockSize - size of memory block
+>> myBlockSize - size of a memory block
 >> myBlockNum - number of memory blocks
 >> myBufSize - size of buffer
 */
@@ -164,6 +170,7 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
 #endif
 signature++;
+isInitialized = true;
 }
 /* free memory */
@@ -217,8 +224,8 @@ void XMem::Free(int myDevID, void * mem)
 }
 /*
-get signature
-<< return - return the signature
+get the signature
+<< return - the signature
 */
 MTYPE XMem::GetSignature()
 {
@@ -226,7 +233,7 @@ MTYPE XMem::GetSignature()
 }
 /*
-use string as the name of the memory pool
+set the name of the memory pool
 >> myName - name of the memory pool
 */
 void XMem::SetName(const char * myName)
@@ -259,7 +266,7 @@ void XMem::SetDevice(int myDevID)
 }
 /*
-switch to the device (with fast cuda execution mode) we want to work
+switch to the device (with fast cuda execution mode) we intend to work on
 >> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
 */
 void XMem::SetDeviceFast(int myDevID)
@@ -275,7 +282,7 @@ void XMem::SetDeviceFast(int myDevID)
 }
 /*
-run in static mode
+run in the static mode
 >> myIsStatic - specify if the memory allocation is static
 */
 void XMem::SetStaticMode(bool myIsStatic)
@@ -1488,4 +1495,179 @@ cublasHandle_t * XMem::GetCublasHandle()
 #endif
+/* constructor */
+XMemManager::XMemManager()
+{
+Initialize();
+}
+/* de-constructor */
+XMemManager::~XMemManager()
+{
+}
+/* get the size of available CPU memory */
+MTYPE XMemManager::GetAvailableMemory()
+{
+unsigned long freeMem = 0;
+#if __APPLE__
+int mib[2] = {CTL_HW, HW_MEMSIZE};
+unsigned int namelen = sizeof(mib) / sizeof(mib[0]);
+unsigned long long size;
+size_t len = sizeof(size);
+if (sysctl(mib, namelen, &size, &len, NULL, 0) < 0){
+ShowNTErrors("Cannot get memory size on Mac!");
+}
+else{
+return size;
+}
+#elif _WIN32
+MEMORYSTATUSEX memoryStatus;
+memoryStatus.dwLength = sizeof(memoryStatus);
+if (GlobalMemoryStatusEx(&memoryStatus)){
+freeMem = memoryStatus.ullAvailPhys;
+}
+#else
+long pages = sysconf(_SC_AVPHYS_PAGES);
+long page_size = sysconf(_SC_PAGE_SIZE);
+freeMem = pages * page_size;
+#endif
+return (MTYPE)freeMem;
+}
+/* get the size of available GPU memory */
+MTYPE XMemManager::GetAvailableGPUMemory(int devID)
+{
+size_t freeMem = 0;
+#ifdef USE_CUDA
+size_t totalMem = 0;
+cudaSetDevice(devID);
+if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess){
+XPRINT(0, stderr, "cannot get GPU memory information.");
+exit(1);
+}
+#endif
+return (MTYPE)freeMem;
+}
+/* choose a buffer size for a given amount of free memory */
+void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
+{
+*myBufSize = 0;
+if (freeMem >= MILLION * 128){
+*myBufSize = MILLION * 32;
+if (freeMem >= MILLION * 256){
+*myBufSize = MILLION * 64;
+if (freeMem >= MILLION * 512){
+*myBufSize = MILLION * 128;
+if (freeMem >= MILLION * 1024) {
+*myBufSize = MILLION * 256;
+if (freeMem >= MILLION * 2048)
+*myBufSize = MILLION * 512;
+}
+}
+}
+}
+}
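A worked example of the tiers above (illustrative, not part of this commit): the chosen buffer is roughly a quarter of the free memory, capped at 512 MB.

MTYPE bufSize = 0;
GMems.GetBufferSize((MTYPE)MILLION * 300, &bufSize);
/* 300 MB of free memory passes the 128 MB and 256 MB tests but not
   the 512 MB one, so bufSize ends up as MILLION * 64 */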
+/* initialize it and set the global memory information */
+void XMemManager::Initialize()
+{
+srand((unsigned int)time(NULL));
+Free();
+/* CPUs (we do not care about how many CPUs are actually in use) */
+nCPUMem = 1;
+/* GPUs */
+nGPUMem = 0;
+#ifdef USE_CUDA
+if (cudaGetDeviceCount(&nGPUMem) != cudaSuccess) {
+XPRINT(0, stderr, "cannot get GPU information.");
+exit(1);
+}
+#endif
+}
+/* free it */
+void XMemManager::Free()
+{
+for (int i = 0; i < MAX_CPU_NUM; i++)
+CPUMems[i].Free();
+for (int i = 0; i < MAX_GPU_NUM; i++)
+GPUMems[i].Free();
+}
+/* get a global memory pool (initialized on first use) */
+XMem * XMemManager::GetMem(const int devID)
+{
+XMem * mem = NULL;
+if (devID < 0){
+if(!CPUMems[0].isInitialized){
+MTYPE freeMem = GetAvailableMemory();
+MTYPE myBufSize = 0;
+GetBufferSize(freeMem, &myBufSize);
+CPUMems[0].Initialize(-1, FREE_ON_THE_FLY,
+MIN_BLOCK_SIZE_FOR_MEMPOOL,
+MIN_BLOCK_NUM_FOR_MEMPOOL,
+myBufSize);
+}
+mem = CPUMems;
+}
+else{
+if (devID < nGPUMem){
+if(!GPUMems[devID].isInitialized){
+MTYPE freeMem = GetAvailableGPUMemory(devID);
+MTYPE myBufSize = 0;
+GetBufferSize(freeMem, &myBufSize);
+GPUMems[devID].Initialize(devID, FREE_ON_THE_FLY,
+MIN_BLOCK_SIZE_FOR_MEMPOOL,
+MIN_BLOCK_NUM_FOR_MEMPOOL,
+myBufSize);
+}
+mem = GPUMems + devID;
+}
+else{
+XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
+}
+}
+return mem;
+}
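A minimal usage sketch (illustrative, not part of this commit): callers fetch lazily initialized pools from the global manager by device id.

XMem * cpuMem = GMems.GetMem(-1); /* the CPU pool */
XMem * gpuMem = GMems.GetMem(0);  /* the pool of GPU 0, if present */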
+/* get the memory-pool settings of a given device */
+int XMemManager::GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize)
+{
+XMem * mem = GetMem(devID);
+int result = 0;
+if (mem != NULL){
+*myBlockSize = mem->maxBlockSize;
+*myBlockNum = mem->blockNum;
+*myBufSize = mem->bufSize;
+result = 1;
+}
+return result;
+}
+/* show memory information */
+void XMemManager::ShowMemInfo()
+{
+XPRINT(1, stderr, "Memory Information:\n");
+MTYPE myBlockSize, myBufSize;
+int myBlockNum;
+for(int i = 0; i < nCPUMem; i++){
+GetMemSize(-1, &myBlockSize, &myBlockNum, &myBufSize);
+XPRINT3(1, stderr, " - id:-1 CPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", myBlockSize, myBlockNum, myBufSize);
+}
+for(int i = 0; i < nGPUMem; i++){
+GetMemSize(i, &myBlockSize, &myBlockNum, &myBufSize);
+XPRINT4(1, stderr, " - id:%2d GPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", i, myBlockSize, myBlockNum, myBufSize);
+}
+}
 } /* end of the nts (NiuTrans.Tensor) namespace */
@@ -39,6 +39,15 @@
 #include <curand.h>
 #endif
+#ifdef __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif WIN32
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts{
@@ -53,6 +62,8 @@ typedef long long INT_64;
 #define BUF_PITCH 256
 #define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
 #define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
+#define MAX_CPU_NUM 16
+#define MAX_GPU_NUM 16
 /*
 mode of running a memory pool
@@ -202,6 +213,9 @@ public:
 MTYPE curUsedPin;
 MTYPE bufUsedPin;
+/* indicates whether the memory pool is initialized */
+bool isInitialized;
 #ifdef USE_CUDA
 /* handle used for cublas */
 cublasHandle_t cublasHandle;
@@ -413,6 +427,61 @@ public:
 };
+/*
+a class for the management of memory
+*/
+class XMemManager
+{
+private:
+/* cpu memory pool information */
+XMem CPUMems[MAX_CPU_NUM];
+/* number of cpu memory pools */
+int nCPUMem;
+/* gpu memory pool information */
+XMem GPUMems[MAX_GPU_NUM];
+/* number of gpu memory pools */
+int nGPUMem;
+public:
+/* constructor */
+XMemManager();
+/* de-constructor */
+~XMemManager();
+/* get the size of available CPU memory */
+MTYPE GetAvailableMemory();
+/* get the size of available GPU memory */
+MTYPE GetAvailableGPUMemory(int devID);
+/* choose a buffer size */
+void GetBufferSize(MTYPE freeMem, MTYPE * myBufSize);
+/* initialize it and set the global memory information */
+void Initialize();
+/* free it */
+void Free();
+/* get a global memory pool */
+XMem * GetMem(const int devID);
+/* get the memory-pool settings of a given device */
+int GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize);
+/* show memory information */
+void ShowMemInfo();
+};
+/* managing the memories */
+extern XMemManager GMems;
 extern XMem * GMem;
 extern int testxmemid;
...
@@ -108,18 +108,10 @@ const char * GetOPName(int type)
 else if (type == REDUCE_REDUCEVARIANCE)
 return "R_REDUCEVARIANCE";
 }
-else if ((type & DATA_BASE) != 0) {
-if (type == GETANDSET_CONVERTDATATYPE)
-return "G_CONVERTDATATYPE";
-else if (type == GETANDSET_INDEXTOONEHOT)
-return "G_INDEXTOONEHOT";
-else if (type == GETANDSET_ONEHOTTOINDEX)
-return "G_ONEHOTTOINDEX";
-else if (type == GETANDSET_SELECT)
+else if ((type & DATA_BASE) != 0){
+if (type == GETANDSET_SELECT)
 return "G_SELECT";
-}
-else if ((type & SHAPE_BASE) != 0) {
-if (type == MOVEMENT_COPYINDEXED)
+else if (type == MOVEMENT_COPYINDEXED)
 return "M_COPYINDEXED";
 else if (type == MOVEMENT_COPYVALUES)
 return "M_COPYVALUES";
...
@@ -79,13 +79,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* data and shape related operations */
 #define DATA_BASE MATH_BASE * 2
 #define GETANDSET DATA_BASE + 1
-#define GETANDSET_CONVERTDATATYPE GETANDSET + 1
-#define GETANDSET_INDEXTOONEHOT GETANDSET_CONVERTDATATYPE + 1
-#define GETANDSET_ONEHOTTOINDEX GETANDSET_INDEXTOONEHOT + 1
-#define GETANDSET_SELECT GETANDSET_ONEHOTTOINDEX + 1
-#define SHAPE_BASE DATA_BASE * 2
-#define MOVEMENT SHAPE_BASE + 1
+#define GETANDSET_SELECT GETANDSET + 1
+#define MOVEMENT GETANDSET_SELECT + 1
 #define MOVEMENT_COPYINDEXED MOVEMENT + 1
 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
@@ -108,7 +104,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define SORT_TOPK SORT_SORT + 1
 /* activation functions */
-#define FUNCTION_BASE SHAPE_BASE * 2
+#define FUNCTION_BASE DATA_BASE * 2
 #define FUNC_DROPOUT FUNCTION_BASE + 1
 #define FUNC_HARDTANH FUNC_DROPOUT + 1
 #define FUNC_IDENTITY FUNC_HARDTANH + 1
...
@@ -146,7 +146,7 @@ run a set of jobs in parallel
 >> jobArgs - the list of arguments for each job
 >> sleepTime - time to sleep (in ms) for each round
 */
-void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
+void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
 {
 if(threadNum <= 0){
 XPRINT(1, stderr, "Error! No threads were created!\n");
@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
 TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
 /* the arguments that are passed to the function */
-volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
+volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
 /* thread */
 XThread * thread = threads + availableThreads[i];
...
@@ -106,7 +106,7 @@ public:
 void KillThreads();
 /* run a set of jobs in parallel */
-void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
+void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
 /* get the number of parallel jobs to run */
 int GetJobNum(int size);
...
@@ -42,7 +42,7 @@ job item used in queues
 JobQueueNode::JobQueueNode()
 {
 job = NULL;
-args = new XList(1);
+args = new TensorList(1);
 }
 /* de-constructor */
@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize)
 head = 0;
 tail = 0;
 isJobQueue = false;
-jobDequeuerArgs = new XList(1);
+jobDequeuerArgs = new TensorList(1);
 jobDequeuerBreak = false;
 runningJobCount = 0;
 jobStream = NULL;
@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
 isJobQueue = true;
 jobDequeuerArgs->Clear();
-jobDequeuerArgs->Add(this);
-jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
+// warning: this may cause unknown error
+jobDequeuerArgs->Add((XTensor*)this);
+jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
 jobDequeuer.function = (TFunction)DequeueJobs;
 jobDequeuer.argv = jobDequeuerArgs;
@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer()
 }
 /* add a job item to process */
-void XQueue::EnqueueJob(void * job, XList * jobArgs)
+void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
 {
 MUTEX_LOCK(jobQueueMutex);
 runningJobCount++;
@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs)
 }
 /* job item consumer */
-void XQueue::DequeueJobs(XList * args)
+void XQueue::DequeueJobs(TensorList * args)
 {
 CheckNTErrors((args->count == 2), "Illegal arguments!");
...
@@ -52,7 +52,7 @@ public:
 void * job;
 /* arguments of the job */
-XList * args;
+TensorList * args;
 public:
 /* constructor */
@@ -102,7 +102,7 @@ private:
 XThread jobDequeuer;
 /* argument list of jobDequeuer */
-XList * jobDequeuerArgs;
+TensorList * jobDequeuerArgs;
 /* indicates whether jobDequeuer stops */
 bool jobDequeuerBreak;
@@ -141,11 +141,11 @@ public:
 void StopJobConsumer();
 /* add a job item to process */
-void EnqueueJob(void * job, XList * jobArgs);
+void EnqueueJob(void * job, TensorList * jobArgs);
 /* job item consumer */
 static
-void DequeueJobs(XList * args);
+void DequeueJobs(TensorList * args);
 /* get the break flag */
 bool GetJobBreak();
...
@@ -52,6 +52,7 @@ struct XLink;
 #define MIN_TENSOR_MERGE_NUM 0
 #define MIN_TENSOR_MERGE_LIST_NUM 1024
 #define MIN_TENSOR_CAT_NUM 8
+#define MAX_TENSOR_NAME_SIZE 32
 /* computation flags */
 #define UNSAFE_BUT_FAST_MEM
@@ -61,6 +62,9 @@ struct XLink;
 struct XTensor
 {
 public:
+/* name */
+char name[MAX_TENSOR_NAME_SIZE];
 /* id */
 int id;
@@ -190,13 +194,14 @@ public:
 XTensor(const XTensor &reference);
 /* copy constructor (with right value reference) */
-#ifdef USE_CPP11
 XTensor(const XTensor &&reference);
-#endif
 /* de-constructor */
 ~XTensor();
+/* set the name of the tensor */
+void SetName(const char * myName);
 /* initialize member variables */
 void Init();
@@ -210,9 +215,7 @@ public:
 XTensor& operator= (const XTensor &tensor);
 /* overloading of the equal-sign (with right value reference) */
-#ifdef USE_CPP11
 XTensor& operator= (const XTensor &&tensor);
-#endif
 /* overloading of the plus-sign */
 XTensor operator+ (const XTensor &tensor) const;
@@ -241,6 +244,9 @@ public:
 /* linear transformation */
 XTensor Lin(DTYPE scale, DTYPE shift = 0) const;
+/* relocate the data on the target device */
+void SetDevice(int myDevId, XMem * myMem = NULL);
 /* judge whether the two matrices are in the same type and size */
 static
 bool IsSameShaped(const XTensor * a, const XTensor * b);
@@ -268,6 +274,9 @@ public:
 /* reshape the tensor to a matrix */
 void Reshape(const int rowNum, const int colNum);
+/* reshape the tensor by merging two consecutive dimensions */
+void ReshapeMerged(const int i, const int j = -1);
 /* get the number of items in the data array */
 int GetSize() const;
@@ -316,6 +325,9 @@ public:
 /* get the value of a cell with the index */
 DTYPE Get(int index[], int size = -1);
+/* get the value of a cell with the offset */
+DTYPE Get(int offset);
 /* get the pointer to a cell */
 void * GetCell(int index[], int size = -1) const;
@@ -328,6 +340,9 @@ public:
 /* get the default type value of a cell in a 3d tensor */
 DTYPE Get3D(int d0, int d1, int d2);
+/* get the int value of a cell by its offset */
+int GetInt(int offset);
 /* get the int value of a cell in a 1d tensor */
 int Get1DInt(int i);
@@ -346,6 +361,9 @@ public:
 /* set the value of a cell */
 bool Set(DTYPE value, int index[], int size = -1);
+/* set the value of a cell with its offset in the array */
+bool Set(DTYPE value, int offset);
 /* set the value of a cell in a 1d tensor */
 bool Set1D(DTYPE value, int i);
@@ -355,6 +373,9 @@ public:
 /* set the value of a cell in a 3d tensor */
 bool Set3D(DTYPE value, int d0, int d1, int d2);
+/* set the integer value of a cell by its offset */
+bool SetInt(int value, int offset);
 /* set the integer value of a cell */
 bool SetInt(int value, int index[], int size = -1);
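A minimal sketch of the new offset-based accessors (illustrative, not part of this commit): they index the underlying data array directly and ignore the tensor shape.

XTensor * t = NewTensor2D(2, 3);
t->Set(1.0F, 5);     /* the last cell: offset = 2 * 3 - 1 */
DTYPE v = t->Get(5); /* 1.0F */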
...@@ -430,65 +451,133 @@ void InitTensor(XTensor * tensor, ...@@ -430,65 +451,133 @@ void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL); const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* initialize a dense vector */ /* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num, void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense matrix */ /* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum, void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
/* initialize a tensor with a reference tensor */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
/* initialize a tensor on the CPU with a reference tensor */
void InitTensorOnCPU(XTensor * tensor, const XTensor * reference);
/* generate a XTensor with no initialization */
XTensor * NewTensor();
/* generate a XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
...
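To see how the constructors above fit together, here is a minimal usage sketch; the V2 variants simply drop the XMem argument. This is illustrative only and not part of the diff; it assumes the declarations above are visible through the library's usual headers.

/* sketch: creating and initializing tensors with the V2 API */
void ExampleTensorCreation()
{
    /* a 2 x 3 float matrix on the CPU (devID = -1), heap-allocated */
    XTensor * m = NewTensor2DV2(2, 3, X_FLOAT, -1);

    /* in-place initialization of a stack object */
    XTensor t;
    InitTensor3DV2(&t, 4, 2, 3, X_FLOAT, -1);

    /* clone shape and type from a reference tensor */
    XTensor u;
    InitTensorV2(&u, &t);

    delete m;
}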
...@@ -85,7 +85,7 @@ namespace nts{
#endif
typedef void (*TFunction) (volatile XList*);
typedef void (*TFunction) (volatile TensorList*);
/*
This is a class that wraps the standard implementation of threading
...@@ -133,7 +133,7 @@ public:
/* arguments (for the function to run) */
volatile
XList * argv;
TensorList * argv;
/* a flag to break */
volatile
...
...@@ -97,4 +97,5 @@
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#include "../function/DropoutWithIndex.h"
#endif // __CHEADER_H__
...@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
...@@ -41,12 +42,15 @@ where i is the index of the item
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in division!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaDiv(a, b, c, alpha, leadingDim);
...
...@@ -19,10 +19,12 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include <math.h>
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -42,6 +44,8 @@ i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
...@@ -50,6 +54,8 @@ void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alp
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
...@@ -152,6 +158,8 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
...
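The new `n = MODX(n, ...)` lines normalize the dimension index before any check reads `a->dimSize[n]`, which is what lets callers pass negative indices such as -1 for the last dimension. The MODX definition itself is not part of this diff; a plausible stand-in, under that assumption:

/* hypothetical stand-in for the MODX macro used above: wraps n from
   [-order, order) into [0, order), so -1 selects the last dimension */
inline int ModXSketch(int n, int order)
{
    return (n % order + order) % order;   /* e.g. ModXSketch(-1, 3) == 2 */
}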
...@@ -17,7 +17,6 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
*/
#include "DivDim.cuh"
...@@ -169,34 +168,6 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
half alpha1 = __float2half(alpha);
if (stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
else
KernelDivWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
}
else if (stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
else
KernelDivWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
...
...@@ -16,10 +16,10 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_H__
#define __MASK_H__
...@@ -55,4 +55,3 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
...@@ -54,6 +54,8 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors(a && b && c, "Empty input tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Input tensors should have the same data type!");
CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
"Input tensors must have an order >= 2!");
CheckNTErrors(c->order == a->order + b->order - 2, "wrong tensor order")
...@@ -106,9 +108,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cBlockNum *= b->dimSizeRDI[i];
}
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
TensorList * aList = new TensorList(10);
TensorList * bList = new TensorList(10);
TensorList * cList = new TensorList(10);
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
...
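The new data-type check aside, the order check `c->order == a->order + b->order - 2` is the shape contract here: the trailing two dimensions of each input are the matrix, and the leading dimensions multiply out into blocks. A quick shape sketch, illustrative and not taken from the diff:

/* shape bookkeeping implied by the checks above:
   a: (2, 4, 3)   order 3, trailing matrix 4 x 3
   b: (3, 5)      order 2, trailing matrix 3 x 5
   c must then have order 3 + 2 - 2 = 3, e.g. (2, 4, 5) */
int ExpectedOrder(const XTensor * a, const XTensor * b)
{
    return a->order + b->order - 2;   /* mirrors the CheckNTErrors condition */
}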
...@@ -38,17 +38,23 @@ argument5: matrix a
argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta)
*/
void _MatrixMul2DMultiTheading(XList * args)
void _MatrixMul2DMultiTheading(TensorList * args)
{
int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1);
int x2 = *(int*)args->GetItem(2);
int y2 = *(int*)args->GetItem(3);
XTensor * a = (XTensor*)args->GetItem(4);
XTensor * b = (XTensor*)args->GetItem(5);
XTensor * c = (XTensor*)args->GetItem(6);
DTYPE alpha = *(DTYPE*)args->GetItem(7);
DTYPE beta = *(DTYPE*)args->GetItem(8);
CheckNTErrors(args->count == 2, "invalid argument number!");
IntList * indexArgs = (IntList*)args->GetItem(0);
TensorList * matrixArgs = (TensorList*)args->GetItem(1);
CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
XTensor * a = matrixArgs->GetItem(0);
XTensor * b = matrixArgs->GetItem(1);
XTensor * c = matrixArgs->GetItem(2);
DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
int x1 = indexArgs->GetItem(0);
int y1 = indexArgs->GetItem(1);
int x2 = indexArgs->GetItem(2);
int y2 = indexArgs->GetItem(3);
#ifdef FAST_MATRIX
int am = a->dimSize[1];
...
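The rewritten unpacking implies a packing convention on the caller's side: a two-element outer list holding an IntList with the four block coordinates and a TensorList with the three matrices plus pointers to the two scalars. A hypothetical packing sketch; the Add signatures and pointer casts are assumptions, inferred from the GetItem casts above rather than confirmed API:

/* hypothetical caller-side packing for _MatrixMul2DMultiTheading */
void PackMatMulArgs(TensorList * args,
                    int x1, int y1, int x2, int y2,
                    XTensor * a, XTensor * b, XTensor * c,
                    DTYPE * alpha, DTYPE * beta)
{
    IntList * indexArgs = new IntList(4);
    indexArgs->Add(x1);
    indexArgs->Add(y1);
    indexArgs->Add(x2);
    indexArgs->Add(y2);

    TensorList * matrixArgs = new TensorList(5);
    matrixArgs->Add(a);
    matrixArgs->Add(b);
    matrixArgs->Add(c);
    matrixArgs->Add((XTensor*)alpha);   /* scalar smuggled as a pointer, read back via *(DTYPE*) */
    matrixArgs->Add((XTensor*)beta);

    args->Add((XTensor*)indexArgs);     /* becomes GetItem(0) in the worker */
    args->Add((XTensor*)matrixArgs);    /* becomes GetItem(1) in the worker */
}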
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/
void _MatrixMul2DMultiTheading(XList * args);
void _MatrixMul2DMultiTheading(TensorList * args);
} // namespace nts(NiuTrans.Tensor)
...
...@@ -227,9 +227,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
...
...@@ -58,8 +58,8 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, cons
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for GPU
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA, const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha
...
...@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
...@@ -41,12 +42,15 @@ where i is the index of the item
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMultiply(a, b, c, alpha, leadingDim);
...
...@@ -19,6 +19,7 @@
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include <math.h>
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
...@@ -42,7 +43,9 @@ i.e., a is multiplied with b by broadcasting
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha) {
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
...@@ -52,6 +55,8 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
...@@ -151,6 +156,8 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, 0);
...
...@@ -44,6 +44,8 @@ void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in subtraction!");
CheckDev(a->devID, b->devID);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
...
...@@ -19,10 +19,12 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include <math.h>
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -42,6 +44,8 @@ i.e., a is subtracted with b by broadcasting
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
...@@ -50,6 +54,8 @@ void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if (beta == 0) {
_CopyValues(a, c);
return;
...@@ -152,6 +158,8 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
...
...@@ -45,6 +45,8 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
CheckDev(a->devID, b->devID);
if(beta == 0){
_CopyValues(a, c);
return;
...
...@@ -21,6 +21,7 @@
* Add summation by broadcasting.
*/
#include <math.h>
#include "Sum.h"
#include "SumDim.h"
#include "SumDim.cuh"
...@@ -46,6 +47,8 @@ i.e., a is summed with b by broadcasting
*/
void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
...@@ -54,6 +57,8 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(beta == 0){
_CopyValues(a, c);
return;
...@@ -170,6 +175,8 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
...
...@@ -56,8 +56,8 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB, TensorList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif
...
...@@ -20,21 +20,20 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "ConvertDataType.h"
#include "ConvertDataType.cuh"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
convert data type
>> input - input tensor
>> output - output tensor
*/
void _ConvertDataType(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
...@@ -62,35 +61,4 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
ShowNTErrors("Unsupported data types for conversion!");
}
/*
convert data type (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - input tensor
<< return - output tensor with the specified data type
*/
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
{
if (input.dataType == dataType) {
XTensor output;
output = CopyValues(input);
return output;
}
int order = input.order;
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, input.dimSize, dataType, dr, input.devID, input.mem);
output.SetTMPFlag();
_ConvertDataType(&input, &output);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
return output;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -67,49 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size)
if (i < size){
outputData[i] = (float)(inputData[i]);
}
}
__global__
void KernelFloatToInt8(float * inputData, __int8 * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (__int8)(inputData[i]);
}
}
__global__
void KernelInt8ToFloat(__int8 * inputData, float * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (float)(inputData[i]);
}
}
__global__
void KernelIntToInt8(int * inputData, __int8 * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (__int8)(inputData[i]);
}
}
__global__
void KernelInt8ToInt(__int8 * inputData, int * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (int)(inputData[i]);
}
}
/*
data conversion (cuda code)
...@@ -180,14 +138,6 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_INT8)
KernelFloatToInt8 << <blocks, threads >> >((float*)input->data, (__int8*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_FLOAT)
KernelInt8ToFloat << <blocks, threads >> >((__int8*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_INT && output->dataType == X_INT8)
KernelIntToInt8 << <blocks, threads >> >((int*)input->data, (__int8*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_INT)
KernelInt8ToInt << <blocks, threads >> >((__int8*)input->data, (int*)output->data, input->unitNum);
else{
ShowNTErrors("Unsupported data types for conversion!");
}
...
...@@ -23,16 +23,12 @@
#define __CONVERTDATATYPE_H__
#include "../../XTensor.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */
void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_H__
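With the high-level ConvertDataType wrapper removed from this header, conversion goes through the remaining low-level call, and the caller pre-allocates both sides. A sketch, assuming a CUDA device 0 and the initialization and SetData helpers that appear elsewhere in this commit:

/* illustrative float -> float16 conversion via the low-level entry point */
XTensor a;
XTensor aHalf;
InitTensor2DV2(&a, 2, 3, X_FLOAT, 0);        /* devID 0 */
InitTensor2DV2(&aHalf, 2, 3, X_FLOAT16, 0);  /* same shape, target type */
_SetDataRand(&a, -1.0F, 1.0F);
_ConvertDataType(&a, &aHalf);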
...@@ -21,7 +21,6 @@
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
#include "SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -32,65 +31,43 @@ convert onehot tensor to index tensor
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim)
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
dim = (dim < 0 ? onehot->GetDim(-1) : dim);
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(dim < onehot->order, "Illegal specified dimension!")
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++) {
for (int i = 0; i < index->order; i++)
if (i < dim) {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
}
else {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i + 1), "Illegal tensor order!");
}
}
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, dim);
_CudaOnehotToIndex(onehot, index, size);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int blockNum = index->unitNum;
int stride = size;
int dimSize = 1;
int stride = 1;
for (int i = 0; i < dim; i++)
blockNum *= onehot->GetDim(i);
blockSize = onehot->unitNum / blockNum;
dimSize = onehot->GetDim(dim);
for (int i = dim + 1; i < onehot->order; i++)
stride *= onehot->GetDim(i);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) {
for (int j = 0; j < stride; j++) {
int * od = onehotData + i * stride;
int * od = onehotData + i * blockSize + j;
int * index = indexData + i * stride + j;
int record = -1;
for (int j = 0; j < dimSize; j++) {
if (od[j*stride] != 0) {
for (int j = 0; j < stride; j++) {
if (od[j] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
}
*index = record;
indexData[i] = record;
}
}
}
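A worked example of the simplified CPU loop in the new version: each of the `blockNum = index->unitNum` rows scans its `stride = size` onehot entries and records the unique non-zero position.

/* e.g. onehot (2 x 3, X_INT) = { {0, 1, 0},
                                  {0, 0, 1} }
   blockNum = 2, stride = 3
   row 0: od[1] != 0  ->  indexData[0] = 1
   row 1: od[2] != 0  ->  indexData[1] = 2
   a second non-zero entry in a row triggers
   "The value of onehot tensor is illegal!" */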
/*
...@@ -122,7 +99,7 @@ convert index tensor to onehot tensor
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
...@@ -134,12 +111,9 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float lab
onehot->SetZeroAll();
#ifdef USE_CUDA
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
//_SetDataFixedFloat(onehot, lowconfidence);
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
return;
...@@ -155,7 +129,7 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float lab
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
DTYPE * od = onehotData + i * stride;
od[id] = confidence;
od[id] = 1;
}
}
...
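The label-smoothing arithmetic above in concrete numbers, plus a caveat visible in this revision: the CPU branch now writes `od[id] = 1`, so the smoothed values are only produced on the CUDA path.

/* for labelSmoothingP = 0.1 and size = 4 (CUDA path):
   confidence    = 1 - 0.1 = 0.9     -> target class
   lowconfidence = 0.1 / 4 = 0.025   -> every other class
   CPU path in this revision: target entry = 1, others stay 0 */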
...@@ -110,8 +110,9 @@ void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int
DTYPE * od = onehotData + i * stride;
int id = indexData[i];
//od[id] = confidence;
//od[id] = 2.0;
//onehotData[i * stride + id] = 0.1;
if (offset == id)
od[offset] = confidence;
else{
...@@ -126,7 +127,7 @@ convert index tensor to onehot tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
{
int devID = onehot->devID;
...
...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
} // namespace nts(NiuTrans.Tensor)
...
...@@ -27,18 +27,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim);
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int size);
XTensor OnehotToIndex(XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP = 0.0F);
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP = 0.0F);
XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP);
} // namespace nts(NiuTrans.Tensor)
...
...@@ -19,7 +19,6 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
* I'm surprised that I did not write this file till today.
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-22 float16 added
*/
#ifndef __SETDATA_CUH__
...@@ -29,9 +28,22 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p (in int, float, float16, double) */
template<class T>
void _CudaSetDataFixed(XTensor * tensor, T p);
/* generate data items with a fixed value p (in int) */
void _CudaSetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void _CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _CudaSetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p (in float) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p);
/* generate data items with a fixed value p (in int) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
...@@ -43,11 +55,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
...
...@@ -24,19 +24,35 @@
#define __SETDATA_H__
#include "../../XTensor.h"
#include "SetData.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a xavier initialization */
void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
///* generate data items with a fixed value p */
//void _SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value p */
void _SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value p (in default type) */
template<class T>
void _SetDataFixed(XTensor * tensor, T value);
void SetDataFixed(XTensor &tensor, DTYPE p);
/* generate data items with a fixed value p (in integer) */
void SetDataFixedInt(XTensor &tensor, int p);
/* generate data items with a fixed value p (in int) */
void _SetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _SetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
...@@ -48,11 +64,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
...
...@@ -170,14 +170,10 @@ SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#else
/* define three marco separately, specify the respective function names (CPU mode) */
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, _cudaFuncName, origFunc) \
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
...@@ -187,14 +183,10 @@ void _funcName(const XTensor * a, XTensor * b, int num) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
#define _SIMPLE_BINARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
...@@ -228,34 +220,36 @@ void funcName(const XTensor &a, XTensor &b, float num) \
_funcName(&a, &b, num); \
} \
_SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION_INT(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
SIMPLE_BINARY_FUNCTION_ME(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
SIMPLE_BINARY_FUNCTION_ME(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(_Shift, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
SIMPLE_BINARY_FUNCTION_ME(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(Mod, _Mod)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#endif
} // namespace nts(NiuTrans.Tensor)
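To make the macro mechanics concrete, this is approximately what `_SIMPLE_BINARY_FUNCTION_INT(_Scale, scale)` now expands to in the CPU-only branch. The data-pointer setup and loop fall inside the collapsed hunk above, so those three lines are an assumption, and `scale(x, num)` is the per-element helper named by origFunc.

/* approximate expansion (illustrative, not from the diff) */
void _Scale(const XTensor * a, XTensor * b, int num)
{
    CheckNTErrors(a->devID < 0, "No GPU code is supported");
    CheckNTErrors((XTensor::IsSameShaped(a, b)),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->dataType == X_INT && b->dataType == X_INT), "TODO!");
    int * d = (int*)a->data;             /* assumed from the elided lines */
    int * db = (int*)b->data;
    for (int i = 0; i < a->unitNum; i++)
        db[i] = (int)scale(d[i], num);   /* per-element integer op */
}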
...@@ -104,7 +104,7 @@ void _Cuda##funcName(const XTensor * a, XTensor * b, int num) \
((int*)a->data, (int*)b->data, a->unitNum, num); \
} \
else { \
ShowNTErrors("TODOhaha!"); \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
...
...@@ -17,7 +17,6 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int/int8 added
*/
#include "../../XDevice.h"
...@@ -36,9 +35,8 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size)
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -53,6 +51,21 @@ void KernelClip(T * a, T * b, T lower, T upper, int size)
}
/*
set each entry to its clip value with float16 data type value (CUDA Kernel)
This is for float16 computation
>> a - pointer to input data array
>> b - pointer to output data array
>> lower - the lower border
>> upper - the upper border
>> size - size of the data array
*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
{
return;
}
/*
set each entry to its clip value
>> a - input tensor we are processing
>> b - output tensor we are processing
...@@ -79,22 +92,7 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
half lower1 = __float2half(lower);
half upper1 = __float2half(upper);
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_INT8) {
__int8 lower1 = (__int8)lower;
__int8 upper1 = (__int8)upper;
KernelClip << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, lower1, upper1, a->unitNum);
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
}
else {
ShowNTErrors("TODO!");
...
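The kernel body is collapsed in this view; the rule KernelClip applies per element is the usual clip, sketched host-side for reference:

/* per-element clip rule (host-side sketch of what the kernel computes) */
DTYPE ClipOne(DTYPE x, DTYPE lower, DTYPE upper)
{
    if (x > upper) return upper;
    if (x < lower) return lower;
    return x;
}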
...@@ -29,9 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size);
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type */
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value */
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
...
...@@ -28,17 +28,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compare whether every entry is equal to the specified value (cuda kernel) */
__global__
void KernelEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is equal to the specified value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda kernel) */
__global__
void KernelNotEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is not equal to the specified value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE number);
/* check whether every entry is equal to the given value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE value);
/* check whether every entry is not equal to the given value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE value);
#endif // USE_CUDA
...
@@ -26,23 +26,23 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
-/* compare whether every entry is equal to the specified value */
-void _Equal(const XTensor * a, XTensor * b, DTYPE number);
+/* check whether every entry is equal to the given value */
+void _Equal(const XTensor * a, XTensor * b, DTYPE value);
-/* compare whether every entry is equal to the specified value (do it on site)
-keep the result in the input tensor a and return nothing */
-void _EqualMe(XTensor * a, DTYPE number);
+/* check whether every entry is equal to the given value (do it on site) */
+void _EqualMe(XTensor * a, DTYPE value);
-/* compare whether every entry is equal to the specified value (return an XTensor structure)
-make a new tensor to keep the result and return it */
-XTensor Equal(const XTensor & a, DTYPE number);
+/* check whether every entry is equal to the given value (return an XTensor structure) */
+XTensor Equal(const XTensor & a, DTYPE value);
-/* compare whether every entry is not equal to the specified value */
-void _NotEqual(const XTensor * a, XTensor * b, DTYPE number);
+/* check whether every entry is not equal to the given value */
+void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
-/* compare whether every entry is not equal to the specified value (do it on site)
-keep the result in the input tensor a and return nothing */
-void _NotEqualMe(XTensor * a, DTYPE number);
+/* check whether every entry is not equal to the given value (do it on site) */
+void _NotEqualMe(XTensor * a, DTYPE value);
-/* compare whether every entry is not equal to the specified value (return an XTensor structure)
-make a new tensor to keep the result and return it */
-XTensor NotEqual(const XTensor & a, DTYPE number);
+/* check whether every entry is not equal to the given value (return an XTensor structure) */
+XTensor NotEqual(const XTensor & a, DTYPE value);
 } // namespace nts(NiuTrans.Tensor)
......
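The comparison routines above produce an elementwise mask. Assuming the usual convention, which the declarations themselves do not spell out, each output entry is 1.0 where the comparison holds and 0.0 elsewhere. A sketch kernel under that assumption (hypothetical name, not the library's code):

#include <cuda_runtime.h>

/* sketch only: assumed semantics of _Equal, writes 1.0 where a[i] == value */
__global__
void KernelEqualSketch(float * a, float * b, float value, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        b[i] = (a[i] == value) ? 1.0F : 0.0F;
}

A NotEqual counterpart would simply flip the predicate.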
@@ -138,12 +138,12 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
     _Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
     /* tensor connections */
-    XList list(5);
-    list.Add(&input);
-    list.Add(&mean);
-    list.Add(&var);
-    list.Add(&a);
-    list.Add(&b);
+    TensorList list(5);
+    list.Add((XTensor*)&input);
+    list.Add((XTensor*)&mean);
+    list.Add((XTensor*)&var);
+    list.Add((XTensor*)&a);
+    list.Add((XTensor*)&b);
     XLink::MakeLink(&list, &output, MATH_NORMALIZE);
     XLink::AddParamToHeadInt(&output, dim);
     XLink::AddParamToHead(&output, epsilon);
......
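The Normalize change is mechanical: the connection list becomes a TensorList whose Add takes a non-const XTensor*, so the const references held by the caller must be cast before insertion. The pattern in isolation, with placeholder types standing in for XTensor and TensorList:

#include <vector>

struct Tensor { };                      /* placeholder for XTensor */

int main()
{
    std::vector<Tensor*> list;          /* placeholder for TensorList: stores non-const pointers */
    const Tensor input = {};            /* the caller only holds const references */
    list.push_back((Tensor*)&input);    /* the same cast as list.Add((XTensor*)&input) */
    return (int)list.size() - 1;        /* 0 on success */
}

The cast is safe here because the autograd bookkeeping only records the pointers; it does not mutate the inputs through them.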
@@ -17,7 +17,6 @@
 /*
  * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
- * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int added
  */
 #include "ScaleAndShift.cuh"
@@ -35,9 +34,9 @@ scale and shift all tensor entries b = a * scale + shift (CUDA Kernel)
 >> scale - how much we want to scale it
 >> shift - how much we want to shift it
 */
-template<class T, bool isUnitScale, bool isZeroShift>
+template<bool isUnitScale, bool isZeroShift>
 __global__
-void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
+void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -57,6 +56,28 @@ void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
     }
 }
+/*
+scale and shift all tensor entries b = a * scale + shift (CUDA Kernel)
+This is for float16 computation
+>> a - the input data array
+>> b - the output data array
+>> size - the size of the data array
+>> scale - how much we want to scale it
+>> shift - how much we want to shift it
+*/
+__global__
+void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
+    if (i < size)
+        b[i] = __hadd(__hmul(a[i], scale), shift);
+#else
+    if (i < size)
+        b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
+#endif
+}
 /*
 scale and shift all tensor entries
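The added kernel guards the native half intrinsics: __hadd and __hmul require compute capability 5.3 or newer, the !defined(__CUDA_ARCH__) clause keeps the host-side pass of nvcc's compilation on the same branch, and older devices fall back to converting through float. A self-contained sketch of the same guard, runnable as written (hypothetical names; error checks omitted; the fast path assumes an sm_53+ device):

#include <cuda_fp16.h>
#include <cstdio>

/* mirrors the guard in the kernel above */
__global__
void ScaleShiftHalfSketch(__half * a, __half * b, int size, __half scale, __half shift)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    if (i < size)
        b[i] = __hadd(__hmul(a[i], scale), shift);
#else
    if (i < size)
        b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
#endif
}

int main()
{
    const int n = 8;
    __half *a, *b;
    cudaMallocManaged(&a, n * sizeof(__half));   /* managed memory keeps the sketch short */
    cudaMallocManaged(&b, n * sizeof(__half));
    for (int i = 0; i < n; i++)
        a[i] = __float2half((float)i);
    ScaleShiftHalfSketch<<<1, n>>>(a, b, n, __float2half(2.0F), __float2half(1.0F));
    cudaDeviceSynchronize();
    for (int i = 0; i < n; i++)
        printf("%g ", __half2float(b[i]));       /* expected: 1 3 5 7 9 11 13 15 */
    printf("\n");
    cudaFree(a);
    cudaFree(b);
    return 0;
}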
@@ -87,52 +108,20 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
     if(a->dataType == DEFAULT_DTYPE){
         if(scale == 1.0F && shift == 0)
-            KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else if(scale != 1.0F && shift == 0)
-            KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else
-            KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
     }
     else if(a->dataType == X_FLOAT16){
-        half scale1 = __float2half(scale);
-        half shift1 = __float2half(shift);
-        if (scale == 1.0F && shift == 0)
-            KernelScaleAndShift<__half, true, true><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
-        else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<__half, true, false><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
-        else if (scale != 1.0F && shift == 0)
-            KernelScaleAndShift<__half, false, true><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
-        else
-            KernelScaleAndShift<__half, false, false> << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
-    }
-    else if (a->dataType == X_INT){
-        int scale2 = int(scale);
-        int shift2 = int(shift);
-        if (scale == 1.0F && shift == 0)
-            KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
-        else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
-        else if (scale != 1.0F && shift == 0)
-            KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
-        else
-            KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
-    }
-    else if (a->dataType == X_INT8){
-        __int8 scale2 = __int8(scale);
-        __int8 shift2 = __int8(shift);
-        if (scale == 1.0F && shift == 0)
-            KernelScaleAndShift<__int8, true, true> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
-        else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<__int8, true, false> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
-        else if (scale != 1.0F && shift == 0)
-            KernelScaleAndShift<__int8, false, true> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
-        else
-            KernelScaleAndShift<__int8, false, false> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
+        unsigned short scale2 = FloatToFloat16(scale);
+        unsigned short shift2 = FloatToFloat16(shift);
+        __half * scaleft16p = (__half*)&scale2;
+        __half * shiftft16p = (__half*)&shift2;
+        KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, *scaleft16p, *shiftft16p);
     }
     else{
         ShowNTErrors("TODO!");
......
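Both versions of the dispatch keep the isUnitScale/isZeroShift template flags on the DTYPE path: selecting the specialization at the call site lets the compiler delete the multiply and/or the add instead of testing scale and shift once per element. The pattern reduced to its core, with a hypothetical kernel name:

/* sketch: compile-time flags remove dead arithmetic in each specialization */
template<bool isUnitScale, bool isZeroShift>
__global__
void ScaleShiftSketch(float * a, float * b, int size, float scale, float shift)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size) {
        float v = isUnitScale ? a[i] : a[i] * scale;   /* multiply elided when isUnitScale */
        b[i] = isZeroShift ? v : v + shift;            /* add elided when isZeroShift */
    }
}

Host code then branches once on the runtime values of scale and shift and launches the matching specialization, exactly as _CudaScaleAndShift does above.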