Commit 3501c0fa by ltb

merge reducemax(float16) reducesum(float16) div(float16) negate(float16)…

merge reducemax(float16) reducesum(float16) div(float16) negate(float16) multiplydidm(float16) matrixmul(all,float16/int8) sum(float16/int/int8) sign/sub/sumdim/subdim( float16)
parent fc5a630a
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "XNoder.h"
#include "XBackwardData.h"
#include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
namespace nts{
/* compute dE/dx of a node */
void XDataGrad::MakeGrad(XTensor * node, bool isEfficent)
{
    CheckNTErrors(node->grad != NULL, "No gradient found!");

    /* the type id stored on the incoming link selects the backward routine */
    const int opType = node->income.typeID;

    if (opType == GETANDSET_CONVERTDATATYPE) {
        GradConvertDataType(node, isEfficent);
        return;
    }

    if (opType == GETANDSET_INDEXTOONEHOT) {
        GradIndexToOnehot(node, isEfficent);
        return;
    }

    if (opType == GETANDSET_ONEHOTTOINDEX) {
        GradOnehotToIndex(node, isEfficent);
        return;
    }

    /* any other type id is not handled here */
    ShowNTErrors("TODO!");
}
/* indicates whether the node is for a data operation */
bool XDataGrad::IsDataOP(XTensor * node)
{
    /* true when the type id of the incoming link shares at least one bit with DATA_BASE */
    const int opType = node->income.typeID;
    return (opType & DATA_BASE) != 0;
}
/*
gradient computation for ConvertDataType
for
b = convertdatatype(a)
we have
dE/da += convertdatatype(dE/db)
>> node - the node (b) for backward computation
>> isEfficent - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
{
    CheckNTErrors(node->income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");

    /* the first tail of the incoming link is the tensor that was converted */
    XTensor * source = node->income.tails[0];
    XNoder::MakeGrad(source);

    /* convert the gradient of the node into a buffer created from the
       gradient of the source, then add the buffer to the source gradient */
    XTensor * converted = NewTensorBuf(source->grad, source->devID, source->mem);
    _ConvertDataType(node->grad, converted);
    _SumMe(source->grad, converted);
    DelTensorBuf(converted);

    node->visitMark = NODE_FINISHED;
}
/*
gradient computation for OnehotToIndex
for
b = OnehotToIndex(a)
we have
dE/da = IndexToOnehot(b)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
{
    XLink &income = node->income;

    /* the message names this operation (it previously said "IndexToOnehot") */
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for OnehotToIndex!");

    XTensor * input = income.tails[0];

    /* only the gradient tensor of the input is created here; nothing is
       accumulated into it by this routine */
    XNoder::MakeGrad(input);

    node->visitMark = NODE_FINISHED;
}
/*
gradient computation for IndexToOnehot
for
b = IndexToOnehot(a)
we have
dE/da = IndexToOnehot(b)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
{
    CheckNTErrors(node->income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");

    /* make sure the index tensor (first tail) has a gradient tensor;
       no value is added to it by this routine */
    XTensor * indexTensor = node->income.tails[0];
    XNoder::MakeGrad(indexTensor);

    node->visitMark = NODE_FINISHED;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -52,7 +52,15 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
_SetDataFixed(dedy, 1.0F);
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
......@@ -136,7 +144,15 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
LOSS_FUNCTION_NAME lossName)
{
if(gold == NULL){
_SetDataFixed(dedy, 1.0F);
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else{
ShowNTErrors("TODO");
}
return;
}
......
......@@ -735,7 +735,7 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);;
_Multiply(node->grad, a, b->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
......@@ -855,7 +855,6 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
......@@ -1319,7 +1318,7 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
//DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
......
......@@ -68,7 +68,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
bool XShapeGrad::IsShapeOP(XTensor * node)
{
    XLink &income = node->income;

    /* a shape operation is recognized by the SHAPE_BASE bits of the type id;
       the unreachable second return (which tested DATA_BASE) has been removed */
    return (income.typeID & SHAPE_BASE) != 0;
}
/* post processing of a node */
......@@ -271,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
XTensor * last = NULL;
XList smalls(income.tailNum);
XList smallsGrad(income.tailNum);
TensorList smalls(income.tailNum);
TensorList smallsGrad(income.tailNum);
bool mergeOnly = true;
for(int i = 0; i < income.tailNum; i++){
XTensor * tail = income.tails[i];
......@@ -440,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
/* we compute the gradient for current node, rather than for
child node, i.e., we use the outgoing edge here */
XLink &outgo = node->outgo;
XList splits(outgo.tailNum);
TensorList splits(outgo.tailNum);
int whereToSplit = -1;
int splitNum = 0;
......
......@@ -54,7 +54,7 @@ private:
static
void GradGather(XTensor * node, bool isEfficent);
/* gradient computation for dropout with indexs */
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
......
......@@ -37,16 +37,16 @@ struct XNet
unsigned int id;
/* tensor nodes of the network (in order) */
XList nodes;
TensorList nodes;
/* tensor nodes to keep gradient for output (e.g., SGD)*/
XList gradNodes;
TensorList gradNodes;
/* output nodes of the network */
XList outputs;
TensorList outputs;
/* input nodes of the network */
XList inputs;
TensorList inputs;
/* indicates whether the network just keeps the gradient for parameter tensors */
bool isGradEfficient;
......@@ -71,15 +71,15 @@ struct XNet
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......@@ -93,10 +93,10 @@ struct XNet
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
void Traverse(XList &roots);
void Traverse(TensorList &roots);
/* depth-first search given a node (Tarjan's algorithm for topological ordering) */
void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
void TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code);
/* dump network information */
void Dump(FILE * file);
......
......@@ -247,13 +247,13 @@ void Check(FNNModel &model)
/* make a hard copy of the fnn model */
void Copy(FNNModel &tgt, FNNModel &src)
{
InitTensor(&tgt.embeddingW, &src.embeddingW);
InitTensorV2(&tgt.embeddingW, &src.embeddingW);
for(int i = 0; i < MAX_HIDDEN_NUM; i++){
InitTensor(&tgt.hiddenW[i], &src.hiddenW[i]);
InitTensor(&tgt.hiddenB[i], &src.hiddenB[i]);
InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
}
InitTensor(&tgt.outputW, &src.outputW);
InitTensor(&tgt.outputB, &src.outputB);
InitTensorV2(&tgt.outputW, &src.outputW);
InitTensorV2(&tgt.outputB, &src.outputB);
tgt.n = src.n;
tgt.eSize = src.eSize;
......@@ -310,7 +310,7 @@ initialize a 1d tensor using the fnn model setting
*/
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
{
InitTensor1D(&tensor, num, X_FLOAT, model.devID, model.mem);
InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
}
/*
......@@ -322,7 +322,7 @@ initialize a 2d tensor using the fnn model setting
*/
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, model.devID, model.mem);
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
}
......@@ -449,6 +449,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* the gold standard */
XTensor gold;
/* the loss tensor */
XTensor lossTensor;
/* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
......@@ -466,6 +469,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* forward computation */
Forward(inputs, output, model, net);
/* backward computation to obtain gradients */
Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
......@@ -481,11 +486,13 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
///* this is implemented by multiply function */
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
lossTensor = CrossEntropy(output, gold);
/* automatic differentiation */
autoDiffer.Backward(output, gold, CROSSENTROPY);
autoDiffer.Backward(lossTensor);
//autoDiffer.Backward(output, gold, CROSSENTROPY);
/* update model parameters */
Update(model, grad, learningRate, true);
......@@ -493,8 +500,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* get probabilities */
float prob = GetProb(output, gold);
loss += -prob;
prob = ReduceSumAll(lossTensor);
loss += prob;
wordCount += ngramNum;
wordCountTotal += ngramNum;
......@@ -537,8 +546,8 @@ update the model parameters using the delta rule
*/
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
{
XList paraList(10);
XList gradList(10);
TensorList paraList(10);
TensorList gradList(10);
paraList.Add(&model.outputW);
paraList.Add(&model.outputB);
......@@ -595,14 +604,14 @@ get prediction probabilites of the gold words
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{
XTensor probs;
InitTensor(&probs, &output);
InitTensorV2(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&output, &gold, &probs);
/* probability of each word */
XTensor wprobs;
InitTensor1D(&wprobs, output.GetDim(0), output.dataType, output.devID, output.mem);
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
_ReduceSum(&probs, &wprobs, 1);
if(wordProbs != NULL)
_CopyValues(&wprobs, wordProbs);
......@@ -616,7 +625,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
/* probability for the batch */
XTensor result;
InitTensor1D(&result, 1, X_FLOAT, output.devID, output.mem);
InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
_ReduceSum(&probs, &result, 1);
return result.Get1D(0);
......@@ -718,7 +727,7 @@ The indexed cell is set to 1, and 0 otherwise.
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
tensor.SetZeroAll();
......@@ -765,7 +774,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
int batchSize = -1;
int n = model.n;
int depth = model.hDepth;
XList eList(n - 1);
TensorList eList(n - 1);
/* previoius n - 1 words */
for(int i = 0; i < n - 1; i++){
......@@ -783,7 +792,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(embedding, batchSize, model.eSize, model);
/* generate word embedding of position i:
embedding = input * w */
embedding = input * w */
_MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);
eList.Add(&net.embeddings[i]);
......@@ -811,7 +820,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* make a 2d tensor for the bias term */
XTensor b2D;
InitTensor(&b2D, &s);
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
/* introduce bias term:
......@@ -843,7 +852,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor b2D;
InitTensor(&b2D, &s);
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
......@@ -908,8 +917,8 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
XTensor dedsHidden;
XTensor dedxBottom;
if (depth > 0)
InitTensor(&dedsHidden, &dedx);
InitTensor(&dedxBottom, &net.embeddingCat);
InitTensorV2(&dedsHidden, &dedx);
InitTensorV2(&dedxBottom, &net.embeddingCat);
/* back-propagation from top to bottom in the stack of hidden layers
for each layer, h = f(s)
......@@ -943,11 +952,11 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
_CopyValues(&dedx, &gradPassed);
}
XList eList(n - 1);
TensorList eList(n - 1);
/* back-propagation for the embedding layer */
for (int i = 0; i < n - 1; i++) {
XTensor * dedy = NewTensor2D(batchSize, model.eSize, X_FLOAT, model.devID, model.mem);
XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
eList.Add(dedy);
}
......@@ -999,7 +1008,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
}
}
InitTensor1D(&words, size, X_INT, model.devID, model.mem);
InitTensor1DV2(&words, size, X_INT, model.devID);
words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
......@@ -1017,7 +1026,8 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
/*
......@@ -1036,7 +1046,7 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
XTensor hidden;
XTensor b;
XList inputList(n - 1);
TensorList inputList(n - 1);
for(int i = 0; i < n - 1; i++)
inputList.Add(inputs + i);
......@@ -1177,7 +1187,7 @@ void Test(const char * test, const char * result, FNNModel &model)
/* prediction probabilities */
XTensor probs;
InitTensor1D(&probs, ngramNum);
InitTensor1DV2(&probs, ngramNum);
/* get probabilities */
float prob = GetProb(output, gold, &probs);
......@@ -1200,7 +1210,6 @@ void Test(const char * test, const char * result, FNNModel &model)
}
fclose(file);
fclose(ofile);
double elapsed = GetClockSec() - startT;
......
......@@ -127,7 +127,7 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
XTensor q2;
XTensor v2;
XTensor kqv2;
XList split;
TensorList split;
kqv2 = MMul(kqv, wbig);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but i'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* upper bound on sequence length; parenthesized so that the macro expands
   as a single value inside larger expressions (e.g. x / MAX_SEQUENCE_LENGTH) */
#define MAX_SEQUENCE_LENGTH (1024 * 16)
/* node to keep batch information */
/* plain record describing one batch; all fields are integers and the
   struct declares no constructor, so fields are not initialized here */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
/* batch loader: holds the word/sequence buffers and the batching options,
   and declares the routines that fill the buffers and build batches.
   Only declarations appear here; the definitions live elsewhere. */
class T2TBatchLoader
{
public:
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* buffer of batch nodes */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of the batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another array of sequence lengths */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for the next sequence in the buffer */
int nextSeq;
/* offset for the next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of LMs */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* destructor */
~T2TBatchLoader();
/* initialization from command-line style arguments */
void Init(int argc, char ** argv);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag (defaults to true when called without an argument) */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences; takes the arguments of both LoadBatchLM and
LoadBatchMT plus isLM (presumably selects between the two - TODO confirm
against the definition) */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
};
}
#endif
\ No newline at end of file
......@@ -31,6 +31,10 @@ namespace transformer
/* constructor */
AttDecoder::AttDecoder()
{
    /* every layer array starts out empty so that the destructor can
       release them unconditionally */

    /* attention and its layer normalization */
    attentions = NULL;
    attLayerNorms = NULL;

    /* feed-forward network and its layer normalization */
    fnns = NULL;
    fnnLayerNorms = NULL;

    /* the "Ende" attention and its layer normalization */
    attentionsEnde = NULL;
    attEndeLayerNorms = NULL;
}
......@@ -38,6 +42,10 @@ AttDecoder::AttDecoder()
/* de-constructor */
AttDecoder::~AttDecoder()
{
/* release the per-layer arrays; the constructor sets all of them to NULL,
   and delete[] on a null pointer does nothing, so this is safe even if the
   arrays were never created (presumably they are allocated with new[] in
   InitModel - TODO confirm) */
delete[] attentions;
delete[] fnns;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
}
......@@ -68,7 +76,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, mem, false);
......@@ -159,6 +167,8 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
x.SetName(DECODING_NAME);
return x;
}
......
......@@ -26,6 +26,9 @@
namespace transformer
{
#define DECODING_NAME "decoding"
#define DECODING_INPUT_NAME "decoding_input"
class AttDecoder
{
......
......@@ -139,6 +139,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
x.SetName(ENCODING_NAME);
input.SetName(ENCODING_INPUT_NAME);
return x;
}
......
......@@ -32,6 +32,9 @@ using namespace nts;
namespace transformer
{
#define ENCODING_NAME "encoding"
#define ENCODING_INPUT_NAME "encoding_input"
/*
base class of the encoder
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence (for each entry)
*/
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
{
    XTensor ratio;
    XTensor penalty;

    /* (n + 5) / (1 + 5) for every entry of "length" */
    ratio = (length + 5)/(1 + 5);

    /* raise the ratio to the power of alpha */
    penalty = Power(ratio, alpha);

    return penalty;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -16,45 +16,33 @@
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#ifndef __XBACKWARDDATA_H__
#define __XBACKWARDDATA_H__
#include "../../tensor/XTensor.h"
namespace nts{
using namespace nts;
/* this class computes the gradient for tensor data operation given a node */
class XDataGrad
namespace transformer
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficent);
/* indicates whether the node is for a shaping operation */
static
bool IsDataOP(XTensor * node);
private:
/* gradient computation for ConverDataType: b = converdatatype(a, datatype) */
static
void GradConvertDataType(XTensor * node, bool isEfficent);
/* gradient computation for IndexToOnehot: b = indextoonehot(a, num) */
static
void GradIndexToOnehot(XTensor * node, bool isEfficent);
/* gradient computation for OnehotToIndex: b = onehottoindex(a, num) */
/* We intend to penalize short sequences because they have higher score
in product of a sequence of probability-like terms and have more chances
to beat others in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static
void GradOnehotToIndex(XTensor * node, bool isEfficent);
XTensor GNMT(const XTensor & length, float alpha);
};
} // namespace nts(NiuTrans.Tensor)
}
#endif
\ No newline at end of file
#endif
......@@ -23,6 +23,7 @@
#include "T2TModel.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/XUtility.h"
namespace transformer
{
......@@ -44,10 +45,13 @@ T2TModel::T2TModel()
/* de-constructor */
T2TModel::~T2TModel()
{
    delete encoder;
    delete decoder;
    delete outputLayer;

    /* "mem" is deleted at the end, and exactly once, because other members
       are using it; deleting it both before and after the sub-modules (as
       the two "delete mem" statements did) is a double delete */
    delete mem;
}
/*
......@@ -81,7 +85,7 @@ void T2TModel::InitModel(int argc, char ** argv)
if(isMT)
decoder->InitModel(argc, argv, true, 0, devID, mem);
XList params(10);
TensorList params(10);
GetParams(params);
for(int i = 0; i < params.count; i++){
......@@ -359,12 +363,17 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
/* An upper triangular matrix where the cells of the upper triangular are set to -1e-9.
This matrix can be used to block the attention to current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
//maskDec.Dump(stderr, "mask: ");
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
//maskDec.Dump(stderr, "mask: ");
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
......@@ -374,8 +383,17 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
//paddingEnc.Dump(stderr, "paddingenc:");
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
......@@ -385,7 +403,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
get parameter matrics
>> list - the list that keeps the parameter matrics
*/
void T2TModel::GetParams(XList &list)
void T2TModel::GetParams(TensorList &list)
{
list.Clear();
list.Add(&outputLayer->w);
......@@ -442,10 +460,12 @@ dump the parameters
*/
void T2TModel::Dump(const char * fn)
{
double startT = GetClockSec();
FILE * file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
TensorList params(100);
GetParams(params);
......@@ -456,16 +476,20 @@ void T2TModel::Dump(const char * fn)
fclose(file);
XPRINT(0, stderr, "[INFO] model saved\n");
double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model saved (took %.1fs)\n", elapsed);
}
/* read the parameters */
void T2TModel::Read(const char * fn)
{
double startT = GetClockSec();
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
TensorList params(100);
GetParams(params);
......@@ -476,7 +500,9 @@ void T2TModel::Read(const char * fn)
fclose(file);
XPRINT(0, stderr, "[INFO] model loaded\n");
double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
}
}
......@@ -98,7 +98,7 @@ public:
XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrics */
void GetParams(XList &list);
void GetParams(TensorList &list);
/* dump the parameters */
void Dump(const char * fn);
......
......@@ -95,6 +95,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
output.SetName(OUTPUT_NAME);
}
}
......@@ -28,6 +28,8 @@ using namespace nts;
namespace transformer
{
#define OUTPUT_NAME "output"
/* output layer */
class T2TOutput
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TStateBundle::T2TStateBundle()
{
    /* no states are allocated until MakeStates() is called */
    states = NULL;

    /* keep the counter consistent with the empty state array instead of
       leaving it uninitialized */
    stateNum = 0;

    isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
    /* delete[] on a null pointer is a no-op, so no explicit check is needed */
    delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if(states != NULL)
delete[] states;
states = new T2TState[num];
for(int i = 0; i < num; i++){
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
    /* -1 marks "no start symbol set yet"; see SetStartSymbol() */
    startSymbol = -1;

    /* the model and the state bundle are attached later by Read();
       start from NULL rather than from indeterminate pointers */
    m = NULL;
    s = NULL;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
{
/* nothing to release: this destructor frees none of the members */
}
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
{
    /* start from empty layer lists */
    state->layersEnc.Clear();
    state->layersDec.Clear();

    /* look up the node named ENCODING_NAME starting from the top node */
    XTensor * encoding = XLink::SearchNode(top, ENCODING_NAME);
    CheckNTErrors(encoding != NULL, "No encoding layers found!");

    state->layersEnc.Add(encoding);
    state->layersDec.Add(NULL);

    /* same shape as the input, except that the last dimension is the beam size */
    const int order = input->order;
    int dims[MAX_TENSOR_DIM_NUM];
    for (int d = 0; d < order - 1; d++)
        dims[d] = input->GetDim(d);
    dims[order - 1] = beamSize;

    InitTensor(&state->probPath, order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
    InitTensor(&state->nstep, order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
    InitTensor(&state->endMark, order, dims, X_INT, 1.0F, input->devID, input->mem);

    /* all three tensors start at zero */
    state->probPath.SetZeroAll();
    state->nstep.SetZeroAll();
    state->endMark.SetZeroAll();

    state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
Only the two pointers are stored; nothing is copied.
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
{
m = model;
s = state;
}
/*
predict the next state
The model and the current state bundle must have been attached by Read()
beforehand: both m and s are dereferenced here without a check.
>> next - next states (assuming that the current state has been read).
          Its "prob" tensor receives the output of this step
>> encoding - encoder output
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder
*/
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor * inputEnc, XTensor * paddingEnc)
{
int dims[MAX_TENSOR_DIM_NUM];
next->layersEnc.Clear();
next->layersDec.Clear();
AttDecoder &decoder = *m->decoder;
/* word indices of previous positions */
/* this pointer is only compared with NULL below; it is never dereferenced here */
XTensor * inputLast = (XTensor*)s->layersDec.GetItem(0);
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
/* "first" has the shape of the encoder input with the last dimension set to 1 */
for(int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID, inputEnc->mem);
_SetDataFixedInt(&first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) {
/* first step: the decoder input is the start symbol only */
inputDec = Identity(first);
}
else{
/* later steps: start symbol followed by the paths collected from the current states */
inputDec = GeneratePaths(s);
inputDec.SetDevice(inputEnc->devID, inputEnc->mem);
inputDec = Concatenate(first, inputDec, inputDec.order - 1);
}
/* prediction probabilities */
/* "output" aliases next->prob, so the result is written into the next bundle */
XTensor &output = next->prob;
XTensor decoding;
XTensor decodingStep;
/* the decoder padding has the same shape as the decoder input */
for(int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID, paddingEnc->mem);
/* every entry is set to 1 (presumably "no padded position" - TODO confirm) */
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
/* NOTE(review): the last argument is presumably the "is training" flag - confirm */
decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
XTensor selectSrc;
XTensor selectTgt;
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
/* size of the second-to-last dimension; index stride - 1 is its last entry */
int stride = decoding.GetDim(decoding.order - 2);
InitTensor1D(&selectSrc, 1, X_INT);
InitTensor1D(&selectTgt, 1, X_INT);
selectSrc.SetInt(stride - 1, 0);
selectTgt.SetInt(0, 0);
selectSrc.SetDevice(decoding.devID, decoding.mem);
selectTgt.SetDevice(decoding.devID, decoding.mem);
/* the decoder output of the last position */
decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
/* generate the output probabilities */
m->outputLayer->Make(decodingStep, output);
/* NOTE(review): presumably takes the log of "output" in place - confirm */
_LogMe(&output);
next->layersEnc.AddList(&s->layersEnc);
/* NOTE(review): inputDec is a local object of this function, so the pointer
stored here no longer refers to a live tensor once Predict() returns. Within
this function entry 0 is only tested against NULL (see inputLast above);
confirm that no caller dereferences it. */
next->layersDec.Add(&inputDec);
next->layersDec.Add(&output);
}
/*
generate paths up to the states of the current step
Each state is followed back through its "last" pointers. The predictions
found on the way are written into one row of the result, right-aligned,
so that the prediction of the state itself sits in the last column.
>> state - state bundle of the current step
<< return - an integer tensor of size (number of states, length of the longest path)
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
{
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    const int num = state->stateNum;

    /* length of the longest back-pointer chain */
    int longest = -1;
    for (int k = 0; k < num; k++) {
        int length = 0;
        for (const T2TState * p = state->states + k; p != NULL; p = p->last)
            length++;
        longest = (length > longest) ? length : longest;
    }

    XTensor path;
    InitTensor2D(&path, num, longest, X_INT);
    path.SetZeroAll();

    /* fill each row from the right: the newest prediction goes to the last column */
    for (int k = 0; k < num; k++) {
        int column = longest;
        for (const T2TState * p = state->states + k; p != NULL; p = p->last) {
            column--;
            path.Set2DInt(p->prediction, k, column);
        }
    }

    return path;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
/* problem id of an empty hypothesis (see the comment on T2TState::pid) */
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), the scores of the
prediction, etc. It can be regarded as a hypothesis in translation.
The class has no constructor, so the members carry no default values;
T2TStateBundle::MakeStates() assigns all of them after allocation. */
class T2TState
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 (T2T_PID_EMPTY) */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we go over so far */
int nstep;
/* pointer to the previous state (NULL for the first state of a path) */
T2TState * last;
};
/* a bundle of states. It keeps the tensors of one search step together
with the array of T2TState objects (hypotheses) of that step. */
class T2TStateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
XTensor nstep;
/* layers on the encoder side. We actually use the encoder output instead
of all hidden layers. */
TensorList layersEnc;
/* layers on the decoder side */
TensorList layersDec;
/* list of states. The array is allocated with new[] by MakeStates(), which
also deletes the array of a previous call.
NOTE(review): the class declares no copy constructor or assignment, so a
copied bundle would share this pointer - confirm bundles are never copied. */
T2TState * states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
T2TStateBundle();
/* de-constructor */
~T2TStateBundle();
/* create states: allocate "num" states (num > 0) and reset all their fields */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model (set by Read()) */
T2TModel * m;
/* current state (set by Read()) */
T2TStateBundle * s;
/* start symbol (-1 until SetStartSymbol() is called) */
int startSymbol;
public:
/* constructor */
T2TPredictor();
/* de-constructor */
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state: remember the model and the state bundle used by Predict() */
void Read(T2TModel * model, T2TStateBundle * state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "T2TModel.h"
#include "T2TPredictor.h"
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos;
/* array of the end symbols */
int * endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TSearch();
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char ** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding,
XTensor * output, XTensor * score);
/* preparation */
void Prepare(int myBatchSize,int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
/* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle * beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output, XTensor * score);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle * beam);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
using namespace nts;
namespace transformer
{
/* constructor.
   Nothing is set here: vSize and vSizeTgt get their values in Init(). */
T2TTester::T2TTester()
{
}
/* de-constructor (empty body: the tester releases nothing explicitly) */
T2TTester::~T2TTester()
{
}
/*
initialize the model
Reads the vocabulary sizes from the arguments and forwards the arguments
to the batch loader and the searcher.
>> argc - number of arguments
>> argv - list of arguments
*/
void T2TTester::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 1);
/* vSize is passed as the last argument (presumably the default value - confirm),
so "vsize" must be loaded before "vsizetgt" */
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
batchLoader.Init(argc, argv);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
float loss = 0;
/* data files */
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
double startT = GetClockSec();
wordCount = 0;
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
batchLoader.SetRandomBatch(false);
batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, mem, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
CheckNTErrors(!model->isLM, "Only MT model is supported!");
XTensor output;
XTensor score;
seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
Dump(ofile, &output);
float prob = 0;
loss += -prob;
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
elapsed,wordCountTotal, sentCount, exp(loss/wordCount));
}
/*
dump the result into the file
The tensor is read as consecutive rows whose length is the size of the
last dimension. Each row is written as one line of space-separated
integers; a row stops right after its first negative value, which is
printed too.
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
{
    const int rowLength = output->GetDim(-1);

    int rowBegin = 0;
    while (rowBegin < output->unitNum) {
        bool ended = false;
        int offset = 0;
        while (offset < rowLength && !ended) {
            const int token = output->GetInt(rowBegin + offset);
            fprintf(file, "%d ", token);

            /* a negative value marks the end of the sequence */
            ended = token < 0;
            offset++;
        }
        fprintf(file, "\n");
        rowBegin += rowLength;
    }
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#include "T2TSearch.h"
#include "T2TBatchLoader.h"
namespace transformer
{
/* This class translates test sentences with a trained model. */
class T2TTester
{
public:
/* vocabulary size of the source side (parameter "vsize") */
int vSize;
/* vocabulary size of the target side (parameter "vsizetgt") */
int vSizeTgt;
/* for batching */
T2TBatchLoader batchLoader;
/* decoder for inference (the searcher) */
T2TSearch seacher;
public:
/* constructor */
T2TTester();
/* de-constructor */
~T2TTester();
/* initialize the model */
void Init(int argc, char ** argv);
/* test the model: decode the file "fn" and write the result to "ofn" */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* dump the result into the file */
void Dump(FILE * file, XTensor * output);
};
}
#endif
\ No newline at end of file
......@@ -23,35 +23,14 @@
#define __T2TTRAINER_H__
#include "T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../tensor/function/FHeader.h"
#define MAX_SEQUENCE_LENGTH 1024 * 4
using namespace nts;
namespace transformer
{
/* node to keep batch information */
struct BatchNode
{
/* begining position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
/* trainer of the T2T model */
class T2TTrainer
{
......@@ -61,42 +40,6 @@ public:
/* parameter array */
char ** argArray;
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* dimension size of each inner layer */
int d;
......@@ -139,10 +82,10 @@ public:
float adamBeta2T;
/* list of the moments of the parameter matrices */
XList moments;
TensorList moments;
/* list of the 2nd-order moments of the parameter matrices */
XList moments2nd;
TensorList moments2nd;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
......@@ -158,26 +101,15 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* bucket size */
int bucketSize;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* for batching */
T2TBatchLoader batchLoader;
public:
/* constructor */
......@@ -197,46 +129,6 @@ public:
/* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
......
......@@ -25,6 +25,8 @@
#include "T2TModel.h"
#include "T2TUtility.h"
#include "T2TTrainer.h"
#include "T2TPredictor.h"
#include "T2TTester.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
......@@ -47,6 +49,7 @@ int TransformerMain(int argc, const char ** argv)
ShowParams(argc, args);
bool isBeamSearch = false;
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
......@@ -56,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
srand((unsigned int)time(NULL));
......@@ -64,28 +68,35 @@ int TransformerMain(int argc, const char ** argv)
T2TModel model;
model.InitModel(argc, args);
//if(strcmp(modelFN, ""))
// model.Read(modelFN);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
//model.Dump(modelFN);
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
/* load the model if necessary */
//if(strcmp(modelFN, ""))
//model.Read(modelFN);
T2TTrainer tester;
tester.Init(argc, args);
if(strcmp(modelFN, ""))
model.Read(modelFN);
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, ""))
tester.Test(testFN, outputFN, &model);
if(strcmp(testFN, "") && strcmp(outputFN, "")){
/* beam search */
if(isBeamSearch){
T2TTester searcher;
searcher.Init(argc, args);
searcher.Test(testFN, outputFN, &model);
}
/* forced decoding */
else{
T2TTrainer tester;
tester.Init(argc, args);
tester.Test(testFN, outputFN, &model);
}
}
delete[] trainFN;
delete[] modelFN;
......
......@@ -201,7 +201,8 @@ void XDevice::SetGPUDevice(int devID)
cudaError_t error = cudaSetDevice(devID);
if (error != cudaSuccess){
fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n",
devID, error, cudaGetErrorString(error));
exit(1);
}
#else
......@@ -216,7 +217,7 @@ void XDevice::SetGPUDeviceFast(int devID)
SetFastFlags();
}
/* switch to a get current dev */
/* get the id of the current GPU device */
int XDevice::GetGPUDevice()
{
#ifdef USE_CUDA
......@@ -224,7 +225,8 @@ int XDevice::GetGPUDevice()
cudaError_t error = cudaGetDevice(&devID);
if (error != cudaSuccess){
fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n",
devID, error, cudaGetErrorString(error));
exit(1);
}
......@@ -248,7 +250,7 @@ void XDevice::SetFastFlags()
#endif
}
/* reset cuda flag for more efficient cuda execution (all devices) */
/* reset the cuda flag for more efficient cuda execution (all devices) */
void XDevice::SetFastFlagsAllDevices()
{
#ifdef USE_CUDA
......@@ -266,10 +268,6 @@ XDevManager::XDevManager()
{
Clear();
Init();
#ifndef USE_CPP11
fprintf(stderr, "Warning!!! c++ 11 is RECOMMENDED for compilation.\n");
#endif
}
/* de-constructor */
......@@ -278,7 +276,7 @@ XDevManager::~XDevManager()
}
/* initialize it and get the CPU and GPU information */
/* initialization */
void XDevManager::Init()
{
srand((unsigned int)time(NULL));
......@@ -322,7 +320,7 @@ void XDevManager::Clear()
#ifdef USE_CUDA
/* get the handle of GPU */
/* get the handle of a given GPU */
cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
......@@ -330,7 +328,7 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle();
}
/* get the stream of cuda */
/* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
......@@ -478,7 +476,7 @@ split a string
>> items - splitting result
<< return - how many items are there
*/
int SplitALine(char * inputString, const char * seperator, XList * items)
int SplitALine(char * inputString, const char * seperator, StrList* items)
{
items->Clear();
......@@ -527,12 +525,12 @@ get device ids for the given device information
devInfo = "0:CPU-1 1:GPU-0 2:CPU-1"
means that the first device is CPU, the second device
is GPU-0, the third device is CPU.
>> devIDs - device sequence specified by devInfo
>> devIDs - device IDs specified by devInfo
<< return - number of devices
*/
int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
{
XList * terms = new XList(1);
StrList* terms = new StrList(1);
SplitALine(devInfo, " ", terms);
for(int i = 0; i < terms->count; i++){
......@@ -569,7 +567,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
return devCount;
}
/* show id sequence */
/* show device IDs */
void XDevManager::ShowDeviceIDs(char * devInfo, char * msg)
{
msg[0] = 0;
......
......@@ -236,6 +236,18 @@ extern XDevManager GDevs;
cudaSetDevice(devIDBackup); \
} \
/* check that two device ids refer to the same device and exit(1) otherwise.
   Two negative ids are accepted as matching (negative ids presumably denote
   the CPU - confirm); a negative and a non-negative id, or two different
   non-negative ids, are reported as an error.
   Each argument is evaluated exactly once. */
#define CheckDev(a, b) \
{ \
    const int checkDevA_ = (a); \
    const int checkDevB_ = (b); \
    if(((checkDevA_ < 0) != (checkDevB_ < 0)) || (checkDevA_ >= 0 && checkDevA_ != checkDevB_)){ \
        fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, checkDevA_, checkDevB_); \
        exit(1); \
    } \
}
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -32,8 +32,6 @@
#ifndef WIN32
#include <sys/time.h>
#include <unistd.h>
#include <stdint.h>
typedef int8_t __int8;
#endif
// the CUDA stuff
......@@ -45,10 +43,6 @@ typedef int8_t __int8;
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
#define USE_CPP11
#endif
#define _XINLINE_
//#define DOUBELPRICSION
......@@ -159,7 +153,9 @@ extern bool useCUDA;
#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
#define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}
#define B2I(V) V==0?false:true
/* map a value to bool: 0 -> false, anything else -> true.
   The argument and the whole expansion are parenthesized so that the macro
   can be used with expression arguments and inside larger expressions. */
#define B2I(V) ((V) == 0 ? false : true)
/* floored modulo: a - floor(a / b) * b, i.e. the result has the sign of b.
   Returns a itself when b is 0. The arguments are parenthesized so that
   expression arguments (e.g. MODX(i + 1, n - 1)) expand correctly. */
#define MODX(a, b) (int((b) == 0 ? (a) : (a) - floor(double(a)/(b)) * (b)))
/* BLAS interfaces */
#ifdef DOUBELPRICSION
......
......@@ -31,15 +31,15 @@ namespace nts{
/* constructor */
template<HeapType hType, typename T>
XHeap<hType, T>::XHeap()
{
}
/* constructor */
template<HeapType hType, typename T>
XHeap<hType, T>::XHeap(int mySize, XMem * myMem)
{
mem = myMem;
size = mySize;
count = 0;
if (mem == NULL)
items = new HeapNode<T>[mySize];
else
mem->Alloc(mem->devID, mySize * sizeof(T));
Init(mySize, myMem);
}
/* deconstructor */
......@@ -50,6 +50,19 @@ XHeap<hType, T>::~XHeap()
}
/*
initialization
>> mySize - capacity of the heap (number of nodes)
>> myMem - memory pool for the node array (NULL means that new[] is used)
*/
template<HeapType hType, typename T>
void XHeap<hType, T>::Init(int mySize, XMem * myMem)
{
    mem = myMem;
    size = mySize;
    count = 0;

    if (mem == NULL)
        items = new HeapNode<T>[mySize];
    else {
        /* keep the buffer returned by the pool: without the assignment "items"
           is never set on this path. The array holds HeapNode<T> objects, so
           it is sized by the node type rather than by T.
           NOTE(review): memory taken from the pool must not be released with
           delete[] - confirm the destructor handles the mem != NULL case. */
        items = (HeapNode<T>*)mem->Alloc(mem->devID, mySize * sizeof(HeapNode<T>));
    }
}
template<HeapType hType, typename T>
void XHeap<hType, T>::Clear(T initValue)
{
count = 0;
......@@ -89,10 +102,24 @@ _XINLINE_ HeapNode<T> XHeap<hType, T>::End()
template<HeapType hType, typename T>
_XINLINE_ void XHeap<hType, T>::Push(HeapNode<T> node)
{
//CheckNTErrors((count < size), "Heap is full!");
items[count] = node;
Up(count);
count++;
if (count < size) {
items[count] = node;
Up(count);
count++;
}
else if(count == size){
HeapNode<T> & item0 = items[0];
if (hType == MIN_HEAP && item0.value >= node.value)
return;
else if (hType == MAX_HEAP && item0.value <= node.value)
return;
items[0] = node;
Down(0);
}
else {
ShowNTErrors("Overflow of the heap!");
}
}
/* replace the top-most item and update the heap */
......@@ -107,7 +134,7 @@ _XINLINE_ void XHeap<hType, T>::ReplaceTop(HeapNode<T> node)
template<HeapType hType, typename T>
_XINLINE_ HeapNode<T> XHeap<hType, T>::Pop()
{
//CheckNTErrors((size > 0), "Empty heap!");
CheckNTErrors(count > 0, "Empty heap!");
HeapNode<T> node = items[0];
items[0] = items[count - 1];
count--;
......
......@@ -39,7 +39,7 @@ template <typename T>
struct HeapNode
{
/* node index */
int index;
long long index;
/* value of the node */
T value;
......@@ -52,9 +52,16 @@ struct HeapNode
HeapNode(int i, T v)
{
index = i;
index = (long long)i;
value = v;
};
HeapNode(void * i, T v)
{
index = (long long)i;
value = v;
}
};
/* a heap that keeps a data array of T */
......@@ -76,11 +83,17 @@ public:
public:
/* constructor */
XHeap();
/* constructor */
XHeap(int mySize, XMem * myMem = NULL);
/* deconstructor */
~XHeap();
/* initialization */
void Init(int mySize, XMem * myMem = NULL);
/* clear the data */
void Clear(T initValue);
......@@ -107,6 +120,9 @@ public:
/* move item k up the tree */
void Up(int k);
/* how many items are kept in the heap */
inline int Count() { return count; };
};
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -300,9 +300,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
if(h == NULL)
return;
XList list(2);
list.Add(t1);
list.Add(t2);
TensorList list(2);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
MakeLink(&list, h, id);
}
......@@ -320,10 +320,10 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
if (h == NULL)
return;
XList list(3);
list.Add(t1);
list.Add(t2);
list.Add(t3);
TensorList list(3);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
list.Add((XTensor*)t3);
MakeLink(&list, h, id);
}
......@@ -334,7 +334,7 @@ create a hyper edge with a list of tensors and a output tensor
>> h - head tensor
>> id - id of the edge type
*/
void XLink::MakeLink(const XList * list, XTensor * h, int id)
void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
{
/* forward */
XLink &income = h->income;
......@@ -368,7 +368,7 @@ create a hyper edge with a input tensors and a list of output tensors
>> list - a list of output tensors
>> id - id of the edge type
*/
void XLink::MakeLink(XTensor * t, XList * list, int id)
void XLink::MakeLink(XTensor * t, TensorList * list, int id)
{
/* forward */
for(int i = 0; i < list->count; i++){
......@@ -624,7 +624,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
ClearIncoming(target);
int tailNum = reference->income.tailNum;
XList tails(tailNum);
TensorList tails(tailNum);
for(int i = 0; i < tailNum; i++){
XTensor * tail = (XTensor*)reference->income.tails[i];
tails.Add(tail);
......@@ -743,7 +743,7 @@ search for a node in a top-down manner by its name
>> top - the top most node
<< return - the node we found
*/
/*XTensor * XLink::SearchNode(XTensor * top, const char * name)
XTensor * XLink::SearchNode(XTensor * top, const char * name)
{
if(!strcmp(top->name, name))
return top;
......@@ -758,7 +758,7 @@ search for a node in a top-down manner by its name
}
return NULL;
}*/
}
} // namespace nts(NiuTrans.Tensor)
......
......@@ -138,17 +138,17 @@ struct XLink
static
void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
/* create a hyper edge with two input tensors and a output tensor */
/* create a hyper edge with three input tensors and a output tensor */
static
void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
/* create a hyper edge with a list of input tensors and a output tensor */
static
void MakeLink(const XList * list, XTensor * h, int id);
void MakeLink(const TensorList * list, XTensor * h, int id);
/* create a hyper edge with a input tensors and a list of output tensors */
static
void MakeLink(XTensor * h, XList * list, int id);
void MakeLink(XTensor * h, TensorList * list, int id);
/* add a parameter */
static
......@@ -191,8 +191,8 @@ struct XLink
void ShowNode(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
static
XTensor * SearchNode(XTensor * top, const char * name);
};
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,32 +15,31 @@
* limitations under the License.
*/
/*
*
* Implementation of list that keeps data items
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
* The first coding job this year!
*
*/
#ifndef __XLIST_H__
#define __XLIST_H__
/*
*
* Implementation of template list that keeps data items
*
* $Created by: HU Chi (huchinlp@foxmail.com)
*
*/
#include "XMem.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#ifndef __TensorList_H__
#define __TensorList_H__
typedef int (* ListCompare)(const void * item1, const void * item2);
/* the XList class */
class XList
{
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* the TensorListBase class */
template <typename T>
struct TensorListBase {
public:
/* data items */
void ** items;
T *items;
/* number of items */
int count;
......@@ -49,56 +48,88 @@ public:
int maxNum;
/* the memory pool for data array allocation */
XMem * mem;
/* indicates whether data items are integers */
bool isIntList;
XMem* mem;
public:
/* constructor */
XList();
TensorListBase();
/* constructor */
XList(int myMaxNum, bool isIntListOrNot = false);
TensorListBase(int myMaxNum);
/* constructor */
XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false);
TensorListBase(int myMaxNum, XMem* myMem);
/* de-constructor */
~XList();
/* utilities */
void Create(int myMaxNum, XMem * myMem);
void Add(const void * item);
void Add(void ** inputItems, int inputItemCount);
void AddList(XList * l);
void AddInt(int i);
void Insert(int pos, void * item);
void * GetItem(int i) const;
int GetItemInt(int i);
void SetItem(int i, void * item);
void SetItemInt(int i, int item);
int FindFirst(void * item);
~TensorListBase();
/* add an item into the list */
void Add(T&& item);
/* add an item into the list */
void Add(const T& item);
/* add a number of items into the list */
void Add(T* inputItems, int inputItemCount);
/* append a list to the current list */
void AddList(TensorListBase* l);
/* insert an item to the given position of the list */
void Insert(int pos, const T& item);
/* insert an item to the given position of the list */
void Insert(int pos, T&& item);
/* get the item at position i */
T& GetItem(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* find the position of the first matched item */
int FindFirst(const T& item);
/* clear the data array */
void Clear();
void ClearStringList();
void Sort(int itemSize, ListCompare comp);
/* sort the list */
void Sort(int itemSize);
/* reverse the list */
void Reverse();
/* remove the item at position i */
void Remove(int i);
XList * Copy(XMem * myMem);
/* copy the list */
TensorListBase* Copy(XMem* myMem);
/* shuffle the list */
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
_XINLINE_ void * Get(int i) {return GetItem(i);};
_XINLINE_ int GetInt(int i) {return GetItemInt(i);};
_XINLINE_ void Set(int i, void * item) {SetItem(i, item);};
_XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);};
T& operator[] (int i) {
return GetItem(i);
};
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
};
extern XList NULLList;
struct XTensor;
typedef TensorListBase<int> IntList;
typedef TensorListBase<char> CharList;
typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
typedef TensorListBase<void*> XList;
typedef TensorListBase<XTensor*> TensorList;
}
/* end of the nts (NiuTrans.Tensor) namespace */
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
#endif // __TensorList_H__
......@@ -34,6 +34,11 @@ namespace nts{
int testxmemid = 0;
void * recordp = NULL;
/*
for managing the memories
*/
XMemManager GMems;
XMem * GMem;
/* constructor */
......@@ -48,6 +53,7 @@ XMem::XMem()
strcpy(name, "xmem");
signature = 0;
mergeFreeOTF = true;
isInitialized = false;
}
/*
......@@ -58,7 +64,7 @@ constructor
>> myMode - mode of running the memory pool
UNI_FREE: free all the space at the end of using the memory pool
FREE_ON_THE_FLY: normal "malloc" and "free" mode
>> myBlockSize - size of memory block
>> myBlockSize - size of a memory block
>> myBlockNum - number of memory blocks
>> myBufSize - size of buffer
*/
......@@ -103,7 +109,7 @@ initialize it
>> myMode - mode of running the memory pool
UNI_FREE: free all the space at the end of using the memory pool
FREE_ON_THE_FLY: normal "malloc" and "free" mode
>> myBlockSize - size of memory block
>> myBlockSize - size of a memory block
>> myBlockNum - number of memory blocks
>> myBufSize - size of buffer
*/
......@@ -164,6 +170,7 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
#endif
signature++;
isInitialized = true;
}
/* free memory */
......@@ -216,9 +223,9 @@ void XMem::Free(int myDevID, void * mem)
}
}
/*
get signature
<< return - return the signature
/*
get the signature
<< return - the signature
*/
MTYPE XMem::GetSignature()
{
......@@ -226,7 +233,7 @@ MTYPE XMem::GetSignature()
}
/*
use string as the name of the memory pool
set the name of the memory pool
>> myName - name of the memory pool
*/
void XMem::SetName(const char * myName)
......@@ -259,7 +266,7 @@ void XMem::SetDevice(int myDevID)
}
/*
switch to the device (with fast cuda execution mode) we want to work
switch to the device (with fast cuda execution mode) we intend to work on
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
*/
void XMem::SetDeviceFast(int myDevID)
......@@ -275,7 +282,7 @@ void XMem::SetDeviceFast(int myDevID)
}
/*
run in static mode
run in the static mode
>> myIsStatic - specify if the memory allocation is static
*/
void XMem::SetStaticMode(bool myIsStatic)
......@@ -1488,4 +1495,179 @@ cublasHandle_t * XMem::GetCublasHandle()
#endif
/* constructor: the whole set-up is delegated to Initialize() */
XMemManager::XMemManager()
{
    this->Initialize();
}
/* de-constructor */
XMemManager::~XMemManager()
{
    /* nothing is released explicitly here */
}
/*
get the size of the host (CPU) memory that can be used
<< return - the memory size, or 0 if it cannot be determined
*/
MTYPE XMemManager::GetAvailableMemory()
{
    /* keep the size in a 64-bit variable: "unsigned long" has only 32 bits
       on Windows and would truncate sizes of 4GB and above */
    unsigned long long freeMem = 0;

#if __APPLE__
    /* NOTE(review): HW_MEMSIZE looks like the total physical memory rather
       than the free amount - confirm that this is what we want on Mac */
    int mib[2] = {CTL_HW, HW_MEMSIZE};
    unsigned int namelen = sizeof(mib) / sizeof(mib[0]);
    unsigned long long size = 0;
    size_t len = sizeof(size);
    if (sysctl(mib, namelen, &size, &len, NULL, 0) < 0){
        ShowNTErrors("Cannot get memory size on Mac!");
    }
    else{
        freeMem = size;
    }
#elif _WIN32
    MEMORYSTATUSEX memoryStatus;
    memoryStatus.dwLength = sizeof(memoryStatus);
    if (GlobalMemoryStatusEx(&memoryStatus)){
        freeMem = memoryStatus.ullAvailPhys;
    }
#else
    long pages = sysconf(_SC_AVPHYS_PAGES);
    long page_size = sysconf(_SC_PAGE_SIZE);

    /* sysconf returns -1 on failure; the product is computed in 64 bits
       so that it cannot overflow a 32-bit "long" */
    if (pages > 0 && page_size > 0)
        freeMem = (unsigned long long)pages * (unsigned long long)page_size;
#endif

    return (MTYPE)freeMem;
}
/*
get the size of the free memory on a GPU
>> devID - id of the GPU device
<< return - the free memory size reported by cudaMemGetInfo
            (always 0 if the code is compiled without USE_CUDA)
*/
MTYPE XMemManager::GetAvailableGPUMemory(int devID)
{
    size_t freeSize = 0;

#ifdef USE_CUDA
    size_t totalSize = 0;

    /* the query is issued on the device that is selected here */
    cudaSetDevice(devID);

    const bool isQueried = (cudaMemGetInfo(&freeSize, &totalSize) == cudaSuccess);
    if (!isQueried){
        XPRINT(0, stderr, "cannot get GPU memory information.");
        exit(1);
    }
#endif

    return (MTYPE)freeSize;
}
/*
choose the buffer size of a memory pool according to the free memory
>> freeMem - size of the free memory
>> myBufSize - where the chosen buffer size is written
               (0 if freeMem is below MILLION * 128)
*/
void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
{
    /* walk up the thresholds and stop at the first one that is not reached;
       every step doubles both the threshold and the buffer size */
    *myBufSize = 0;
    if (freeMem < MILLION * 128)
        return;

    *myBufSize = MILLION * 32;
    if (freeMem < MILLION * 256)
        return;

    *myBufSize = MILLION * 64;
    if (freeMem < MILLION * 512)
        return;

    *myBufSize = MILLION * 128;
    if (freeMem < MILLION * 1024)
        return;

    *myBufSize = MILLION * 256;
    if (freeMem < MILLION * 2048)
        return;

    *myBufSize = MILLION * 512;
}
/*
initialize the manager: seed the random number generator, release all the
memory pools and record the number of CPU/GPU memory pools
*/
void XMemManager::Initialize()
{
    srand((unsigned int)time(NULL));

    Free();

    /* a single pool for CPUs (we actually do not care about how many CPUs are using) */
    nCPUMem = 1;

    /* no GPU pools unless CUDA reports some devices below */
    nGPUMem = 0;

#ifdef USE_CUDA
    const bool isCounted = (cudaGetDeviceCount(&nGPUMem) == cudaSuccess);
    if (!isCounted) {
        XPRINT(0, stderr, "cannot get GPU information.");
        exit(1);
    }
#endif
}
/* free it */
void XMemManager::Free()
{
for (int i = 0; i < MAX_CPU_NUM; i++)
CPUMems[i].Free();
for (int i = 0; i < MAX_GPU_NUM; i++)
GPUMems[i].Free();
}
/*
get the global memory pool of a device; the pool is initialized on its first use
>> devID - device id (< 0: CPU memory, >= 0: GPU device ID)
<< return - the memory pool, or NULL if there is no pool for the device id
*/
XMem * XMemManager::GetMem(const int devID)
{
    const bool isCPU = devID < 0;
    XMem * mem = NULL;

    if (isCPU){
        /* all CPU requests share the first CPU pool */
        mem = CPUMems;
    }
    else if (devID < nGPUMem && devID < (int)(sizeof(GPUMems) / sizeof(GPUMems[0]))){
        /* the id is checked against the capacity of GPUMems as well, because
           the device count reported by CUDA may exceed the array size */
        mem = GPUMems + devID;
    }
    else{
        XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
        return NULL;
    }

    if (!mem->isInitialized){
        /* the buffer size depends on the free memory of the device */
        MTYPE freeMem = isCPU ? GetAvailableMemory() : GetAvailableGPUMemory(devID);
        MTYPE myBufSize = 0;
        GetBufferSize(freeMem, &myBufSize);

        /* the CPU pool is always created with device id -1 */
        mem->Initialize(isCPU ? -1 : devID, FREE_ON_THE_FLY,
                        MIN_BLOCK_SIZE_FOR_MEMPOOL,
                        MIN_BLOCK_NUM_FOR_MEMPOOL,
                        myBufSize);
    }

    return mem;
}
/* get global memory size */
int XMemManager::GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize)
{
XMem * mem = GetMem(devID);
int result = 0;
if (mem != NULL){
*myBlockSize = mem->maxBlockSize;
*myBlockNum = mem->blockNum;
*myBufSize = mem->bufSize;
result = 1;
}
return result;
}
/* show memory information */
void XMemManager::ShowMemInfo()
{
XPRINT(1, stderr, "Memory Information:\n");
MTYPE myBlockSize, myBufSize;
int myBlockNum;
for(int i = 0; i < nCPUMem; i++){
GetMemSize(-1, &myBlockSize, &myBlockNum, &myBufSize);
XPRINT3(1, stderr, " - id:-1 CPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", myBlockSize, myBlockNum, myBufSize);
}
for(int i = 0; i < nGPUMem; i++){
GetMemSize(i, &myBlockSize, &myBlockNum, &myBufSize);
XPRINT4(1, stderr, " - id:%2d GPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", i, myBlockSize, myBlockNum, myBufSize);
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -39,6 +39,15 @@
#include <curand.h>
#endif
/* system headers for the memory size query in XMemManager::GetAvailableMemory.
   _WIN32 is tested in addition to WIN32 so that the selection here agrees with
   the "#elif _WIN32" branch of that function */
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#elif defined(_WIN32) || defined(WIN32)
#include <windows.h>
#else
#include <unistd.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
......@@ -53,6 +62,8 @@ typedef long long INT_64;
#define BUF_PITCH 256
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
#define MAX_CPU_NUM 16
#define MAX_GPU_NUM 16
/*
mode of running a memory pool
......@@ -202,6 +213,9 @@ public:
MTYPE curUsedPin;
MTYPE bufUsedPin;
/* indicates whether the memory pool is initialized */
bool isInitialized;
#ifdef USE_CUDA
/* handle used for cublas */
cublasHandle_t cublasHandle;
......@@ -413,6 +427,61 @@ public:
};
/*
a class for the management of memory.
It keeps one array of memory pools for CPUs and one for GPUs. A pool is
initialized when it is requested via GetMem for the first time.
*/
class XMemManager
{
private:
/* cpu memory pools (GetMem only hands out the first one) */
XMem CPUMems[MAX_CPU_NUM];
/* number of cpu memory pools (set to 1 in Initialize) */
int nCPUMem;
/* gpu memory pools, indexed by the device id */
XMem GPUMems[MAX_GPU_NUM];
/* number of gpu memory pools (the device count obtained from CUDA in Initialize,
   0 if the code is compiled without USE_CUDA) */
int nGPUMem;
public:
/* constructor (it calls Initialize) */
XMemManager();
/* de-constructor */
~XMemManager();
/* get the size of the host memory that can be used */
MTYPE GetAvailableMemory();
/* get the size of the free memory on GPU devID */
MTYPE GetAvailableGPUMemory(int devID);
/* choose the buffer size (written to myBufSize) according to the free memory size */
void GetBufferSize(MTYPE freeMem, MTYPE * myBufSize);
/* initialize it and set the global memory information */
void Initialize();
/* free all the cpu and gpu memory pools */
void Free();
/* get the global memory pool of a device (devID < 0: CPU, devID >= 0: GPU);
   NULL is returned if the device id is not valid */
XMem * GetMem(const int devID);
/* get the block size, block number and buffer size of the global memory pool
   of a device; return 1 on success and 0 if the pool is not found */
int GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize);
/* show the size information of all the memory pools */
void ShowMemInfo();
};
/* managing the memories */
extern XMemManager GMems;
extern XMem * GMem;
extern int testxmemid;
......
......@@ -108,18 +108,10 @@ const char * GetOPName(int type)
else if (type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE";
}
else if ((type & DATA_BASE) != 0) {
if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE";
else if (type == GETANDSET_INDEXTOONEHOT)
return "G_INDEXTOONEHOT";
else if (type == GETANDSET_ONEHOTTOINDEX)
return "G_ONEHOTTOINDEX";
else if (type == GETANDSET_SELECT)
return "G_SELECT";
}
else if ((type & SHAPE_BASE) != 0) {
if (type == MOVEMENT_COPYINDEXED)
else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT)
return "G_SELECT";
else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
......
......@@ -79,13 +79,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* data and shape related operations */
#define DATA_BASE MATH_BASE * 2
#define GETANDSET DATA_BASE + 1
#define GETANDSET_CONVERTDATATYPE GETANDSET + 1
#define GETANDSET_INDEXTOONEHOT GETANDSET_CONVERTDATATYPE + 1
#define GETANDSET_ONEHOTTOINDEX GETANDSET_INDEXTOONEHOT + 1
#define GETANDSET_SELECT GETANDSET_ONEHOTTOINDEX + 1
#define GETANDSET_SELECT GETANDSET + 1
#define SHAPE_BASE DATA_BASE * 2
#define MOVEMENT SHAPE_BASE + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
......@@ -108,7 +104,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define SORT_TOPK SORT_SORT + 1
/* activation functions */
#define FUNCTION_BASE SHAPE_BASE * 2
#define FUNCTION_BASE DATA_BASE * 2
#define FUNC_DROPOUT FUNCTION_BASE + 1
#define FUNC_HARDTANH FUNC_DROPOUT + 1
#define FUNC_IDENTITY FUNC_HARDTANH + 1
......
......@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round
*/
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
{
if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n");
......@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */
volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */
XThread * thread = threads + availableThreads[i];
......
......@@ -106,7 +106,7 @@ public:
void KillThreads();
/* run a set of jobs in parallel */
void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */
int GetJobNum(int size);
......
......@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode()
{
job = NULL;
args = new XList(1);
args = new TensorList(1);
}
/* de-constructor */
......@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize)
head = 0;
tail = 0;
isJobQueue = false;
jobDequeuerArgs = new XList(1);
jobDequeuerArgs = new TensorList(1);
jobDequeuerBreak = false;
runningJobCount = 0;
jobStream = NULL;
......@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true;
jobDequeuerArgs->Clear();
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
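// warning: the two items added below are NOT tensors. The queue itself and a pointer to a device id are cast to XTensor* only to fit into a TensorList, so they must be cast back to their real types before being used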
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs;
......@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer()
}
/* add a job item to process */
void XQueue::EnqueueJob(void * job, XList * jobArgs)
void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
{
MUTEX_LOCK(jobQueueMutex);
runningJobCount++;
......@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs)
}
/* job item consumer */
void XQueue::DequeueJobs(XList * args)
void XQueue::DequeueJobs(TensorList * args)
{
CheckNTErrors((args->count == 2), "Illegal arguments!");
......
......@@ -52,7 +52,7 @@ public:
void * job;
/* arguments of the job */
XList * args;
TensorList * args;
public:
/* constructor */
......@@ -102,7 +102,7 @@ private:
XThread jobDequeuer;
/* argument list of jobDequeuer */
XList * jobDequeuerArgs;
TensorList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */
bool jobDequeuerBreak;
......@@ -141,11 +141,11 @@ public:
void StopJobConsumer();
/* add a job item to process */
void EnqueueJob(void * job, XList * jobArgs);
void EnqueueJob(void * job, TensorList * jobArgs);
/* job item consumer */
static
void DequeueJobs(XList * args);
void DequeueJobs(TensorList * args);
/* get the break flag */
bool GetJobBreak();
......
......@@ -52,6 +52,7 @@ struct XLink;
#define MIN_TENSOR_MERGE_NUM 0
#define MIN_TENSOR_MERGE_LIST_NUM 1024
#define MIN_TENSOR_CAT_NUM 8
#define MAX_TENSOR_NAME_SIZE 32
/* computation flags */
#define UNSAFE_BUT_FAST_MEM
......@@ -61,6 +62,9 @@ struct XLink;
struct XTensor
{
public:
/* name */
char name[MAX_TENSOR_NAME_SIZE];
/* id */
int id;
......@@ -190,13 +194,14 @@ public:
XTensor(const XTensor &reference);
/* copy constructor (with right value reference) */
#ifdef USE_CPP11
XTensor(const XTensor &&reference);
#endif
/* de-constructor */
~XTensor();
/* set the name of the tensor */
void SetName(const char * myName);
/* initialize member variables */
void Init();
......@@ -210,9 +215,7 @@ public:
XTensor& operator= (const XTensor &tensor);
/* overloading of the equal-sign (with right value reference) */
#ifdef USE_CPP11
XTensor& operator= (const XTensor &&tensor);
#endif
/* overloading of the plus-sign */
XTensor operator+ (const XTensor &tensor) const;
......@@ -241,6 +244,9 @@ public:
/* linear transformation */
XTensor Lin(DTYPE scale, DTYPE shift = 0) const;
/* relocate the data on the target device */
void SetDevice(int myDevId, XMem * myMem = NULL);
/* judge whether the two matrices are in the same type and size */
static
bool IsSameShaped(const XTensor * a, const XTensor * b);
......@@ -268,6 +274,9 @@ public:
/* reshape the tensor to a matrix */
void Reshape(const int rowNum, const int colNum);
/* reshape the tensor by merging two consecutive dimensions */
void ReshapeMerged(const int i, const int j = -1);
/* get the number of items in the data array */
int GetSize() const;
......@@ -315,6 +324,9 @@ public:
/* get the value of a cell with the index */
DTYPE Get(int index[], int size = -1);
/* get the value of a cell with the offset */
DTYPE Get(int offset);
/* get the pointer to a cell */
void * GetCell(int index[], int size = -1) const;
......@@ -328,6 +340,9 @@ public:
/* get the default type value of a cell in a 3d tensor */
DTYPE Get3D(int d0, int d1, int d2);
/* get the int value of a cell by its offset */
int GetInt(int offset);
/* get the int value of a cell in a 1d tensor */
int Get1DInt(int i);
......@@ -346,6 +361,9 @@ public:
/* set the value of a cell */
bool Set(DTYPE value, int index[], int size = -1);
/* set the value of a cell with its offset in the array */
bool Set(DTYPE value, int offset);
/* set the value of a cell in a 1d tensor */
bool Set1D(DTYPE value, int i);
......@@ -355,6 +373,9 @@ public:
/* set the value of a cell in a 3d tensor */
bool Set3D(DTYPE value, int d0, int d1, int d2);
/* set the integer value of a cell by its offset */
bool SetInt(int value, int offset);
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
......@@ -429,66 +450,134 @@ extern int MakeTensorID();
void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
/* initialize a tensor with a reference tensor V2 */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
/* initialize a tensor on the CPU with a reference tensor */
void InitTensorOnCPU(XTensor * tensor, const XTensor * reference);
/* generate a XTensor with no initialization */
XTensor * NewTensor();
/* generate a XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
......
......@@ -85,7 +85,7 @@ namespace nts{
#endif
typedef void (*TFunction) (volatile XList*);
typedef void (*TFunction) (volatile TensorList*);
/*
This is a class that wraps the standard implementation of threading
......@@ -133,7 +133,7 @@ public:
/* arguments (for the function to run) */
volatile
XList * argv;
TensorList * argv;
/* a flag to break */
volatile
......
......@@ -97,4 +97,5 @@
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#include "../function/DropoutWithIndex.h"
#endif // __CHEADER_H__
......@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
......@@ -41,12 +42,15 @@ where i is the index of the item
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaDiv(a, b, c, alpha, leadingDim);
......
......@@ -19,10 +19,12 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include <math.h>
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -42,6 +44,8 @@ i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
......@@ -50,6 +54,8 @@ void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alp
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
......@@ -151,6 +157,8 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
......
......@@ -17,7 +17,6 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
*/
#include "DivDim.cuh"
......@@ -169,34 +168,6 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
half alpha1 = __float2half(alpha);
if (stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
else
KernelDivWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
}
else if (stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
else
KernelDivWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
......
......@@ -16,10 +16,10 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_H__
#define __MASK_H__
......@@ -28,7 +28,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
......@@ -36,7 +36,7 @@ where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
/*
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
......@@ -44,7 +44,7 @@ where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
/*
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
......@@ -55,4 +55,3 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
......@@ -54,6 +54,8 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors(a && b && c, "Empty input tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Input tensors should have the same data type!");
CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
"Input tensors must have a order >= 2!");
CheckNTErrors(c->order == a->order + b->order - 2, "wrong tensor order")
......@@ -106,9 +108,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cBlockNum *= b->dimSizeRDI[i];
}
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
TensorList * aList = new TensorList(10);
TensorList * bList = new TensorList(10);
TensorList * cList = new TensorList(10);
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
......
......@@ -38,17 +38,23 @@ argument5: matrix a
argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta)
*/
void _MatrixMul2DMultiTheading(XList * args)
void _MatrixMul2DMultiTheading(TensorList * args)
{
int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1);
int x2 = *(int*)args->GetItem(2);
int y2 = *(int*)args->GetItem(3);
XTensor * a = (XTensor*)args->GetItem(4);
XTensor * b = (XTensor*)args->GetItem(5);
XTensor * c = (XTensor*)args->GetItem(6);
DTYPE alpha = *(DTYPE*)args->GetItem(7);
DTYPE beta = *(DTYPE*)args->GetItem(8);
CheckNTErrors(args->count == 2, "invalid argument number!");
IntList * indexArgs = (IntList*)args->GetItem(0);
TensorList * matrixArgs = (TensorList*)args->GetItem(1);
CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
XTensor * a = matrixArgs->GetItem(0);
XTensor * b = matrixArgs->GetItem(1);
XTensor * c = matrixArgs->GetItem(2);
DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
int x1 = indexArgs->GetItem(0);
int y1 = indexArgs->GetItem(1);
int x2 = indexArgs->GetItem(2);
int y2 = indexArgs->GetItem(3);
#ifdef FAST_MATRIX
int am = a->dimSize[1];
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/
void _MatrixMul2DMultiTheading(XList * args);
void _MatrixMul2DMultiTheading(TensorList * args);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -227,9 +227,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
......
......@@ -58,8 +58,8 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, cons
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for GPU
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA, const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha
......
......@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
......@@ -41,12 +42,15 @@ where i is the index of the item
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMultiply(a, b, c, alpha, leadingDim);
......
......@@ -19,6 +19,7 @@
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include <math.h>
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
......@@ -42,8 +43,10 @@ i.e., a is multiplied with b by broadcasting
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha) {
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
......@@ -52,6 +55,8 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
......@@ -151,6 +156,8 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, 0);
......
......@@ -44,6 +44,8 @@ void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
CheckDev(a->devID, b->devID);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
......
......@@ -19,10 +19,12 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include <math.h>
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -42,6 +44,8 @@ i.e., a is subtracted with b by broadcasting
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
......@@ -50,6 +54,8 @@ void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if (beta == 0) {
_CopyValues(a, c);
return;
......@@ -152,6 +158,8 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
......
......@@ -45,6 +45,8 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
CheckDev(a->devID, b->devID);
if(beta == 0){
_CopyValues(a, c);
return;
......
......@@ -21,6 +21,7 @@
* Add summation by broadcasting.
*/
#include <math.h>
#include "Sum.h"
#include "SumDim.h"
#include "SumDim.cuh"
......@@ -46,6 +47,8 @@ i.e., a is summed with b by broadcasting
*/
void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
......@@ -54,6 +57,8 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(beta == 0){
_CopyValues(a, c);
return;
......@@ -169,6 +174,8 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
......
......@@ -56,8 +56,8 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB, TensorList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif
......
......@@ -20,21 +20,20 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "ConvertDataType.h"
#include "ConvertDataType.cuh"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
convert data type
>> input - input tensor
>> output - output tensor
*/
void _ConvertDataType(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
......@@ -62,35 +61,4 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
ShowNTErrors("Unsupported data types for conversion!");
}
/*
convert the data type of a tensor (return an XTensor structure)
a new tensor is created to keep the result and is handed back to the caller
>> input - the tensor to be converted
>> dataType - the data type requested for the returned tensor
<< return - output tensor with the specified data type
*/
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
{
    /* the input is already of the requested type: return a copy of it */
    if (input.dataType == dataType) {
        XTensor copied;
        copied = CopyValues(input);
        return copied;
    }

    /* 1.0 for a non-sparse input, otherwise the dense ratio of the input itself */
    float ratio = input.isSparse ? input.denseRatio : 1.0F;

    /* the result shares order, dimension sizes, device and memory pool with the input */
    XTensor output(input.order, input.dimSize, dataType, ratio, input.devID, input.mem);
    output.SetTMPFlag();

    _ConvertDataType(&input, &output);

    /* tensor connection */
    XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);

    return output;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -67,49 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size)
if (i < size){
outputData[i] = (float)(inputData[i]);
}
}
/*
cast every float item to __int8 (CUDA kernel)
each thread handles the single item selected by its global thread index
>> inputData - pointer to the input array (float)
>> outputData - pointer to the output array (__int8)
>> size - number of items to convert
*/
__global__
void KernelFloatToInt8(float * inputData, __int8 * outputData, int size)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    /* threads whose index lies beyond the array have nothing to do */
    if (idx >= size)
        return;

    outputData[idx] = static_cast<__int8>(inputData[idx]);
}
/*
cast every __int8 item to float (CUDA kernel)
each thread handles the single item selected by its global thread index
>> inputData - pointer to the input array (__int8)
>> outputData - pointer to the output array (float)
>> size - number of items to convert
*/
__global__
void KernelInt8ToFloat(__int8 * inputData, float * outputData, int size)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    /* threads whose index lies beyond the array have nothing to do */
    if (idx >= size)
        return;

    outputData[idx] = static_cast<float>(inputData[idx]);
}
/*
cast every int item to __int8 (CUDA kernel)
each thread handles the single item selected by its global thread index
>> inputData - pointer to the input array (int)
>> outputData - pointer to the output array (__int8)
>> size - number of items to convert
*/
__global__
void KernelIntToInt8(int * inputData, __int8 * outputData, int size)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    /* threads whose index lies beyond the array have nothing to do */
    if (idx >= size)
        return;

    outputData[idx] = static_cast<__int8>(inputData[idx]);
}
/*
cast every __int8 item to int (CUDA kernel)
each thread handles the single item selected by its global thread index
>> inputData - pointer to the input array (__int8)
>> outputData - pointer to the output array (int)
>> size - number of items to convert
*/
__global__
void KernelInt8ToInt(__int8 * inputData, int * outputData, int size)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    /* threads whose index lies beyond the array have nothing to do */
    if (idx >= size)
        return;

    outputData[idx] = static_cast<int>(inputData[idx]);
}
}}
/*
data conversion (cuda code)
......@@ -180,14 +138,6 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_INT8)
KernelFloatToInt8 << <blocks, threads >> >((float*)input->data, (__int8*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_FLOAT)
KernelInt8ToFloat << <blocks, threads >> >((__int8*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_INT && output->dataType == X_INT8)
KernelIntToInt8 << <blocks, threads >> >((int*)input->data, (__int8*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_INT)
KernelInt8ToInt << <blocks, threads >> >((__int8*)input->data, (int*)output->data, input->unitNum);
else{
ShowNTErrors("Unsupported data types for conversion!");
}
......
......@@ -23,16 +23,12 @@
#define __CONVERTDATATYPE_H__
#include "../../XTensor.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */
void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_H__
......@@ -21,7 +21,6 @@
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
#include "SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -32,65 +31,43 @@ convert onehot tensor to index tensor
>> index - index tensor, whose values are integers
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim)
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
dim = (dim < 0 ? onehot->GetDim(-1) : dim);
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(dim < onehot->order, "Illegal speficied dimension!")
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++) {
if (i < dim) {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
}
else {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i + 1), "Illegal tensor order!");
}
}
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, dim);
_CudaOnehotToIndex(onehot, index, size);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int dimSize = 1;
int stride = 1;
for (int i = 0; i < dim; i++)
blockNum *= onehot->GetDim(i);
blockSize = onehot->unitNum / blockNum;
dimSize = onehot->GetDim(dim);
for (int i = dim + 1; i < onehot->order; i++)
stride *= onehot->GetDim(i);
int blockNum = index->unitNum;
int stride = size;
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) {
int * od = onehotData + i * stride;
int record = -1;
for (int j = 0; j < stride; j++) {
int * od = onehotData + i * blockSize + j;
int * index = indexData + i * stride + j;
int record = -1;
for (int j = 0; j < dimSize; j++) {
if (od[j*stride] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
if (od[j] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
*index = record;
}
indexData[i] = record;
}
}
/*
......@@ -122,7 +99,7 @@ convert index tensor to onehot tensor
>> onehot - onehot tensor, whose values are 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
......@@ -134,12 +111,9 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float lab
onehot->SetZeroAll();
#ifdef USE_CUDA
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
//_SetDataFixedFloat(onehot, lowconfidence);
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
return;
......@@ -155,7 +129,7 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float lab
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
DTYPE * od = onehotData + i * stride;
od[id] = confidence;
od[id] = 1;
}
}
......@@ -189,4 +163,4 @@ XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
return onehot;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -110,8 +110,9 @@ void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int
DTYPE * od = onehotData + i * stride;
int id = indexData[i];
//od[id] = confidence;
//od[id] = 2.0;
//onehotData[i * stride + id] = 0.1;
if (offset == id)
od[offset] = confidence;
else{
......@@ -126,7 +127,7 @@ convert index tensor to onehot tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
{
int devID = onehot->devID;
......@@ -154,4 +155,4 @@ void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,18 +27,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim);
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int size);
XTensor OnehotToIndex(XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP = 0.0F);
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP = 0.0F);
XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,6 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
* I'm surprised that I did not write this file till today.
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-22 float16 added
*/
#ifndef __SETDATA_CUH__
......@@ -29,9 +28,22 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p (in int, float, float16, double) */
template<class T>
void _CudaSetDataFixed(XTensor * tensor, T p);
/* generate data items with a fixed value p (in int) */
void _CudaSetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void _CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _CudaSetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p (in float) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p);
/* generate data items with a fixed value p (in int) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
......@@ -43,11 +55,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
......
......@@ -24,19 +24,35 @@
#define __SETDATA_H__
#include "../../XTensor.h"
#include "SetData.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a xavier initialization */
void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
///* generate data items with a fixed value p */
//void _SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value p */
void _SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value p (in default type) */
template<class T>
void _SetDataFixed(XTensor * tensor, T value);
void SetDataFixed(XTensor &tensor, DTYPE p);
/* generate data items with a fixed value p (in integer) */
void SetDataFixedInt(XTensor &tensor, int p);
/* generate data items with a fixed value p (in int) */
void _SetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _SetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
......@@ -48,11 +64,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
......
......@@ -170,14 +170,10 @@ SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, _cudaFuncName, origFunc) \
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
......@@ -187,14 +183,10 @@ void _funcName(const XTensor * a, XTensor * b, int num) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
#define _SIMPLE_BINARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
......@@ -228,34 +220,36 @@ void funcName(const XTensor &a, XTensor &b, float num) \
_funcName(&a, &b, num); \
} \
_SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION_INT(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
SIMPLE_BINARY_FUNCTION_ME(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
SIMPLE_BINARY_FUNCTION_ME(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(_Shift, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
SIMPLE_BINARY_FUNCTION_ME(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(Mod, _Mod)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#endif
} // namespace nts(NiuTrans.Tensor)
......@@ -104,7 +104,7 @@ void _Cuda##funcName(const XTensor * a, XTensor * b, int num) \
((int*)a->data, (int*)b->data, a->unitNum, num); \
} \
else { \
ShowNTErrors("TODOhaha!"); \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
......
......@@ -17,7 +17,6 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int/int8 added
*/
#include "../../XDevice.h"
......@@ -36,20 +35,34 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size)
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (a[i] > upper)
b[i] = upper;
else if (a[i] < lower)
b[i] = lower;
else
b[i] = a[i];
}
}
if (i < size) {
if (a[i] > upper)
b[i] = upper;
else if (a[i] < lower)
b[i] = lower;
else
b[i] = a[i];
}
/*
clip kernel overload for float16 data (CUDA Kernel)
NOTE: this overload is a placeholder. Its body returns immediately, so no
entry of a is read and no entry of b is written; clipping of float16 data
is not carried out by this kernel.
>> a - pointer to input data array (not accessed by the current body)
>> b - pointer to output data array (not accessed by the current body)
>> lower - the lower border (not used by the current body)
>> upper - the upper border (not used by the current body)
>> size - size of the data array (not used by the current body)
*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
{
/* placeholder: nothing is computed for float16 here */
return;
}
/*
......@@ -75,27 +88,12 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
half lower1 = __float2half(lower);
half upper1 = __float2half(upper);
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_INT8) {
__int8 lower1 = (__int8)lower;
__int8 upper1 = (__int8)upper;
KernelClip << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, lower1, upper1, a->unitNum);
}
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
}
else {
ShowNTErrors("TODO!");
}
......
......@@ -29,9 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size);
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value */
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
......
......@@ -28,17 +28,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compare whether every entry is equal to the specified value (cuda kernel) */
__global__
void KernelEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is equal to the specified value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda kernel) */
__global__
void KernelNotEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is not equal to the specified value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE number);
/* check whether every entry is equal to the given value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE value);
/* check whether every entry is not equal to the given value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE value);
#endif // USE_CUDA
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论