Commit b3a76184 by xuchen

big change! 1. modify all interface 2. modify the test case 3. merge with latest code of xiao

parent 2ed5a029
......@@ -21,12 +21,16 @@
#include <stdio.h>
#include "XNet.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../sample/fnnlm/FNNLM.h"
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace samplefnnlm;
int main( int argc, const char ** argv )
......@@ -34,15 +38,43 @@ int main( int argc, const char ** argv )
if(argc > 1 && !strcmp(argv[1], "-test"))
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
XNet net;
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 2, 2);
InitTensor2D(&b, 2, 4);
InitTensor2D(&c, 2, 4);
SetDataFixed(a, 0.1F);
a.Set2D(0.3F, 1, 0);
a.Set2D(0.4F, 1, 1);
b = Merge(a, a, 1);
c = HTanH(MMul(a, b));
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
c.Dump(stderr, "c:");
XLink::ShowNetwork(stderr, &c);
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* backward computation for activation function
* $Created by: XIAO Tong ( 2018-07-18
* Dingdang won 5 games in the GO training yesterday, hahaha ...
#include "XNoder.h"
#include "XBackwardFunc.h"
#include "../tensor/XName.h"
#include "../tensor/function/FHeader.h"
namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node)
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(node->grad != NULL, "No gradient found!");
CheckNTErrors(income.tailNum == 1, "Too many input tensors for the function!");
XTensor * input = income.tails[0];
XTensor * output = node;
_HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
ShowNTErrors("Wrong activation function type!");
/* indicates whether the node is for an activation function */
bool XFuncGrad::IsFunc(XTensor * node)
XLink &income = node->income;
return (income.typeID & FUNCTION_BASE) != 0;
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* backward computation for activation function
* $Created by: XIAO Tong ( 2018-07-18
* Dingdang won 5 games in the GO training yesterday, hahaha ...
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
namespace nts{
/* this class computes the gradient for activation functions given a node */
class XFuncGrad
/* compute dE/dx of a node */
void MakeGrad(XTensor * node);
/* indicates whether the node is for an activation function */
bool IsFunc(XTensor * node);
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: XIAO Tong ( 2018-07-17
#include "XBackwardLoss.h"
#include "../tensor/XName.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/LogSoftmax.h"
namespace nts{
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> funcID - id of the function f
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int funcID, void * params,
CheckNTErrors(gold && y && x, "Empty input tensors!");
CheckNTErrors(dedx, "Empty gradient tensors!");
CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
ShowNTErrors("wrong function found when call the backward process!");
compute dE/dy for variable y and error(loss) function E
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
_LossBackward(dedy, gold, y, lossName);
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: XIAO Tong ( 2018-07-17
* My students worked all night to prepare a submission to CWMT. Good luck
* to them!
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
namespace nts{
/* this class computes the gradient (of a output node)
with respect to the loss */
class XLossGrad
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int funcID, void * params,
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* backward computation for math operations
* $Created by: XIAO Tong ( 2018-07-18
#include "XNoder.h"
#include "XBackwardMath.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"
namespace nts{
/* compute dE/dx of a node */
void XMathGrad::MakeGrad(XTensor * node)
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == MATH_SUM)
else if(operID == MATH_MULTIPLY)
else if(operID == MATH_MATRIXMUL)
/* indicates whether the node is for a math operation */
bool XMathGrad::IsMathOP(XTensor * node)
XLink &income = node->income;
return (income.typeID & MATH_BASE) != 0;
gradient for sum
c = a + b * \beta
we have
dE/da = dE/dc
dE/db = dE/dc * \beta
>> node - the node (c) for backward computation
void XMathGrad::GradSum(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, beta);
gradient for multiply (dot production)
c = a * b
we have
dE/da = dE/dc * b
dE/db = dE/dc * a
>> node - the node (c) for backward computation
void XMathGrad::GradMultiply(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
CheckNTErrors(XTensor::IsIdentical(a, b), "Wrong sized input tensors!");
_Multiply(node->grad, b, a->grad, 1.0F);
_Multiply(node->grad, a, b->grad, 1.0F);
gradient for matrix multiply
for c = matmul(a, b) * \alpha
we have
dE/da = dE/dc * b^T * \alpha
dE/db = a^T * dE/dc * \alpha
>> node - the node (c) for backward computation
void XMathGrad::GradMatrixMul(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MATRIXMUL!");
CheckNTErrors(income.paramNum == 3, "Wrong parameter number for MATRIXMUL!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
/* parameters 0 and 1 are the transposition flags of a and b,
   parameter 2 is the scalar \alpha */
MATRIX_TRANS_TYPE transA = income.GetParamTrans(0);
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2);
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
/* NOTE(review): every call below passes 1.0F as the last argument, presumably
   so that the product is accumulated into the existing gradient - confirm
   against _MatrixMul. Each result must have the shape of the tensor whose
   gradient it is, which fixes the operand order in the transposed cases. */
/* c = a * b * \alpha */
if(transA == X_NOTRANS && transB == X_NOTRANS){
/* dE/da = dE/dc * b^T * \alpha */
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
/* c = a^T * b * \alpha */
else if(transA == X_TRANS && transB == X_NOTRANS){
/* dE/da = (dE/dc * b^T)^T * \alpha = b * dE/dc^T * \alpha */
_MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */
_MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
/* c = a * b^T * \alpha */
else if(transA == X_NOTRANS && transB == X_TRANS){
/* dE/da = dE/dc * b * \alpha */
_MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = (a^T * dE/dc)^T * \alpha = dE/dc^T * a * \alpha */
_MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
/* c = a^T * b^T * \alpha */
else if(transA == X_TRANS && transB == X_TRANS){
/* dE/da = (dE/dc * b)^T * \alpha = b^T * dE/dc^T * \alpha */
_MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = (a * dE/dc)^T * \alpha = dE/dc^T * a^T * \alpha */
_MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* backward computation for math operations
* $Created by: XIAO Tong ( 2018-07-18
#include "../tensor/XTensor.h"
namespace nts{
/* this class computes the gradient for math operations given a node */
class XMathGrad
/* compute dE/dx of a node */
void MakeGrad(XTensor * node);
/* indicates whether the node is for a math operation */
bool IsMathOP(XTensor * node);
/* gradient for sum: c = a + b * \beta */
void GradSum(XTensor * node);
/* gradient for multiply (dot production): c = a * b */
void GradMultiply(XTensor * node);
/* gradient for matrix multiply: c = matmul(a, b) */
void GradMatrixMul(XTensor * node);
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* backward computation for shaping and data movement
* $Created by: XIAO Tong ( 2018-07-19
* It was chilly when I came into the office this morning ...
* because i forgot to turn the air-condition off last night :(
#include "XNoder.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"
namespace nts{
/* compute dE/dx of a node */
void XShapeGrad::MakeGrad(XTensor * node)
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == SHAPE_MERGE)
else if(operID == SHAPE_MERGE_LIST)
else if(operID == SHAPE_UNSQUEEZE)
/* indicates whether the node is for a shaping operation */
bool XShapeGrad::IsShapeOP(XTensor * node)
XLink &income = node->income;
/* the node qualifies when the id of the operation that produced it has a bit
   in common with DATA_BASE.
   NOTE(review): the operations dispatched in this file are named SHAPE_*;
   confirm that DATA_BASE is really the base id of that family */
return (income.typeID & DATA_BASE) != 0;
gradient for merge
c = merge(a_0, a_1, ...)
where a_i is the i-th block in a tensor a
we have
dE/da_0 = dE/dc_{split_0}
dE/da_1 = dE/dc_{split_1}
dE/da = split(dE/dc)
>> node - the node (c) for backward computation
void XShapeGrad::GradMerge(XTensor * node)
XLink &income = node->income;
/* the single input is read as tails[0] right below, so exactly one input
   tensor is required (the former "== 0" test rejected every such node) */
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
XTensor * input = income.tails[0];
int whereToMerge = income.GetParamInt(0);
int leadDim = income.GetParamInt(1);
int blockSize = 1;
int blockNum = 1;
/* the dimensions before leadDim enumerate independent blocks;
   blockSize is the size of one block as reported by GetDataSizeInChar() */
for(int i = 0; i < input->order; i++){
if(i < leadDim)
blockNum *= input->dimSize[i];
blockSize = input->GetDataSizeInChar() / blockNum;
/* dims keeps the sizes of the dimensions from leadDim on */
int * dims = new int[input->order];
for(int i = 0, j = 0; i < input->order; i++){
if(i >= leadDim){
dims[j++] = input->dimSize[i];
/* NOTE(review): the leading size is negated before the small tensors are
   built - presumably a flag telling the constructor not to allocate data,
   since the data pointers are set by hand below; confirm */
dims[0] = -dims[0];
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
/* NOTE(review): dims[0] is negative here, so this product is negative
   as well - confirm that this is intended */
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims,
node->dataType, node->denseRatio,
node->devID, node->mem);
/* NOTE(review): the left-hand sides of the data-pointer assignments below
   were missing in the text under review; they are restored from the tensors
   passed to the _Split calls that follow - confirm against the original file */
/* we can simply split the gradient tensor
if the input is used in merging only */
if(input->outgo.tailNum == 1){
for(int i = 0; i < blockNum; i++){
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim, input->dimSize[leadDim]);
/* a more complicated case is that the input tensor is used for
other operations somewhere else. So we have to do gradient
accumulation after splitting, i.e., we need an additional
SUM operation */
XTensor gradInputSmallBuf(&gradInputSmall);
for(int i = 0; i < blockNum; i++){
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim, input->dimSize[leadDim]);
_Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
}
/* detach the borrowed buffers: the small tensors only pointed into
   node->grad and input->grad */
gradNodeSmall.data = NULL;
gradInputSmall.data = NULL;
delete[] dims;
gradient for merging a list of tensors
c = merge(list(a, b, ...))
where a, b ... are of the same size
we have
dE/da = dE/dc_{split_0}
dE/db = dE/dc_{split_1}
list(dE/da, dE/db, ...) = split(dE/dc)
>> node - the node (c) for backward computation
void XShapeGrad::GradMergeList(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
XTensor * last = NULL;
XList smalls(income.tailNum);
XList smallsGrad(income.tailNum);
bool mergeOnly = true;
for(int i = 0; i < income.tailNum; i++){
XTensor * tail = income.tails[i];
/* "last" is NULL only for i == 0, so every later tensor is compared with
   its predecessor (the former "i > 1" skipped the check between the first
   and the second tensor) */
if(i > 0){
CheckNTErrors(XTensor::IsIdentical(last, tail),
"Input tensors must be of the same size!");
/* an input that feeds more than one operation needs gradient accumulation */
if(tail->outgo.tailNum > 1)
mergeOnly = false;
last = tail;
int whereToMerge = income.GetParamInt(0);
/* we can simply split the gradient tensor into the input tensors
if the inputs are used in merging only */
_Split(node->grad, &smallsGrad, whereToMerge, smalls.count);
/* a more complicated case is that the input tensors are used for
other operations somewhere else. So we have to do gradient
accumulation after splitting, i.e., we need an additional
SUM operation */
/* gradSplit has one extra leading dimension that indexes the splits */
int * dims = new int[last->order + 1];
dims[0] = smalls.count;
for(int i = 0; i < last->order; i++)
dims[i + 1] = last->dimSize[i];
XTensor gradSplit(last->order + 1, dims,
last->dataType, last->denseRatio,
last->devID, last->mem);
_Split(node->grad, &gradSplit, whereToMerge, smalls.count);
memcpy(dims, last->dimSize, sizeof(int) * last->order);
/* NOTE(review): negative leading size - presumably tells the constructor
   not to allocate data, since the data pointer is set by hand below; confirm */
dims[0] = -dims[0];
XTensor gradSmall(last->order, dims,
last->dataType, last->denseRatio,
last->devID, last->mem);
/* gradient accumulation for each split */
/* NOTE(review): the "gradSmall.data" / "gradSplit.data" operands below were
   missing in the text under review and are restored from context (gradSmall
   is the only data-less tensor, gradSplit the only split buffer) - confirm
   against the original file */
for(int i = 0; i < smalls.count; i++){
XTensor * inputGrad = (XTensor*)smallsGrad.Get(i);
gradSmall.data = (char*)gradSplit.data + i * last->unitNum * last->unitSize;
_Sum(inputGrad, &gradSmall, inputGrad);
}
/* detach the borrowed buffer: gradSmall only pointed into gradSplit */
gradSmall.data = NULL;
delete[] dims;
gradient for unsqueezing a tensor
c = unsqueeze(a)
we have
dE/da = reducesum(dE/dc)
>> node - the node (c) for backward computation
void XShapeGrad::GradUnsqueeze(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for UNSQUEEZE!");
XTensor * output = node;
XTensor * input = income.tails[0];
/* parameter 0 is the dimension that is summed over below,
   parameter 1 is the size the output must have along it */
int dim = income.GetParamInt(0);
int dSize = income.GetParamInt(1);
CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
/* compare with "==": the former "=" overwrote output->unitNum and let the
   check pass for any non-zero size */
CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
/* dE/da is dE/dc summed along dim */
_ReduceSum(output->grad, input->grad, dim);
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* backward computation for shaping and data movement
* $Created by: XIAO Tong ( 2018-07-18
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
namespace nts{
/* this class computes the gradient for tensor shaping and movement given a node */
class XShapeGrad
/* compute dE/dx of a node */
void MakeGrad(XTensor * node);
/* indicates whether the node is for a shaping operation */
bool IsShapeOP(XTensor * node);
/* gradient for merge: c = merge(a, b, ...) */
void GradMerge(XTensor * node);
/* gradient for merging a list of tensors : c = merge(list(a, b, ...)) */
void GradMergeList(XTensor * node);
/* gradient for unsqueezing a tensor : c = unsqueeze(a) */
void GradUnsqueeze(XTensor * node);
\ No newline at end of file
......@@ -20,6 +20,12 @@
#include "XNet.h"
#include "XNoder.h"
#include "XBackwardLoss.h"
#include "XBackwardMath.h"
#include "XBackwardFunc.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
namespace nts{
......@@ -78,6 +84,22 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
backward propagation to obtain gradient
>> root - root node (output) of the network
>> loss - name of loss function
void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
XList roots(1);
XList golds(1);
Backward(roots, golds, loss);
backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> root - a list of root nodes (output) of the network
......@@ -87,6 +109,85 @@ with a number of root nodes
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
XLossGrad lossGrad;
/* we start with the gradient with respect to the loss for output layers */
for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;
/* we compute dE/dx if the output is generated by an activation function y = f(x).
Note that we do not need to obtain dE/dy here because it is of no use in the
following process of back-propagation */
if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
XTensor * x = income.tails[0];
lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
root->visitMark = NODE_FINISHED;
/* we compute dE/dy (y is the output) if no predefined activation function is used */
lossGrad.Compute(gold, root, root->grad, loss);
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i);
if(node->visitMark == NODE_FINISHED)
backward propagation to obtain gradient
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> loss - name of loss function
void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
XList golds(roots.count);
for(int i = 0; i < roots.count; i++)
Backward(roots, golds, loss);
backward computation for a given node
>> node - the node keeps the result of an operation (e.g., activation function)
void XNet::BackwardNode(XTensor * node)
if(node == NULL || node->visitMark == NODE_FINISHED)
else if(XFuncGrad::IsFunc(node))
else if(XShapeGrad::IsShapeOP(node))
ShowNTErrors("Wrong node type!");
node->visitMark = NODE_FINISHED;
......@@ -115,6 +216,15 @@ void XNet::Traverse(XList &roots)
for (int i = 0; i < roots.count; i++)
TarjanVisit((XTensor*)roots.Get(i), nodes, id);
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
......@@ -145,4 +255,22 @@ void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
dump network information
>> file - the file for dumping
void XNet::Dump(FILE * file)
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
fprintf(file, "node %d: %d\n", i, node->id);
node->Dump(file, "tensor: ");
if(node->grad != NULL)
node->grad->Dump(file, "grad: ");
fprintf(file, "no gradient!\n");
fprintf(file, "\n");
\ No newline at end of file
......@@ -57,11 +57,21 @@ struct XNet
void Clear();
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold = NULLTensor, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds = NULLList, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node);
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
......@@ -73,6 +83,9 @@ struct XNet
/* depth-first search given a node (Tarjan's algorithm for topological ordering) */
void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
/* dump network information */
void Dump(FILE * file);
/* we make a unique id for every tensor */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: XIAO Tong ( 2018-07-18
#include "XNoder.h"
namespace nts{
/* make gradient tensor for a node */
void XNoder::MakeGrad(XTensor * node)
if(node == NULL)
if(!XTensor::IsIdentical(node, node->grad)){
delete node->grad;
node->grad = NewTensor(node);
/* the node is a leaf node (input) or not */
bool XNoder::IsLeaf(XTensor * node)
if(node == NULL)
return false;
if(node->income.tailNum == 0)
return true;
return false;
/* the node is a root node (output) or not */
bool XNoder::IsRoot(XTensor * node)
if(node == NULL)
return false;
if(node->outgo.tailNum == 0)
return true;
return false;
/* the node keeps the gradient or not */
bool XNoder::IsGrad(XTensor * node)
if(node == NULL)
return false;
return true;
return false;
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* low-level utilities
* $Created by: XIAO Tong ( 2018-07-18
#include "../tensor/XTensor.h"
#ifndef __XNODER_H__
#define __XNODER_H__
namespace nts{
#define NODE_DOING 1
/* node management */
class XNoder
/* make gradient tensor for a node */
void MakeGrad(XTensor * node);
/* the node is a leaf node (input) or not */
bool IsLeaf(XTensor * node);
/* the node is a root node (output) or not */
bool IsRoot(XTensor * node);
/* the node keeps the gradient or not */
bool IsGrad(XTensor * node);
\ No newline at end of file
......@@ -27,10 +27,11 @@
#include <math.h>
#include "FNNLM.h"
#include "../../XGlobal.h"
#include "../../XUtility.h"
#include "../../XDevice.h"
#include "../../function/FHeader.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/function/FHeader.h"
#include "../../network/XNet.h"
namespace samplefnnlm
......@@ -50,6 +51,7 @@ float minmax = 0.08F; // range [-p,p] for parameter initializati
int sentBatch = 0; // batch size at the sentence level
int wordBatch = 1; // batch size at the word level
bool shuffled = false; // shuffled the training data file or not
bool autoDiff = false; // indicator of automatic differentiation
void LoadArgs(int argc, const char ** argv, FNNModel &model);
void Init(FNNModel &model);
......@@ -59,7 +61,7 @@ void Clear(FNNModel &model);
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model);
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model);
void Train(const char * train, bool isShuffled, FNNModel &model);
void Update(FNNModel &model, FNNModel &grad, float epsilon);
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad);
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs = NULL);
void Dump(const char * fn, FNNModel &model);
void Read(const char * fn, FNNModel &model);
......@@ -71,6 +73,8 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net);
void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, XNet &net);
entry of the program
......@@ -98,6 +102,7 @@ arguments:
-dev D: the id of the device used
-1: CPU, >=0: GPUs
-mempool: use memory pools for memory management
-autodiff: use automatic differentiation for training
where S=string, D=integer and F=float.
All words in the training and test data files
......@@ -182,6 +187,8 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
wordBatch = atoi(argv[i + 1]);
if(!strcmp(argv[i], "-shuffle"))
shuffled = true;
if(!strcmp(argv[i], "-autodiff"))
autoDiff = true;
if(!strcmp(argv[i], "-dev") && i + 1 < argc)
model.devID = atoi(argv[i + 1]);
......@@ -350,6 +357,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
FNNModel grad;
Copy(grad, model);
/* XNet for automatic differentiation */
XNet autoDiffer;
double startT = GetClockSec();
/* iterate for a number of epochs */
......@@ -380,9 +390,6 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* the gold standard */
XTensor gold;
/* prepare an empty network for building the fnn */
FNNNet net;
/* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
......@@ -390,6 +397,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
/* prepare an empty network for building the fnn */
FNNNet net;
/* gradient = 0 */
......@@ -400,7 +411,15 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
/* update model parameters */
Update(model, grad, learningRate);
Update(model, grad, learningRate, false);
/* forward + backward process */
FBInOne(inputs, output, gold, CROSSENTROPY, model, autoDiffer);
/* update model parameters */
Update(model, grad, learningRate, true);
/* get probabilities */
float prob = GetProb(output, gold);
......@@ -442,26 +461,45 @@ update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
void Update(FNNModel &model, FNNModel &grad, float epsilon)
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
XList paraList(10);
XList gradList(10);
for (int i = 0; i < model.hDepth; i++) {
for (int i = 0; i < model.hDepth; i++) {
for (int i = 0; i < model.hDepth; i++) {
for (int i = 0; i < paraList.count; i++) {
XTensor * para = (XTensor*)paraList.GetItem(i);
......@@ -773,7 +811,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* for y = softmax(s), we get dE/ds
where E is the error function (define by loss) */
LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
/* for s = x * w, we get
dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
......@@ -818,7 +856,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* backpropagation through the activation function:
dE/ds = dE/dh * dh/ds */
HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
_HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
/* gradient of the weight: dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
......@@ -864,6 +902,55 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
forward + backward in one procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold,
LOSS_FUNCTION_NAME loss, FNNModel &model, XNet &net)
int batchSize = gold.GetDim(0);
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
XList inputList(n - 1);
for(int i = 0; i < n - 1; i++)
inputList.Add(inputs + i);
/* represent n - 1 words in one tensor */
words = Merge(inputList, 0);
/* word embedding */
embeddingBig = MMul(words, model.embeddingW);
/* input of the first hidden layer */
hidden = Split(embeddingBig, 0, n - 1);
hidden = Merge(hidden, 2, 0);
/* hidden layers */
for(int i = 0; i < depth; i++){
b = Unsqueeze(model.hiddenB[i], 1, batchSize);
hidden = MMul(hidden, model.hiddenW) + b;
b = Unsqueeze(model.outputB, 1, batchSize);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + b, 1);
/* automatic differentiation */
dump the model to the disk space
>> fn - where to keep the model
>> model - the fnn model
......@@ -30,9 +30,9 @@
#ifndef __FNNLM_H__
#define __FNNLM_H__
#include "../../XGlobal.h"
#include "../../XTensor.h"
#include "../../core/CHeader.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
......@@ -28,7 +28,6 @@
#include <time.h>
#include "XTensor.h"
#include "XDevice.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./test/Test.h"
......@@ -36,7 +35,6 @@
//#include <crtdbg.h>
using namespace nts;
using namespace samplefnnlm;
void SmallTest();
......@@ -45,21 +43,17 @@ int main( int argc, const char ** argv )
/* a tiny test */
return 0;
//return 0;
if(argc > 1 && !strcmp(argv[1], "-test"))
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
fprintf(stderr, "use of tensors. All you need is to ... \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
......@@ -25,7 +25,7 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
int XLink::paramSize = 64;
int XLink::paramSize = PARAM_UNTI_SIZE;
/* constructor */
......@@ -114,6 +114,8 @@ void XLink::ClearOutgoing(XTensor * node)
outgo.typeID = 0;
outgo.type[0] = 0;
delete[] (char*)outgo.params;
outgo.params = NULL;
......@@ -152,6 +154,8 @@ void XLink::ClearIncoming(XTensor * node)
income.typeID = 0;
income.type[0] = 0;
delete[] (char*)income.params;
income.params = NULL;
......@@ -210,7 +214,7 @@ add a parameter
void XLink::AddParam(DTYPE param)
void * ps = params;
params = new char[paramNum + 1];
params = new char[(paramNum + 1) * paramSize];
memcpy(params, ps, paramNum * paramSize);
DTYPE * p = (DTYPE*)((char*)params + paramNum * paramSize);
*p = param;
......@@ -226,7 +230,7 @@ add a parameter
void XLink::AddParam(void * param, int size)
void * ps = params;
params = new char[paramNum + 1];
params = new char[(paramNum + 1) * paramSize];
memcpy(params, ps, paramNum * paramSize);
char * p = (char*)params + paramNum * paramSize;
memcpy(p, param, size);
......@@ -235,6 +239,42 @@ void XLink::AddParam(void * param, int size)
get a parameter in default type
>> i - id of the parameter
<< return - the parameter in default type
DTYPE XLink::GetParam(int i)
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(DTYPE*)p;
get a parameter in integer
>> i - id of the parameter
<< return - the parameter in integer
int XLink::GetParamInt(int i)
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(int*)p;
get a parameter in MATRIX_TRANS_TYPE
>> i - id of the parameter
<< return - the parameter in MATRIX_TRANS_TYPE
MATRIX_TRANS_TYPE XLink::GetParamTrans(int i)
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(MATRIX_TRANS_TYPE*)p;
create a hyperedge with two input tensors and a output tensor
>> t1 - a tail tensor
>> t2 - another tail tensor
......@@ -288,14 +328,44 @@ void XLink::MakeLink(const XList * list, XTensor * h, int id)
create a hyper edge with an input tensor and a list of output tensors
>> t - an input tensor
>> list - a list of output tensors
>> id - id of the edge type
void XLink::MakeLink(XTensor * t, XList * list, int id)
/* forward */
for(int i = 0; i < list->count; i++){
XTensor * h = (XTensor*)list->GetItem(i);
if(h == NULL)
XLink &income = h->income;
/* backward */
XLink &outgo = t->outgo;
CheckNTErrors(outgo.head == NULL || outgo.head == t, "Wrong head of the hyperedge!");
for(int i = 0; i < list->count; i++){
XTensor * t = (XTensor*)list->GetItem(i);
if(t == NULL)
add parameters
>> h - head
>> param - parameter we want to introduce
void XLink::AddParamToHead(XTensor * h, DTYPE param)
if(h != NULL)
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
......@@ -306,8 +376,7 @@ add an integer parameter
void XLink::AddParamToHeadInt(XTensor * h, int param)
if(h != NULL)
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(int));
......@@ -318,8 +387,7 @@ add a MATRIX_TRANS_TYPE parameter
void XLink::AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param)
if(h != NULL)
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(MATRIX_TRANS_TYPE));
......@@ -376,6 +444,11 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
newIncome.tailNum = oldOne->income.tailNum;
memcpy(newIncome.tails, oldOne->income.tails, sizeof(XTensor*) * newIncome.tailNum);
int paraArraySize = oldOne->income.paramNum * oldOne->income.paramSize;
newIncome.params = new char[paraArraySize];
memcpy(newIncome.params, oldOne->income.params, paraArraySize);
newIncome.paramNum = oldOne->income.paramNum;
/* update the link to each child node */
for(int i = 0; i < newIncome.tailNum; i++){
XTensor * child = newIncome.tails[i];
......@@ -34,6 +34,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
struct XTensor;
#define PARAM_UNTI_SIZE 64
This defines the link among tensors in networks. XLink can be
......@@ -115,12 +116,21 @@ struct XLink
/* add two tails in one time */
void AddTwoTails(XTensor * t1, XTensor * t2);
/* add a integer parameter */
/* add a parameter in default type */
void AddParam(DTYPE param);
/* add a integer parameter */
/* add a parameter */
void AddParam(void * param, int size);
/* get a parameter in default type */
DTYPE GetParam(int i);
/* get a parameter in integer */
int GetParamInt(int i);
/* get a parameter in MATRIX_TRANS_TYPE */
MATRIX_TRANS_TYPE GetParamTrans(int i);
/* create a hyper edge with two input tensors and a output tensor */
void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
......@@ -129,6 +139,10 @@ struct XLink
void MakeLink(const XList * list, XTensor * h, int id);
/* create a hyper edge with an input tensor and a list of output tensors */
void MakeLink(XTensor * h, XList * list, int id);
/* add a parameter */
void AddParamToHead(XTensor * h, DTYPE param);
......@@ -26,57 +26,81 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get operator name */
const char * GetOPName(int type)
if((type & MATH_ARITHMETIC) != 0){
if(type == MATH_ABSOLUTE)
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if(type == MATH_MATRIXMUL)
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if(type == MATH_MULTIPLY)
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if(type == MATH_NEGATE)
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if(type == MATH_SIGN)
else if (type == MATH_SIGN)
return "M_SIGN";
else if(type == MATH_SUM)
else if (type == MATH_SUM)
return "M_SUM";
else if(type == MATH_LOG)
return "M_NORMALIZE";
else if(type == MATH_NORMALIZE)
else if (type == MATH_LOG)
return "M_LOG";
else if(type == MATH_POWER)
else if (type == MATH_NORMALIZE)
return "M_NORMALIZE";
else if (type == MATH_POWER)
return "M_POWER";
else if(type == MATH_SCALEANDSHIFT)
else if (type == MATH_SCALEANDSHIFT)
else if(type == GETANDSET_SELECT)
return "G_SELECT";
else if(type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if(type == REDUCE_REDUCEMAX)
else if (type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX";
else if(type == REDUCE_REDUCEMEAN)
else if (type == REDUCE_REDUCEMEAN)
return "R_REDUCEMEAN";
else if(type == REDUCE_REDUCESUM)
else if (type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if(type == SHAPE_CONCATENATE)
else if ((type & DATA_BASE) != 0){
return "G_SELECT";
else if (type == MOVEMENT_COPYINDEXED)
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if (type == SHAPE_CONCATENATE)
else if(type == SHAPE_MERGE)
else if (type == SHAPE_MERGE)
return "S_MERGE";
else if(type == SHAPE_PERMUTE)
else if (type == SHAPE_MERGE_LIST)
return "S_MERGE_LIST";
else if (type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if(type == SHAPE_SPLIT)
else if (type == SHAPE_SPLIT)
return "S_SPLIT";
else if(type == SHAPE_TRANSPOSE)
else if (type == SHAPE_SPLIT_LIST)
return "S_SPLIT_LIST";
else if (type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if(type == SHAPE_UNSQUEEZE)
else if (type == SHAPE_UNSQUEEZE)
return "S_UNSQUEEZE";
else if (type == SORT_SORT)
return "S_SORT";
else if (type == SORT_TOPK)
return "S_TOPK";
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
else if (type == FUNC_LOGSOFTMAX)
return "F_LOGSOFTMAX";
else if (type == FUNC_RECTIFY)
return "F_RECTIFY";
else if (type == FUNC_SIGMOID)
return "F_SIGMOID";
else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX";
return "NULL";
......@@ -28,8 +28,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_ARITHMETIC 0x00001000
/* math operations */
#define MATH_BASE 0x00001000
......@@ -42,28 +43,45 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* data and shape related operations */
#define SORT_SORT SORT + 1
/* activation functions */
/* get operator name */
const char * GetOPName(int type);
......@@ -134,8 +134,6 @@ constructor
XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const float myDenseRatio, int myDevID, XMem * myMem)
CheckNTErrors((myOrder > 0), "Illegal tensor order1");
......@@ -144,6 +142,7 @@ XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYP
mem = myMem;
devID = myMem != NULL ? myMem->devID : myDevID;
if(order >= 0)
Resize(myOrder, myDimSize, myDataType, myDenseRatio);
......@@ -211,6 +210,9 @@ XTensor::~XTensor()
if(grad != NULL)
delete grad;
/* initialize member variables */
......@@ -237,7 +239,9 @@ void XTensor::Init()
memset(isAllValued, 0, sizeof(bool) * MAX_TENSOR_DIM_NUM);
isInit = false;
isTmp = false;
isGrad = false;
visitMark = 0;
grad = NULL;
/* delete data arrays */
......@@ -294,7 +298,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
devID = tensor.devID;
mem = tensor.mem;
......@@ -347,6 +351,9 @@ judge whether the two matrices are in the same type and size
bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
if(a == NULL || b == NULL)
return false;
if(a->order != b->order)
return false;
......@@ -1043,7 +1050,7 @@ int XTensor::GetNonzeroSize()
set the tensor as "temporary"
>> myIsTMP - flag
>> myIsTMP - the flag
void XTensor::SetTMP(bool myIsTmp)
......@@ -1051,6 +1058,15 @@ void XTensor::SetTMP(bool myIsTmp)
set the tensor as "keep-gradient"
>> myIsGrad - the flag
void XTensor::SetGrad(bool myIsGrad)
isGrad = myIsGrad;
resize a tensor with a specified tensor size
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
......@@ -1105,7 +1121,7 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
for sparse matrices, we use a list of tuple (key, value),
ordered by key. Take a (2-dimensional) matrice as examples,
ordered by key. Take a (2-dimensional) matrix as an example,
we have key = m * i + j;
The data array is
......@@ -1148,9 +1164,9 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
/* allocate the new one */
if(mem == NULL){
data = (void*)new char[unitNum * unitSize];
data = XMemAlloc(devID, unitNum * unitSize);
#if defined(UNSAFE_BUT_FAST_MEM)
memset(data, 0, unitNum * unitSize);
XMemSet(devID, data, 0, unitNum * unitSize);
......@@ -1982,9 +1998,11 @@ generate a copy of XTensor
XTensor * NewTensor(XTensor * a, bool isFilledData)
CheckNTErrors((a != NULL), "Empty input!");
int * dims = new int[a->order];
if(a->order > 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order);
......@@ -1994,8 +2012,6 @@ XTensor * NewTensor(XTensor * a, bool isFilledData)
a->dataType, a->denseRatio,
a->devID, a->mem);
delete[] dims;
return newTensor;
......@@ -139,9 +139,15 @@ public:
/* indicates whether the tensor is created temporarily */
bool isTmp;
/* indicates whether the tensor keeps the gradient when used as model parameters */
bool isGrad;
/* mark for traversing the graph */
unsigned int visitMark;
/* gradient (for back-propagation) */
XTensor * grad;
the link used to form networks. Note that when we compute on tensors, we actually create a
network where nodes are tensors and edges the connections among them. Each connection is
......@@ -300,6 +306,9 @@ public:
/* set the tensor as "temporary" */
void SetTMP(bool myIsTmp = true);
/* set the tensor as "keep-gradient" */
void SetGrad(bool myIsGrad = true);
/* resize a matrix with a specified matrix size */
bool Resize(const int myOrder, const int * myDimSize,
......@@ -32,9 +32,6 @@
#define USE_PTHREAD // for linux
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
// necessary libs
......@@ -46,6 +43,9 @@ namespace nts{
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#if(defined(_WIN32) && !defined (__CYGWIN__))
#define BEGINTHREAD(src, stack, func, arg, flag, id) \
......@@ -176,12 +176,16 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
#ifdef USE_CUDA
else if(devIDT >= 0 && devIDS < 0){
CheckNTErrors((cudaMemcpy(t, s, size, cudaMemcpyHostToDevice) == cudaSuccess),
"cudaMemcpy error (cudaMemcpyHostToDevice)");
cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyHostToDevice);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyHostToDevice)");
else if(devIDT < 0 && devIDS >= 0){
CheckNTErrors((cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost) == cudaSuccess),
"cudaMemcpy error (cudaMemcpyDeviceToHost)");
cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
//if(devIDT == devIDS){
......@@ -482,8 +486,9 @@ quick sorting
NOTE: this means that the items may not be placed in a contiguous memory space
>> comp - the comparison function
void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
XMemCopy(dataB, -1, dataA, -1, num * width);
char *lo, *hi; // ends of sub-array currently sorting
int *indexlo, *indexhi;
char *mid; // points to middle of subarray
......@@ -502,8 +507,8 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0;
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
lo = (char*)dataB;
hi = (char*)dataB + realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
......@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime);
extern double GetClock();
extern double GetClockSec();
extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern int CompXFloat(const void * a, const void * b);
#ifdef USE_CUDA
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Absolute.h"
#include "Absolute.cuh"
......@@ -28,21 +29,54 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its absolute value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
void _Absolute(XTensor * a)
void _Absolute(const XTensor * a, XTensor * b)
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaAbsolute(a, b);
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)fabs(d[i]);
db[i] = (DTYPE)fabs(d[i]);
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
void _AbsoluteMe(XTensor * a)
_Absolute(a, a);
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the absolute value of input tensor
XTensor Absolute(const XTensor & a)
XTensor b(&a);
/* call _Absolute function */
_Absolute(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_ABSOLUTE);
return b;
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
set each entry to its absolute value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
void KernelAbsolute(DTYPE * d, int size)
void KernelAbsolute(DTYPE * a, DTYPE * b, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = fabs(d[i]);
b[i] = fabs(a[i]);
set each entry to its absolute value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
void KernelAbsolute(__half * d, int size)
void KernelAbsolute(__half * a, __half * b, int size)
set each entry to its with float16 data type value
>> a - the tensor
set each entry to its absolute value
>> a - input tensor
>> b - output tensor
extern "C"
void _CudaAbsolute(XTensor * a)
void _CudaAbsolute(const XTensor * a, XTensor * b)
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -74,10 +78,10 @@ void _CudaAbsolute(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
else if (a->dataType == X_FLOAT16) {
KernelAbsolute << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelAbsolute << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
else {
......@@ -27,15 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its absolute value (CUDA Kernel) */
void KernelAbsolute(DTYPE * d, int size);
void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
void KernelAbsolute(__half * d, int size);
void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
extern "C"
void _CudaAbsolute(XTensor * a);
void _CudaAbsolute(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its absolute value */
extern "C"
void _Absolute(XTensor * a);
void _Absolute(const XTensor * a, XTensor * b);
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
void _AbsoluteMe(XTensor * a);
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
XTensor Absolute(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......@@ -41,8 +41,8 @@ void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
CheckNTErrors((a && b && c), "Empty input lists!");
CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!");
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
if (a->count == 0)
......@@ -28,8 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */
extern "C"
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor)
......@@ -30,7 +30,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
......@@ -66,8 +66,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
CheckNTErrors((am == bn && an == cn && bm == cm),
"Unmatched tensors in multiplication!");
CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
......@@ -80,13 +79,13 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int cBlockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order]), "Incorrect tensor sizes!");
CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
aBlockNum *= a->dimSizeRDI[i];
cBlockNum *= a->dimSizeRDI[i];
for (int i = 2; i < b->order; i++) {
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
bBlockNum *= b->dimSizeRDI[i];
cBlockNum *= b->dimSizeRDI[i];
......@@ -186,7 +185,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
matrix multiplication (return a XTensor structure)
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
......@@ -203,14 +202,13 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, XPRunner * parallelRunner)
CheckNTErrors(&a != &NULLTensor && &b != &NULLTensor, "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
......@@ -224,10 +222,10 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
......@@ -236,14 +234,65 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
return c;
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
......@@ -26,8 +26,10 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MMul MatrixMul
matrix multiplication
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
......@@ -42,7 +44,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
matrix multiplication (return a XTensor structure)
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor c to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
......@@ -55,7 +57,12 @@ C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......@@ -123,8 +123,7 @@ where trans() return the transposed matrix if the flag is fired
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha, DTYPE beta, XStream * stream)
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
......@@ -158,8 +157,11 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasSetStream(*handle, stream->stream);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1], b->dimSize[0], b->dimSize[1], c->dimSize[0], c->dimSize[1],
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1],
b->dimSize[0], b->dimSize[1],
c->dimSize[0], c->dimSize[1],
alpha, beta);
else {
......@@ -32,8 +32,8 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired.
extern "C"
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......@@ -47,8 +47,7 @@ where trans() returns the transposed matrix if the flag is fired
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner)
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
......@@ -156,6 +155,7 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
matrix multiplication of the two tensors (return a XTensor structure)
c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
......@@ -168,14 +168,12 @@ where trans() returns the transposed matrix if the flag is fired.
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
DTYPE alpha, XPRunner * parallelRunner)
CheckNTErrors(&a != &NULLTensor && &b != &NULLTensor, "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
......@@ -190,24 +188,23 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
/*call _MatrixMulBatched function */
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication of the two tensors
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
......@@ -38,7 +38,7 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const X
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
matrix multiplication of the two tensors (return a XTensor structure)
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
......@@ -47,7 +47,7 @@ ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......@@ -142,16 +142,15 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
element-wise product of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i)
c(i) = a(i)*b(i)
where i is the index of the item
>> a - tensor a
>> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
......@@ -159,11 +158,10 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
/* call _Multiply function */
_Multiply(&a, &b, &c, alpha, leadingDim);
_Multiply(&a, &b, &c, 0, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
return c;
......@@ -44,10 +44,10 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim
element-wise product of two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i)
c(i) = a(i)*b(i)
where i is the index of the element
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0, int leadingDim = 0);
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......@@ -20,6 +20,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "Negate.h"
#include "Negate.cuh"
......@@ -27,21 +28,55 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its minus value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
void _Negate(XTensor * a)
void _Negate(const XTensor * a, XTensor * b)
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaNegate(a, b);
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = -d[i];
db[i] = -d[i];
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
void _NegateMe(XTensor * a)
_Negate(a, a);
set every entry to its minus value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the minus value of input tensor
XTensor Negate(const XTensor & a)
XTensor b(&a);
/* call _Negate function */
_Negate(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
return b;
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,45 +29,49 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
set each entry to its negative value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
void KernelNegate(DTYPE * d, int size)
void KernelNegate(DTYPE * a, DTYPE * b, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = -d[i];
b[i] = -a[i];
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
void KernelNegate(__half * d, int size)
void KernelNegate(__half * a, __half * b, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
d[i] = __hsub(__float2half(0), d[i]);
b[i] = __hsub(__float2half(0), a[i]);
if (i < size)
d[i] = __float2half(-__half2float(d[i]));
b[i] = __float2half(-__half2float(a[i]));
set each entry to its negative value
>> a - the tensor
>> a - input tensor
>> b - output tensor
extern "C"
void _CudaNegate(XTensor * a)
void _CudaNegate(const XTensor * a, XTensor * b)
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -82,10 +86,10 @@ void _CudaNegate(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
else if (a->dataType == X_FLOAT16) {
KernelNegate << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
else {
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its negative value (CUDA Kernel) */
void KernelNegate(DTYPE * d, int size);
void KernelNegate(DTYPE * a, DTYPE * b, int size);
/* set each entry to its negative value (CUDA Kernel) with float16 data type */
void KernelNegate(__half * d, int size);
void KernelNegate(__half * a, __half * b, int size);
/* set each entry to its negative value */
extern "C"
void _CudaNegate(XTensor * a);
void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */
extern "C"
void _Negate(XTensor * a);
void _Negate(const XTensor * a, XTensor * b);
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
void _NegateMe(XTensor * a);
set every entry to its minus value (return a XTensor structure)
make a new tensor to keep the result and return it
XTensor Negate(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......@@ -20,6 +20,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "Sign.h"
#include "Sign.cuh"
......@@ -27,27 +28,60 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its sign value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
void _Sign(XTensor * a)
void _Sign(const XTensor * a, XTensor * b)
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaSign(a, b);
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > 0)
d[i] = 1.0F;
db[i] = 1.0F;
else if (d[i] == 0)
d[i] = 0.0F;
db[i] = 0.0F;
d[i] = -1.0F;
db[i] = -1.0F;
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
void _SignMe(XTensor * a)
_Sign(a, a);
set every entry to its sign value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the sign value of the input tensor
XTensor Sign(const XTensor & a)
XTensor b(&a);
/* call _Sign function */
_Sign(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
return b;
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,43 +29,47 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
set each entry to its sign value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
void KernelSign(DTYPE * d, int size)
void KernelSign(DTYPE * a, DTYPE * b, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] > 0)
d[i] = 1.0F;
else if (d[i] == 0)
d[i] = 0.0F;
if (a[i] > 0)
b[i] = 1.0F;
else if (a[i] == 0)
b[i] = 0.0F;
d[i] = -1.0F;
b[i] = -1.0F;
set each entry to its sign value (CUDA Kernel)
set each entry to its sign value with float16 data type value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
void KernelSign(__half * d, int size)
void KernelSign(__half * a, __half * b, int size)
set each entry to its with float16 data type value
>> a - the tensor
set each entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
extern "C"
void _CudaSign(XTensor * a)
void _CudaSign(const XTensor * a, XTensor * b)
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -80,10 +84,10 @@ void _CudaSign(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelSign << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
else if (a->dataType == X_FLOAT16) {
KernelSign << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
else {
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its sign value (CUDA Kernel) */
void KernelSign(DTYPE * d, int size);
void KernelSign(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
void KernelSign(__half * d, int size);
void KernelSign(__half * a, __half * b, int size);
/* set each entry to its sign value */
extern "C"
void _CudaSign(XTensor * a);
void _CudaSign(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */
extern "C"
void _Sign(XTensor * a);
void _Sign(const XTensor * a, XTensor * b);
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
void _SignMe(XTensor * a);
set every entry to its sign value (return a XTensor structure)
make a new tensor to keep the result and return it
XTensor Sign(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication (BLAS) */
extern "C"
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA
......@@ -46,7 +47,8 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
......@@ -54,11 +56,13 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
......@@ -96,7 +96,6 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
int order = a.order;
int * dimSize = new int[order];
CheckNTErrors(&a != NULL, "Empty input tensors!");
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
CheckNTErrors(low < high, "Illegal range specified!");
......@@ -110,8 +109,8 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
dimSize[i] = a.dimSize[i];
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
/* call _SelectRange function */
......@@ -21,6 +21,8 @@
#include "SetData.h"
#include "SetData.cuh"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#if !defined( WIN32 ) && !defined( _WIN32 )
......@@ -36,12 +38,150 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
generate data items with a fixed value p
>> tensor - the tensor whose data array would be initialized
>> p - pointer to the number for initializing the tensor
void _SetDataFixed(XTensor * tensor, void * valuePointer)
int num = tensor->unitNum;
if(tensor->dataType == X_INT){
int p = *(int*)valuePointer;
if(tensor->devID < 0){
int * d = (int*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
for(int i = 0; i < num; i++)
d[i] = p;
#ifdef USE_CUDA
CudaSetDataFixedInt(tensor, p);
else if(tensor->dataType == X_FLOAT){
float p = *(float*)valuePointer;
if(tensor->devID < 0){
float * d = (float*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
for(int i = 0; i < num; i++)
d[i] = p;
#ifdef USE_CUDA
CudaSetDataFixedFloat(tensor, p);
else if(tensor->dataType == X_DOUBLE){
double p = *(double*)valuePointer;
if(tensor->devID < 0){
double * d = (double*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
for(int i = 0; i < num; i++)
d[i] = p;
#ifdef USE_CUDA
CudaSetDataFixedDouble(tensor, p);
generate data items with a fixed value p (in default type)
>> tensor - the tensor whose data array would be initialized
>> p - number in default type
void SetDataFixed(XTensor &tensor, DTYPE p)
_SetDataFixed(&tensor, &p);
generate data items with a fixed value p (in integer)
>> tensor - the tensor whose data array would be initialized
>> p - an int-valued number
void _SetDataFixedInt(XTensor * tensor, int p)
CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT");
if(p == 0)
_SetDataFixed(tensor, &p);
generate data items with a fixed value p (in float)
>> tensor - the tensor whose data array would be initialized
>> p - a float-valued number
void _SetDataFixedFloat(XTensor * tensor, float p)
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_INT");
if(p == 0)
_SetDataFixed(tensor, &p);
generate data items with a fixed value p (in double)
>> tensor - the tensor whose data array would be initialized
>> p - a double-valued number
void _SetDataFixedDouble(XTensor * tensor, double p)
CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_INT");
if(p == 0)
_SetDataFixed(tensor, &p);
generate data items with a uniform distribution in [low,high]
>> tensor - the tensor whose data array would be initialized
>> low - lower value of the range
>> high - higher value of the range
void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
if(tensor == NULL)
......@@ -76,7 +216,7 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
SetDataRand(t2, low, high);
_SetDataRand(t2, low, high);
_CopyValues(t2, tensor);
delete t2;
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: XIAO Tong (email: 2018-07-18
* I'm surprised that I did not write this file till today.
#include "SetData.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
set an integer data array with a fixed value p (in int)
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
void KernelSetDataFixedInt(int * d, int size, int p)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
generate data items with a fixed value p (in int)
>> tensor - the tensor for initialization
>> p - the initial value
void CudaSetDataFixedInt(XTensor * tensor, int p)
CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedInt <<<blocks, threads >>>((int*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
set a float data array with a fixed value p (in float)
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
void KernelSetDataFixedFloat(float * d, int size, float p)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
generate data items with a fixed value p (in float)
>> tensor - the tensor for initialization
>> p - the initial value
void CudaSetDataFixedFloat(XTensor * tensor, float p)
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
set a double data array with a fixed value p (in double)
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
void KernelSetDataFixedDouble(double * d, int size, double p)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
generate data items with a fixed value p (in double)
>> tensor - the tensor for initialization
>> p - the initial value
void CudaSetDataFixedDouble(XTensor * tensor, double p)
CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
} // namespace nts(NiuTrans.Tensor)
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* $Created by: XIAO Tong (email: 2018-07-18
* I'm surprised that I did not write this file till today.
#ifndef __SETDATA_CUH__
#define __SETDATA_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p (in int) */
void CudaSetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void CudaSetDataFixedDouble(XTensor * tensor, double p);
} // namespace nts(NiuTrans.Tensor)
#endif // __SETDATA_CUH__
\ No newline at end of file
......@@ -28,28 +28,25 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p */
extern "C"
void SetDataFixed(XTensor * tensor, void * valuePointer);
void _SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value p (in default type) */
void SetDataFixed(XTensor &tensor, DTYPE p);
/* generate data items with a fixed value p (in int) */
extern "C"
void SetDataFixedInt(XTensor * tensor, int p);
void _SetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
extern "C"
void SetDataFixedFloat(XTensor * tensor, float p);
void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
extern "C"
void SetDataFixedDouble(XTensor * tensor, double p);
void _SetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a uniform distribution in [low,high] */
extern "C"
void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high);
void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high);
/* generate data items with a normal distribution with specified mean and standard deviation */
extern "C"
void SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation);
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation);
} // namespace nts(NiuTrans.Tensor)
......@@ -20,6 +20,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "Log.h"
#include "Log.cuh"
#include <math.h>
......@@ -27,22 +28,55 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its log value
>> a - the tensor we are processing
set every entry to its log value
>> a - input tensor we are processing
>> b - output tensor we are processing
void _Log(XTensor * a)
void _Log(const XTensor * a, XTensor * b)
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaLog(a, b);
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)log(d[i]);
db[i] = (DTYPE)log(d[i]);
set every entry to its log value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
void _LogMe(XTensor * a)
_Log(a, a);
set every entry to its log value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the log value of the input tensor
XTensor Log(const XTensor & a)
XTensor b(&a);
/* call _Log function */
_Log(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_LOG);
return b;
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
set each entry to its log value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
void KernelLog(DTYPE * d, int size)
void KernelLog(DTYPE * a, DTYPE * b, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = log(d[i]);
b[i] = log(a[i]);
set each entry to its log value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
void KernelLog(__half * d, int size)
void KernelLog(__half * a, __half * b, int size)
set each entry to its log value
>> a - the tensor
>> a - input tensor
>> b - output tensor
extern "C"
void _CudaLog(XTensor * a)
void _CudaLog(const XTensor * a, XTensor * b)
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -74,10 +78,10 @@ void _CudaLog(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelLog << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelLog << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
else if (a->dataType == X_FLOAT16) {
KernelLog << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelLog << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
else {
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its log value (CUDA Kernel) */
void KernelLog(DTYPE * d, int size);
void KernelLog(DTYPE * a, DTYPE * b, int size);
/* set each entry to its log value (CUDA Kernel) with float16 data type*/
void KernelLog(__half * d, int size);
void KernelLog(__half * a, __half * b, int size);
/* set each entry to its log value */
extern "C"
void _CudaLog(XTensor * a);
void _CudaLog(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its log value */
extern "C"
void _Log(XTensor * a);
void _Log(const XTensor * a, XTensor * b);
set every entry to its log value (do it on site)
keep the result in the input tensor a and return nothing
void _LogMe(XTensor * a);
set every entry to its log value (return a XTensor structure)
make a new tensor to keep the result and return it
XTensor Log(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Power.h"
#include "Power.cuh"
......@@ -28,38 +29,73 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
get the power(a, p)
>> a - the tensor
>> p - as it is
>> a - input tensor
>> b - output tensor
>> p - parameter
void _Power(XTensor * a, DTYPE p)
void _Power(const XTensor * a, XTensor * b, DTYPE p)
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaPower(a, p);
_CudaPower(a, b, p);
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * aData = (DTYPE*)a->data;
DTYPE * bData = (DTYPE*)b->data;
if (p == 0) {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)1.0;
bData[i] = (DTYPE)1.0;
else if (p == (DTYPE)0.5) {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)sqrt(d[i]);
bData[i] = (DTYPE)sqrt(aData[i]);
else if (p == (DTYPE)2.0) {
for (int i = 0; i < a->unitNum; i++)
d[i] = d[i] * d[i];
bData[i] = aData[i] * aData[i];
else {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)pow(d[i], p);
bData[i] = (DTYPE)pow(aData[i], p);
get the power(a, p) (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor
>> p - parameter
void _PowerMe(XTensor * a, DTYPE p)
_Power(a, a, p);
get the power(a, p) (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor
>> p - parameter
<< return - the power value of the input tensor
XTensor Power(const XTensor & a, DTYPE p)
XTensor b(&a);
/* call _Power function */
_Power(&a, &b, p);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
return b;
} // namespace nts(NiuTrans.Tensor)
......@@ -21,6 +21,7 @@
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../movement/CopyValues.cuh"
#include "Power.h"
#include "Power.cuh"
......@@ -30,74 +31,80 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set all entries to its root (CUDA Kernel)
>> d - data array
>> a - input data array
>> b - output data array
>> size - size of the data array
void KernelSqrtV2(DTYPE * d, int size)
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = sqrt(d[i]);
b[i] = sqrt(a[i]);
set all entries to its root (CUDA Kernel)
>> d - data array
>> a - input data array
>> b - output data array
>> size - size of the data array
void KernelSqrtV2(__half * d, int size)
void KernelSqrtV2(__half * a, __half * b, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
d[i] = hsqrt(d[i]);
b[i] = hsqrt(a[i]);
if (i < size)
d[i] = __float2half(sqrt(__half2float(d[i])));
b[i] = __float2half(sqrt(__half2float(a[i])));
get power(d[i], p)
>> d - data array
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
void KernelPower(DTYPE * d, DTYPE p, int size)
void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = pow(d[i], p);
b[i] = pow(a[i], p);
get power(d[i], p)
>> d - data array
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
void KernelPower(__half * d, __half p, int size)
void KernelPower(__half * a, __half * b, __half p, int size)
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = __float2half(pow(__half2float(d[i]), __half2float(p)));
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
/* get the power of the entries */
extern "C"
void _CudaPower(XTensor * a, DTYPE p)
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
int gridSize[3];
int blockSize[3];
......@@ -111,15 +118,18 @@ void _CudaPower(XTensor * a, DTYPE p)
if (a->dataType == DEFAULT_DTYPE) {
if (p == (DTYPE)0.5) {
KernelSqrtV2 << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelSqrtV2 << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
else if (p == (DTYPE)1.0) {
_CudaCopyValues(a, b);
else if (p != (DTYPE)1.0) {
KernelPower << <blocks, threads >> >((DTYPE*)a->data, p, a->unitNum);
KernelPower << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, p, a->unitNum);
else if (a->dataType == X_FLOAT16) {
if (p == (DTYPE)0.5) {
KernelSqrtV2 << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelSqrtV2 << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
else if (p != (DTYPE)1.0) {
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set all entries to its root (CUDA Kernel) */
void KernelSqrtV2(DTYPE * d, int size);
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size);
/* set all entries to its root (CUDA Kernel) */
void KernelSqrtV2(__half * d, int size);
void KernelSqrtV2(__half * a, __half * b, int size);
/* get the power of the entries */
extern "C"
void _CudaPower(XTensor * a, DTYPE p);
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
#endif // USE_CUDA
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */
extern "C"
void _Power(XTensor * a, DTYPE p);
void _Power(const XTensor * a, XTensor * b, DTYPE p);
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
void _PowerMe(XTensor * a, DTYPE p);
get the power(x, y) (return a XTensor structure)
make a new tensor to keep the result and return it
XTensor Power(const XTensor & a, DTYPE p);
} // namespace nts(NiuTrans.Tensor)
......@@ -110,8 +110,7 @@ make a new tensor to keep the result and return it
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
CheckNTErrors(&s, "Empty input tensor!");
CheckNTErrors((dim >= 0 && dim < s.order), "A too larget dimension specified!");
CheckNTErrors(dim >= 0 && dim < s.order, "A too larget dimension specified!");
int order = s.order;
int * dimSize = new int[order];
......@@ -123,16 +122,13 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
dimSize[i] = s.dimSize[i];
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::AddParamToHeadInt(&t, dim);
......@@ -141,6 +137,9 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
XLink::AddParamToHeadPointer(&t, tgtIndex);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
......@@ -101,32 +101,31 @@ make a new tensor to keep the result and return it
XTensor ReduceMax(const XTensor &input, int dim)
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
/* destroy variables */
delete[] dimSize;
return output;
......@@ -58,20 +58,19 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
XTensor ReduceMean(const XTensor &input, int dim)
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
/* call _ReduceMean function */
......@@ -214,20 +214,19 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
/* call _ReduceSum function */
......@@ -237,6 +236,53 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
/* destroy variables */
delete[] dimSize;
return output;
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i)^power if isExp == false
sum = \sum_i exp((a_i)^power) if isExp == true
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
>> isExp - specify if the exp() is performed
>> power - we perform pow(item_i, power) on each item in the array
<< return - the sum along a dimension of the tensor
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
/* destroy variables */
delete[] dimSize;
......@@ -43,7 +43,16 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false);
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i) if isExp == false
sum = \sum_i exp(a_i) if isExp == true
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false);
} // namespace nts(NiuTrans.Tensor)
......@@ -54,20 +54,19 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
/* call _ReduceSumSquared function */
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: 2018-04-24
#include "../../XName.h"
#include "../math/ScaleAndShift.h"
#include "ReduceSum.h"
#include "ReduceVariance.h"
......@@ -56,25 +57,28 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
/* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean);
/* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
/* destroy variables */
delete[] dimSize;
......@@ -68,8 +68,7 @@ or "Merge" by means of the tensor shapes
XTensor Concatenate(const XList &smalls, int dim)
CheckNTErrors(&smalls != NULL, "Invalid list!");
CheckNTErrors((smalls.count > 0), "Empty list!");
CheckNTErrors(smalls.count > 0, "Empty list!");
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
bool uniform = true;
......@@ -80,40 +79,35 @@ XTensor Concatenate(const XList &smalls, int dim)
if (!XTensor::IsIdentical(a, b))
uniform = false;
int * dimSize;
if (uniform) {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
dimSize = new int[order];
int * dimSize = new int[order];
if (uniform) {
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
dimSize[i] = tensor->dimSize[dim] * smalls.count;
XTensor big = XTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
/* call _Merge function */
_Merge(&smalls, &big, dim);
///* tensor connection */
//XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
//XLink::AddParamToHead(&big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
/* destroy variables */
delete dimSize;
delete[] dimSize;
return big;
else {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
dimSize = new int[order];
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
......@@ -125,15 +119,19 @@ XTensor Concatenate(const XList &smalls, int dim)
dimSize[dim] = catDimSize;
XTensor big = NewTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
/* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
/* destroy variables */
delete dimSize;
delete[] dimSize;
return big;
......@@ -168,12 +166,76 @@ make a new tensor to keep the result and return it.
XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
XList smalls(2);
/* call Concatenate function */
return Concatenate(smalls, dim);
bool uniform = true;
for (int i = 1; i < smalls.count; i++) {
XTensor * a = (XTensor*)smalls.Get(i - 1);
XTensor * b = (XTensor*)smalls.Get(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b))
uniform = false;
XTensor * tensor = (XTensor*)smalls.Get(0);
int order = tensor->order;
int * dimSize = new int[order];
if (uniform) {
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
dimSize[i] = tensor->dimSize[dim] * smalls.count;
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
/* call _Merge function */
_Merge(&smalls, &big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
/* destroy variables */
delete[] dimSize;
return big;
else {
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
int catDimSize = 0;
for (int i = 0; i < smalls.count; i++) {
XTensor * tensor = (XTensor*)smalls.Get(i);
catDimSize += tensor->dimSize[dim];
dimSize[dim] = catDimSize;
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
/* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
/* destroy variables */
delete[] dimSize;
return big;
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -36,7 +36,7 @@ concatenate a list of tensors along a given dimension
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
CheckNTErrors((big->order > dim && dim >= 0), "Illegal dimension to concatenate!");
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0;
int dimRDI = big->order - dim - 1;
......@@ -30,8 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
extern "C"
void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum);
#endif // USE_CUDA
......@@ -161,8 +161,7 @@ e.g., (N/3, M, 3) -> (N, M)
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
CheckNTErrors(&s != NULL, "Invalid tensors!");
CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
CheckNTErrors(leadingDim < whereToMerge, "Invalid leading dimension!");
if (leadingDim < 0)
leadingDim = 0;
......@@ -180,13 +179,18 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
/* call _Merge function */
_Merge(&s, &t, whereToMerge, leadingDim);
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE);
XLink::AddParamToHeadInt(&t, whereToMerge);
XLink::AddParamToHeadInt(&t, leadingDim);
/* destroy variables */
delete[] dimSize;
......@@ -327,13 +331,58 @@ XTensor Merge(const XList &smalls, int whereToMerge)
dimSize[i] = tensor->dimSize[whereToMerge] * smalls.count;
XTensor big = NewTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
/* call _Merge function */
_Merge(&smalls, &big, whereToMerge);
/* tensor connections */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
XLink::AddParamToHeadInt(&big, whereToMerge);
/* destroy variables */
delete[] dimSize;
return big;
merge two tensors into a big tensor (return a XTensor structure)
>> smallA - the first small tensor
>> smallB - the second small tensor
>> whereToMerge - the merging operation is along with which dimension
<< return - the big tensor merged by small tensors
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
CheckNTErrors(XTensor::IsIdentical(&smallA, &smallB),
"The two tensors must be of the same size!");
int order = smallA.order;
int * dimSize = new int[order];
for (int i = 0; i < smallA.order; i++) {
if (i != whereToMerge)
dimSize[i] = smallA.dimSize[i];
dimSize[i] = smallA.dimSize[whereToMerge] * 2;
float dr = (!smallA.isSparse) ? 1.0F : smallA.denseRatio;
XTensor big(order, dimSize, smallA.dataType, dr, smallA.devID, smallA.mem);
XList smalls(2);
/* call _Merge function */
_Merge(&smalls, &big, whereToMerge);
/* tensor connections */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
XLink::AddParamToHeadInt(&big, whereToMerge);
/* destroy variables */
delete[] dimSize;
......@@ -29,22 +29,19 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* transform a tensor by merging it along a dimension, e.g., (M, N/3, 3) -> (M, N) */
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -1);
transform a tensor by merging it along a dimension (return a XTensor structure).
make a new tensor to keep the result and return it.
e.g., (M, N/3, 3) -> (M, N)
/* transform a tensor by merging it along a dimension (return a XTensor structure)
e.g., (M, N/3, 3) -> (M, N) */
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1);
/* merge small tensors into a big tensor */
void _Merge(const XList * smalls, XTensor * big, int whereToMerge);
merge small tensors into a big tensor (return a XTensor structure).
make a new tensor to keep the result and return it.
/* merge small tensors into a big tensor (return a XTensor structure) */
XTensor Merge(const XList &smalls, int whereToMerge);
/* merge two tensors into a big tensor (return a XTensor structure) */
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);
} // namespace nts(NiuTrans.Tensor)
#endif // __MERGE_H__
\ No newline at end of file
......@@ -19,10 +19,12 @@
* $Created by: XIAO Tong (email: 2018-04-24
#include "../../XTensor.h"
#include "../../XUtility.h"
#include "Split.h"
#include "MakeSplitBlockIndex.h"
#include "../../XName.h"
#include "../../XTensor.h"
#include "../../XUtility.h"
#include "../movement/CopyBlocksOnSite.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -146,21 +148,26 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
int order = s.order + 1;
int * dimSize = new int[order];
dimSize[0] = splitNum;
for (int i = 0; i < s.order; i++) {
if (i == whereToSplit)
dimSize[i] = s.dimSize[i] / splitNum;
dimSize[i+1] = s.dimSize[i] / splitNum;
dimSize[i] = s.dimSize[i];
dimSize[i+1] = s.dimSize[i];
dimSize[-1] = splitNum;
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
/* call _Split function */
_Split(&s, &t, whereToSplit, splitNum);
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit);
XLink::AddParamToHeadInt(&t, splitNum);
/* destroy variables */
delete[] dimSize;
......@@ -168,7 +175,7 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
split a big tensor into small tensors.
split a big tensor into small tensors
>> big - the source tensor
>> smalls - the list that keeps the resulting tensors (for return)
......@@ -274,43 +281,29 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
split a big tensor into small tensors (return a XList structure).
make a new list to keep the result and return it.
split a big tensor into small tensors
>> big - the source tensor
>> smalls - the list that keeps the resulting tensors (for return)
NOTE that all the "small" tensors have already been placed in the list in advance.
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
<< return - a list of small tensors by splitting a big tensor
XList SplitList(const XTensor &big, int whereToSplit, int splitNum)
void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum)
CheckNTErrors(&big, "Invalid tensors!");
XList smalls = XList(splitNum);
int order = big.order;
int * dimSize = new int[order];
for (int i = 0; i < big.order; i++) {
if (i != whereToSplit)
dimSize[i] = big.dimSize[i];
dimSize[i] = big.dimSize[i] / splitNum;
for (int i = 0; i < splitNum; i++) {
XTensor tensor = NewTensor(order, dimSize, big.dataType, big.denseRatio, big.devID, big.mem);
/* call _Split function */
_Split(&big, &smalls, whereToSplit, splitNum);
/* destroy variables */
delete[] dimSize;
/* tensor connections */
for(int i = 0; i < smalls.count; i++){
XTensor * s = (XTensor*)smalls.Get(i);
XLink::MakeLink(&big, NULL, s, SHAPE_SPLIT_LIST);
XLink::AddParamToHeadInt(s, whereToSplit);
return smalls;
/* it is tricky here that we keep the id of each
block, rather than the total number of splits */
XLink::AddParamToHeadInt(s, i);
} // namespace nts(NiuTrans.Tensor)
......@@ -46,7 +46,7 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
split a big tensor into small tensors
keep the resulting tensors in the list "smalls" and return nothing
XList SplitList(const XTensor &big, int whereToSplit, int splitNum);
void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum);
} // namespace nts(NiuTrans.Tensor)
......@@ -108,8 +108,6 @@ make a new tensor to keep the result and return it
XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
CheckNTErrors(&a, "Empty input tensors!");
int order = a.order + 1;
int * dimSize = new int[order];
......@@ -122,13 +120,18 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
dimSize[i] = a.dimSize[i - 1];
XTensor b = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor b(order, dimSize, a.dataType, dr, a.devID, a.mem);
/* call _Unsqueeze function */
_Unsqueeze(&a, &b, dim, dSize);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, SHAPE_UNSQUEEZE);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&b, dSize);
/* destroy variables */
delete[] dimSize;
......@@ -29,13 +29,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
sort the tensor along a given dimension
>> a - the tensor
>> a - input tensor
>> b - output tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
void _Sort(XTensor * a, XTensor * index, int dim)
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!");
CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
......@@ -46,7 +47,7 @@ void _Sort(XTensor * a, XTensor * index, int dim)
if (a->devID >= 0) {
#ifdef USE_CUDA
_CudaSortBig(a, a, index, index, dim);
_CudaSortBig(a, b, index, index, dim);
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
......@@ -64,12 +65,13 @@ void _Sort(XTensor * a, XTensor * index, int dim)
for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < stride; i++) {
void * data = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataA = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int);
/* we sort the data array along "dim" */
if (a->dataType == X_FLOAT)
XQSort(data, indexData, strideNum, a->unitSize, stride, CompXFloat);
XQSort(dataA, dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
else {
......@@ -78,4 +80,40 @@ void _Sort(XTensor * a, XTensor * index, int dim)
sort the tensor along a given dimension (do it on site)
keep the result in the input tensor a and return nothing
>> a - input tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
void _SortMe(XTensor * a, XTensor * index, int dim)
_Sort(a, a, index, dim);
sort the tensor along a given dimension
keep the sorted result in tensor b and the item indices in tensor index (return nothing)
>> a - input tensor
>> b - output tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim)
/* call _Sort function */
_Sort(&a, &b, &index, dim);
/* tensor connections */
XList list(2);
XLink::MakeLink(&a, &list, SORT_SORT);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, dim);
} // namespace nts(NiuTrans.Tensor)
......@@ -39,7 +39,7 @@ bitonic sort (for each row in a matrix)
>> n - row number of the matrix
template<class T> __global__
void KernelBitonicSort2D(void * data, int j, int k, int m, int n)
void KernelBitonicSort2D(void * data, int j, int k, int m, int n)
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -74,7 +74,7 @@ bitonic sort (for each row in a matrix) with index
>> n - row number of the matrix
template<class T> __global__
void KernelBitonicSort2D(void * data, int * index, int j, int k, int m, int n)
void KernelBitonicSort2D(void * data, int * index, int j, int k, int m, int n)
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -27,8 +27,20 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sort the data along a given dimension */
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim);
sort the data along a given dimension (do it on site)
keep the result in the input tensor a and return nothing
void _SortMe(XTensor * a, XTensor * index, int dim);
sort the data along a given dimension
keep the sorted result in tensor b and the item indices in tensor index (return nothing)
extern "C"
void _Sort(XTensor * a, XTensor * index, int dim);
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim);
} // namespace nts(NiuTrans.Tensor)
......@@ -105,4 +105,29 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
get the top-k items along a given dimension
>> a - input tensor
>> b - output tensor (top-k result)
>> index - index of the top-k items
>> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k)
_TopK(&a, &b, &index, dim, k);
/* tensor connection */
XList list(2);
XLink::MakeLink(&a, &list, SORT_TOPK);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, k);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, k);
} // namespace nts(NiuTrans.Tensor)
......@@ -30,6 +30,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
extern "C"
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
/* get the top-k items along a given dimension */
extern "C"
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k);
} // namespace nts(NiuTrans.Tensor)
#endif // __TOPK_H__
\ No newline at end of file
......@@ -20,6 +20,7 @@
#include <stdlib.h>
#include "../XName.h"
#include "HardTanH.h"
#include "HardTanH.cuh"
......@@ -59,6 +60,30 @@ void _HardTanH(const XTensor * x, XTensor * y)
hard tanh function (return a XTensor structure)
make a new tensor to keep the result and return it
y = 1 if x > 1
x if -1 <= x <= 1
-1 if x < -1
>> x - input tensor
<< return - y
XTensor HardTanH(const XTensor &x)
XTensor y(&x);
/* call _HardTanH function */
_HardTanH(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
return y;
backward computation
dE/dx = dE/dy * dy/dx
......@@ -77,7 +102,7 @@ hard tanh: y = 1 if x > 1
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
void HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......@@ -86,7 +111,7 @@ void HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
CudaHardTanHBackward(gold, y, x, dedy, dedx, lossName);
_CudaHardTanHBackward(gold, y, x, dedy, dedx, lossName);
......@@ -95,7 +120,7 @@ void HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
Markdown 格式
您添加了 0 到此讨论。请谨慎行事。
注册 或者 后发表评论