Commit 2a2bd312 by 张裕浩

update files and start work

parent 30dd9d30
# NiuTrans.Tensor Environment Setup
## Notes
The latest CUDA release (9.2) does not yet support the latest version of VS2017. We therefore recommend CUDA 9.0 or 9.1 together with VS2015, or, if you use VS2017, installing the v140 toolset. Set the solution platform to x64.
## CUDA Settings
Once VS and CUDA are installed and the environment variables are set, the key CUDA options are listed below. All of them can be found under **Project -> Properties**.
>$(CUDA_PATH)\include
Add this to **VC++ Directories -> Include Directories**.
>$(CUDA_PATH)\lib\Win32
Add this to **VC++ Directories -> Library Directories**.
>cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
Add these to **Linker -> Input -> Additional Dependencies**.
After that, right-click the project, open **Project -> Project Dependencies**, and select CUDA 9.
Right-click each .cu file, open its properties, and set the Item Type to "CUDA C/C++" (it is easiest to search for all .cu files and set them in one pass).
## Other Settings
Set **C/C++ -> General -> SDL checks** to No.
Under **C/C++ -> Preprocessor -> Preprocessor Definitions**, add
>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CONSOLE;
Set **Linker -> System -> SubSystem** to Console.
Set **General -> Character Set** to Use Unicode Character Set.
Set any command-line arguments the executable needs under **Debugging -> Command Arguments**.
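To verify the setup, a small smoke test can help. The sketch below is a hypothetical standalone file (not part of the library); it only assumes the USE_CUDA macro defined above and the standard CUDA runtime API.

```cpp
// check_cuda.cpp - hypothetical smoke test for the configuration above
#include <stdio.h>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif

int main()
{
#ifdef USE_CUDA
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("found %d CUDA device(s)\n", count);
#else
    printf("built without USE_CUDA\n");
#endif
    return 0;
}
```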
......@@ -21,95 +21,275 @@
#include <stdio.h>
#include "XNet.h"
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../tensor/test/Test.h"
#include <cuda_runtime.h>
#include <time.h>
#include <windows.h>
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
//#include "../tensor/timer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
void SplitBackwardTest();
void MemTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
void test()
{
int * dimSize = new int[2];
dimSize[0] = 10000;
dimSize[1] = 1000;

/* b1 lives on GPU 0; b3 lives on the CPU (devID = -1) */
XTensor b1(2, dimSize, X_FLOAT, 1.0F, 0, NULL);
XTensor b3(2, dimSize, X_FLOAT, 1.0F, -1, NULL);

DWORD m_start_time;
DWORD m_end_time;
double time_diff = 0.0;

/* time _SetDataRand on the GPU */
m_start_time = GetTickCount();
_SetDataRand(&b1, -2.0F, 2.0F);
cudaDeviceSynchronize();
m_end_time = GetTickCount();
time_diff = m_end_time - m_start_time;
printf("time %f ms\n", time_diff);

/* time _SetDataRand on the CPU */
m_start_time = GetTickCount();
_SetDataRand(&b3, -2.0F, 2.0F);
m_end_time = GetTickCount();
time_diff = m_end_time - m_start_time;
printf("time %f ms\n", time_diff);

/* convert a float32 tensor to float16 on the GPU */
XTensor a;
InitTensor2D(&a, 100, 100, X_FLOAT, 0);
XTensor b;
InitTensor2D(&b, 100, 100, X_FLOAT16, 0);
_ConvertDataType(&a, &b);

delete[] dimSize;
}
int main( int argc, const char ** argv )
{
//_CrtSetBreakAlloc(896);
//test();
//BackwardTest();
//SplitBackwardTest();
//MemTest();

if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}

//_CrtDumpMemoryLeaks();
return 0;
}
void BackwardTest()
{
XNet net;

XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 2, 3);
InitTensor1D(&b, 2);

a.SetZeroAll();
b.SetZeroAll();

a.Set2D(1.0F, 0, 0);
a.Set2D(2.0F, 0, 1);
a.Set2D(3.0F, 0, 2);
a.Set2D(4.0F, 1, 0);
a.Set2D(5.0F, 1, 1);
a.Set2D(6.0F, 1, 2);

b.Set1D(2.0F, 0);
b.Set1D(1.0F, 1);

a.Dump(stderr, "a:");
b.Dump(stderr, "b:");

/* c_{i,j} = a_{i,j} / b_i (division broadcast along dimension 0) */
c = DivDim(a, b, 0);
c.Dump(stderr, "c:");
//XLink::ShowNetwork(stderr, &c);

net.Backward(c);
net.Dump(stderr);
}
void TransposeTest()
{
#ifdef USE_CUDA
XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
//XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
XTensor x;
XTensor y;
XTensor z;
int loops = 2000;
int B = 3 * 2 * 4;
int K = 8 * 1;
int N = 50;
int H = 512 * 4;
int nnn = GDevs.nGPU;
InitTensor3D(&x, B, N, H, X_FLOAT, 0);
InitTensor4D(&y, K, B, N, H/K, X_FLOAT, 0);
InitTensor3D(&z, B, N, H, X_FLOAT, 0);
cudaEvent_t ctime0;
cudaEvent_t ctime1;
cudaEvent_t ctime2;
cudaEvent_t ctime3;
cudaEvent_t ctime4;
cudaEvent_t ctime5;
float elapsedSplit = 0.0;
float elapsedMerge = 0.0;
float elapsedSum = 0.0;
cudaEventCreate(&ctime0);
cudaEventCreate(&ctime1);
cudaEventCreate(&ctime2);
cudaEventCreate(&ctime3);
cudaEventCreate(&ctime4);
cudaEventCreate(&ctime5);
cudaEventRecord(ctime0, 0);
double time0 = GetClock();
for(int i = 0; i < loops; i++)
_Split(&x, &y, 2, K);
double time1 = GetClock();
cudaEventRecord(ctime1, 0);
cudaEventSynchronize(ctime1);
cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);
cudaEventRecord(ctime2, 0);
double time2 = GetClock();
for(int i = 0; i < loops; i++)
_Merge(&y, &x, 3);
double time3 = GetClock();
cudaEventRecord(ctime3, 0);
cudaEventSynchronize(ctime3);
cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);
cudaEventRecord(ctime4, 0);
double time4 = GetClock();
for(int i = 0; i < loops; i++)
_Sum(&x, &z, &x);
double time5 = GetClock();
cudaEventRecord(ctime5, 0);
cudaEventSynchronize(ctime5);
cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);
fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);
#endif
}
void SumDimTest()
{
XTensor x;
XTensor y;
XTensor z;
int a = 5;
int b = 7;
int c = 3;
InitTensor3D(&x, a, b, c, X_FLOAT, -1);
InitTensor1D(&y, c, X_FLOAT, -1);
InitTensor3D(&z, a, b, c, X_FLOAT, -1);
x.SetZeroAll();
y.SetZeroAll();
z.SetZeroAll();
DTYPE * data = new DTYPE[x.unitNum];
for(int i = 0; i < x.unitNum; i++)
data[i] = (DTYPE)i;
x.SetData(data, x.unitNum);
for(int i = 0; i < y.unitNum; i++)
data[i] = -(DTYPE)i;
y.SetData(data, y.unitNum);
_SumDim(&x, &y, &z, 2);
z.Dump(stderr, "z:");
delete[] data;
}
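For reference, what _SumDim computes here, following the description used for SumDim elsewhere in this code (c = a + b * \beta, with b sized as one dimension of a and \beta defaulting to 1):

$$z_{i,j,k} = x_{i,j,k} + y_k \qquad (n = 2),$$

so the 3-element vector y is broadcast over the last dimension of x.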
void SplitBackwardTest()
{
XTensor t1;
InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
XTensor t2;
InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
XTensor tensor;

//_SetDataFixedFloat(&t1, 1.0F);
//_SetDataFixedFloat(&t2, 2.0F);
t1.SetDataRand();
t2.SetDataRand();
tensor = t1 + t2;

/* split the 2 x 4 tensor into two 2 x 2 pieces along dimension 1 */
XList smalls;
XTensor first;
XTensor second;
InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
smalls.Add(&first);
smalls.Add(&second);
Split(tensor, smalls, 1, 2);

XTensor sum;
sum = Sum(first, second);

XNet net;
net.Backward(sum);
net.Dump(stderr);

printf("Done!\n");
}
void MemTest()
{
XMem * mem;
mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
XTensor tensor;
InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
tensor.SetZeroAll();
tensor.Dump(stderr);
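/* note: deleting the pool below leaves tensor.mem dangling; the rest of this test deliberately probes that state */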
delete mem;
if (tensor.mem != NULL) {
printf("It isn't null!\n");
printf("%d\n", (int)tensor.mem->signature);
}
else {
printf("It's null\n");
}
tensor.Dump(stderr);
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "XNoder.h"
#include "XBackwardData.h"
#include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
namespace nts{
/* compute dE/dx of a node */
void XDataGrad::MakeGrad(XTensor * node, bool isEfficient)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == GETANDSET_CONVERTDATATYPE)
GradConvertDataType(node, isEfficient);
else if(operID == GETANDSET_INDEXTOONEHOT)
GradIndexToOnehot(node, isEfficient);
else if(operID == GETANDSET_ONEHOTTOINDEX)
GradOnehotToIndex(node, isEfficient);
else{
ShowNTErrors("TODO!");
}
}
/* indicates whether the node is for a data operation */
bool XDataGrad::IsDataOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & DATA_BASE) != 0;
}
/*
gradient computation for ConvertDataType
for
b = convertdatatype(a)
we have
dE/da = convertdatatype(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
_ConvertDataType(node->grad, input->grad);
}
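This works because type conversion is element-wise and value-preserving (up to rounding), so the Jacobian is the identity:

$$b_i = \mathrm{cast}(a_i) \;\Rightarrow\; \frac{\partial E}{\partial a_i} = \mathrm{cast}\!\left(\frac{\partial E}{\partial b_i}\right),$$

and the backward pass only needs to cast node->grad back to the input's data type, which is exactly what the call above does.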
/*
gradient computation for OnehotToIndex
for
b = onehottoindex(a)
no gradient is propagated back to a here; we only allocate
the gradient tensor of the input
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for OnehotToIndex!");

XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
}
/*
gradient computation for IndexToOnehot
for
b = indextoonehot(a)
no gradient is propagated back to a here; we only allocate
the gradient tensor of the input
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");

XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
}
} // namespace nts(NiuTrans.Tensor)
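For context, a sketch of how a caller might route nodes to this class; the names come from this file, while the surrounding XNet wiring is assumed rather than shown.

```cpp
/* hypothetical caller: route a node to the right gradient family */
void BackwardNodeSketch(XTensor * node, bool isEfficient)
{
    if (XDataGrad::IsDataOP(node))
        XDataGrad::MakeGrad(node, isEfficient);
    /* ... math, shape and function ops are handled by their own classes */
}
```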
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __XBACKWARDDATA_H__
#define __XBACKWARDDATA_H__
namespace nts{
/* this class computes the gradient for tensor data operation given a node */
class XDataGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficent);
/* indicates whether the node is for a shaping operation */
static
bool IsDataOP(XTensor * node);
private:
/* gradient computation for ConverDataType: b = converdatatype(a, datatype) */
static
void GradConvertDataType(XTensor * node, bool isEfficent);
/* gradient computation for IndexToOnehot: b = indextoonehot(a, num) */
static
void GradIndexToOnehot(XTensor * node, bool isEfficent);
/* gradient computation for OnehotToIndex: b = onehottoindex(a, num) */
static
void GradOnehotToIndex(XTensor * node, bool isEfficent);
};
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
......@@ -29,10 +29,8 @@
namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
......@@ -51,7 +49,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
......@@ -60,7 +58,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
......
......@@ -35,7 +35,7 @@ class XFuncGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for an activation function */
static
......
......@@ -21,8 +21,13 @@
#include "XBackwardLoss.h"
#include "../tensor/XName.h"
#include "../tensor/core/getandset/SetData.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
#include "../tensor/function/LogSoftmax.h"
#include "../tensor/function/Rectify.h"
#include "../tensor/function/Sigmoid.h"
#include "../tensor/function/Softmax.h"
namespace nts{
......@@ -38,7 +43,7 @@ compute dE/dx for a given function y = f(x)
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
......@@ -49,9 +54,21 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_IDENTITY){
_IdentityBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SIGMOID){
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
......@@ -67,10 +84,26 @@ compute dE/dy for variable y and error(loss) function E
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
_LossBackward(dedy, gold, y, lossName);
if(gold == NULL){
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else{
ShowNTErrors("TODO");
}
return;
}
//_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold, NULL, padding);
}
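Assuming the standard cross-entropy used by _CrossEntropyBackward, $E = -\sum_i g_i \log y_i$, the gradient filled into dedy is

$$\frac{\partial E}{\partial y_i} = -\frac{g_i}{y_i},$$

masked by the padding tensor where one is given; when gold is NULL the code instead fills dE/dy with ones, i.e. it treats the root itself as the quantity being differentiated.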
}
\ No newline at end of file
......@@ -36,13 +36,13 @@ class XLossGrad
public:
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName);
};
......
......@@ -33,116 +33,131 @@ class XMathGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);

/* indicates whether the node is for a math operation */
static
bool IsMathOP(XTensor * node);

private:
/* gradient for absolute */
static
void GradAbsolute(XTensor * node, bool isEfficient);

/* gradient for cos */
static
void GradCos(XTensor * node, bool isEfficient);

/* gradient for exp */
static
void GradExp(XTensor * node, bool isEfficient);

/* gradient for log: c = log(a) */
static
void GradLog(XTensor * node, bool isEfficient);

/* gradient for round */
static
void GradRound(XTensor * node, bool isEfficient);

/* gradient for sign */
static
void GradSign(XTensor * node, bool isEfficient);

/* gradient for sin */
static
void GradSin(XTensor * node, bool isEfficient);

/* gradient for tan */
static
void GradTan(XTensor * node, bool isEfficient);

/* gradient for clip */
static
void GradClip(XTensor * node, bool isEfficient);

/* gradient for Divide */
static
void GradDiv(XTensor * node, bool isEfficient);

/* gradient for DivideDim */
static
void GradDivDim(XTensor * node, bool isEfficient);

/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * node, bool isEfficient);

/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
XTensor * dedc, DTYPE alpha, bool isEfficient);

/* gradient for matrix multiply in batch mode.
for each batch: c_i = matmul(a_i, b_i) * \alpha */
static
void GradMatrixMulBatched(XTensor * node, bool isEfficient);

/* gradient for multiply (dot product): c = a * b * \alpha */
static
void GradMultiply(XTensor * node, bool isEfficient);

/* gradient for multiply with one dimension: c = a * b * \alpha
where the size of b is equal to that of one dimension of a */
static
void GradMultiplyDim(XTensor * node, bool isEfficient);

/* gradient for negate */
static
void GradNegate(XTensor * node, bool isEfficient);

/* gradient for normalize */
static
void GradNormalize(XTensor * node, bool isEfficient);

/* gradient for power */
static
void GradPower(XTensor * node, bool isEfficient);

/* gradient for ScaleAndShift */
static
void GradScaleAndShift(XTensor * node, bool isEfficient);

/* gradient for Minus */
static
void GradSub(XTensor * node, bool isEfficient);

/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node, bool isEfficient);

/* gradient for sum: c = a + b * \beta */
static
void GradSum(XTensor * node, bool isEfficient);

/* gradient for sum with one dimension: c = a + b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSumDim(XTensor * node, bool isEfficient);

/* gradient for reduceMean */
static
void GradReduceMean(XTensor * node, bool isEfficient);

/* gradient for reduceSum */
static
void GradReduceSum(XTensor * node, bool isEfficient);

/* gradient for reduceSumSquared */
static
void GradReduceSumSquared(XTensor * node, bool isEfficient);

/* gradient for reduceVariance */
static
void GradReduceVariance(XTensor * node, bool isEfficient);
};
}
......
......@@ -34,7 +34,7 @@ class XShapeGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a shaping operation */
static
......@@ -42,39 +42,52 @@ public:
/* post processing of a node */
static
void PostProcessing(XTensor * node, int typeId, bool isEfficient);
private:
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static
void GradCopyIndexed(XTensor * node, bool isEfficient);

/* gradient computation for gathering indexed sub-tensors: b = gather(a, index) */
static
void GradGather(XTensor * node, bool isEfficient);

/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficient);

/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node, bool isEfficient);

/* gradient computation for transposing a tensor : b = transpose(a) */
static
void GradTranspose(XTensor * node, bool isEfficient);

/* gradient computation for reshaping a tensor: c = reshape(a) */
static
void GradReshape(XTensor * node, bool isEfficient);

/* gradient computation for split: c = split(a) */
static
void GradSplit(XTensor * node, bool isEfficient);

/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a) */
static
void GradSplitList(XTensor * node, bool isEfficient);

/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of splitting have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs at a time. This is good for system speed-up. */
static
void GradSplitListPost(XTensor * node, bool isEfficient);

/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node, bool isEfficient);
};
}
......
......@@ -47,6 +47,9 @@ struct XNet
/* input nodes of the network */
XList inputs;
/* indicates whether the network just keeps the gradient for parameter tensors */
bool isGradEfficient;
/* constructor */
XNet();
......@@ -56,25 +59,32 @@ struct XNet
/* clear the network */
void Clear();
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);

/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficient = false);

/* backward computation (in post processing) for a given node */
void BackwardNodePost(XTensor * node, bool isEfficient = false);
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
......@@ -89,6 +99,18 @@ struct XNet
/* dump network information */
void Dump(FILE * file);
/* set the flag of gradient-efficient */
void SetGradEfficientFlag(bool flag = true);
/* generate the gradient-efficient flag for every node */
void MakeEfficientNet();
/* clear the gradient information if the node is of no use */
void ClearGrad(XTensor * node);
/* show network topology */
void ShowNetwork(FILE * file, XTensor * node);
};
/* we make a unique id for every tensor */
......
......@@ -74,6 +74,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/*
entry of the program
......@@ -99,7 +100,7 @@ arguments:
(how many words)
-shuffle: shuffle the training data
-devid D: the id of the device used
-1: CPU, >=0: GPUs
-mempool: use memory pools for memory management
-autodiff: use automatic differentiation for training
......@@ -230,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
}
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], "-mempool"))
if (!strcmp(argv[i], "-mempool"))
model.mem = new XMem(model.devID);
}
}
......@@ -476,7 +477,12 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
Clear(model, true);
/* forward + backward process */
/* this is implemented with the gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);

/* this is implemented with the multiply function */
//ForwardAutoDiff(inputs, output, model);
/* automatic differentiation */
autoDiffer.Backward(output, gold, CROSSENTROPY);
......@@ -508,6 +514,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
if(isEnd)
break;
Test(testFN, outputFN, model);
}
double elapsed = GetClockSec() - startT;
......@@ -707,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise.
>> devID - device id
>> mem - memory pool
*/
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
tensor.SetZeroAll();
/* set non-zero cells */
for(int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]);
}
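A hypothetical call, for reference: building a 2 x 3 tensor with ones at positions (0, 2) and (1, 0).

```cpp
/* hypothetical usage sketch for InitZeroOneTensor2D */
int rows[] = {0, 1};
int cols[] = {2, 0};
XTensor t;
InitZeroOneTensor2D(t, 2, 3, rows, cols, 2, -1, NULL); /* devID = -1: CPU, no memory pool */
```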
/*
......@@ -851,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = log-softmax(s) */
_LogSoftmax(&s, &y, 1);
}
}
/*
......@@ -884,7 +882,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* for y = softmax(s), we get dE/ds
where E is the error function (defined by the loss) */
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);
/* for s = x * w, we get
dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
......@@ -975,7 +973,55 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
}
/*
forward process (with tensor connections); this is implemented with the gather function
>> ngrams - the loaded ngrams
>> batch - the number of ngrams in the batch
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
int size = batch * (n-1);
int * index = new int[size];
for(int i = 0; i < batch; i++){
for (int j = 0; j < n-1; j++){
int a = i * (n - 1) + j;
index[a] = ngrams[i].words[j];
}
}
InitTensor1D(&words, size, X_INT, model.devID, model.mem);
words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
delete[] index;
int dimSize[2];
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */
for(int i = 0; i < depth; i++)
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
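As a concrete illustration with hypothetical sizes: for batch = 2 and n = 3, each ngram contributes n - 1 = 2 context words, so words holds 4 indices, Gather returns a 4 x eSize matrix embeddingBig, and the Reshape folds it into the 2 x (2 * eSize) matrix hidden, one row of concatenated context embeddings per ngram.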
/*
forward process (with tensor connections); this is implemented with the multiply function
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
......@@ -1011,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
}
/*
......@@ -1122,8 +1167,12 @@ void Test(const char * test, const char * result, FNNModel &model)
/* forward computation */
Forward(inputs, output, model, net);
}
else {
/* this is implemented with the gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);

/* this is implemented with the multiply function */
//ForwardAutoDiff(inputs, output, model);
}
/* prediction probabilities */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is a simple implementation of the feed-forward network-based language
* model (FNNLM). See more details about FNNLM in
* "A Neural Probabilistic Language Model" by Bengio et al.
* Journal of Machine Learning Research 3 (2003) 1137-1155
*
* $Created by: ZHANG Yuhao (yoohao.zhang@gmail.com) 2018-08-08
* It's my first time writing a neural network; I hope there will be fewer bugs.
*/
#ifndef __MYFNNLM_H__
#define __MYFNNLM_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace myfnnlm {
void fnnlm();
}
#endif
\ No newline at end of file
......@@ -35,6 +35,8 @@ T2TAttention::T2TAttention()
dk = -1;
dv = -1;
d = -1;
isMasked = false;
ignored = 0;
}
/* deconstructor */
......@@ -46,13 +48,19 @@ T2TAttention::~T2TAttention()
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
isMasked = myIsMasked;
ignored = myIgnored;
float minmax = 0;
......@@ -61,18 +69,22 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-finfoutv, finfoutv);
wa.SetDataRand(-finfouta, finfouta);
}
/*
......@@ -82,9 +94,11 @@ make the network
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the additive mask applied to the attention scores
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
......@@ -105,14 +119,26 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
vheads = Split(v2, v2.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* att = softmax(Q * K^T / sqrt(dk/nhead) + mask) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if(isMasked)
dot = dot + mask;
dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
scalar = Softmax(dot, -1);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads and apply the output transformation wa */
return MMul(Merge(att, att.order - 1), wa);
}
}
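In formula form, each head computed in Make above is

$$\mathrm{Att}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k / h}} + M\right) V,$$

where h is the number of heads (nhead) and M is the additive mask (added only when isMasked is set); dropout is applied to the attention weights during training, and the merged heads are projected by wa.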
......@@ -57,6 +57,9 @@ public:
/* transformation matrix for V */
XTensor wv;
/* transformation after dot-product attention */
XTensor wa;
/* size of transformed Q and K */
int dk;
......@@ -66,6 +69,19 @@ public:
/* size of input Q, K and V */
int d;
/* indicates whether the attention is masked */
bool isMasked;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* indicates whether the model is used for training */
bool isTraining;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
T2TAttention();
......@@ -74,10 +90,12 @@ public:
~T2TAttention();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
*/
#include <math.h>
#include "T2TDecoder.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
AttDecoder::AttDecoder()
{
attentionsEnde = NULL;
attEndeLayerNorms = NULL;
}
/* de-constructor */
AttDecoder::~AttDecoder()
{
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec);
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
XTensor ende;
XTensor ln;
XTensor fnn;
XTensor res;
/******************/
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization */
x = attLayerNorms[i].Make(res);
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
res = Sum(ende, x);
/* layer normalization */
x = attEndeLayerNorms[i].Make(res);
/*******/
/* fnn */
fnn = fnns[i].Make(x, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
return x;
}
}
......@@ -22,19 +22,33 @@
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#include "T2TEncoder.h"
namespace transformer
{
class AttDecoder : public AttEncoder
{
public:
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;

/* layer normalization for encoder-decoder attention */
T2TLN * attEndeLayerNorms;
public:
/* constructor */
AttDecoder();
/* deconstructor */
~AttDecoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
};
}
......
......@@ -48,7 +48,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......@@ -60,16 +60,18 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
}
/*
make positional embeddings (of size eSize * length)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
......@@ -79,6 +81,17 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
for(int pos = 0; pos < length; pos++){
float * dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)sin(pos/pow(10000.0F, 2.0F*i/(d - 2)));
}
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)cos(pos/pow(10000.0F, 2.0F*i/(d - 2)));
}
/*
for(int k = 0; k < eSize; k++){
if(k % 2 == 0){
int i = k/2;
......@@ -89,6 +102,7 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
}
}
*/
}
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
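The loop above implements the sinusoidal positional embeddings of Vaswani et al. (2017), with all sine channels placed before all cosine channels and, as the code is written, d - 2 rather than d in the exponent:

$$PE(pos, i) = \sin\!\left(\frac{pos}{10000^{2i/(d-2)}}\right), \qquad PE\!\left(pos, i + \tfrac{eSize}{2}\right) = \cos\!\left(\frac{pos}{10000^{2i/(d-2)}}\right), \qquad 0 \le i < \tfrac{eSize}{2}.$$

The commented-out block it replaces interleaved even and odd channels instead.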
......@@ -101,15 +115,18 @@ make the network
*/
XTensor T2TEmbedder::Make(XTensor &input)
{
//CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
bool match = (posEmbedding.order == input.order);
if(match){
......@@ -120,8 +137,10 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
......@@ -130,13 +149,12 @@ XTensor T2TEmbedder::Make(XTensor &input)
DelTensorBuf(posTMP);
}
/* then we make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
}
}
......@@ -63,9 +63,6 @@ public:
the embedding processing by re-loading. */
XTensor posEmbeddingBase;
/* positional embeddings */
XTensor posEmbedding;
public:
/* constructor */
T2TEmbedder();
......@@ -74,7 +71,7 @@ public:
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length);
......
......@@ -31,6 +31,10 @@ namespace transformer
/* constructor */
AttEncoder::AttEncoder()
{
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
}
/* de-constructor */
......@@ -46,18 +50,24 @@ AttEncoder::~AttEncoder()
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttEncoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
......@@ -72,7 +82,7 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
......@@ -82,14 +92,23 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used by the encoder
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input);
//x.Dump(tmpFILE, "embedding: ");
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
XTensor ln;
......@@ -97,24 +116,28 @@ XTensor AttEncoder::Make(XTensor &input)
XTensor res;
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization */
x = attLayerNorms[i].Make(res);
/* fnn */
fnn = fnns[i].Make(x, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, x);
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
......@@ -122,4 +145,18 @@ XTensor AttEncoder::Make(XTensor &input)
return x;
}
/*
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
{
XTensor nothing;
return Make(input, mask, nothing, isTraining);
}
}
......@@ -40,7 +40,7 @@ class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining) = 0;
};
/*
......@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder
{
public:
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining);
};
......@@ -77,6 +77,13 @@ public:
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
......@@ -106,10 +113,15 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
};
......
......@@ -49,7 +49,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......@@ -58,8 +58,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
......@@ -83,12 +84,15 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor &input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MMul(input, w1) + b1);
if(isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
return MMul(t1, w2) + b2;
......
......@@ -59,6 +59,9 @@ public:
/* bias of transformation 2 */
XTensor b2;
/* dropout probability */
DTYPE dropoutP;
public:
......@@ -69,10 +72,10 @@ public:
~T2TFNN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
};
......
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
......@@ -26,12 +27,13 @@
namespace transformer
{
/* constructor */
T2TLN::T2TLN()
{
devID = -1;
mem = NULL;
d = 0;
}
/* de-constructor */
......@@ -46,28 +48,25 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll();
}
/*
make the network
for each layer representation x, we have
y = w * (x - \mu)/\sigma + b
>> input - the input tensor
>> return - layer normalization output
*/
......@@ -90,16 +89,16 @@ XTensor T2TLN::Make(XTensor &input)
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
xn = (x - meanFilled) / standardFilled;
/* result = x' * w + b */
return xn * w + b;
}
}
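In formula form, Make computes, for each position x,

$$y = w \odot \frac{x - \mu}{\sigma} + b,$$

where $\mu$ and $\sigma$ are the mean and standard deviation along the last dimension. Note that, as written, no small $\varepsilon$ is added to the variance before the square root, so an all-constant input would divide by zero.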
......@@ -45,6 +45,9 @@ public:
/* the bias term b */
XTensor b;
/* dimension size of the model */
int d;
public:
/* constructor */
......@@ -54,7 +57,7 @@ public:
~T2TLN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input);
......
......@@ -41,13 +41,13 @@ public:
XMem * mem;
/* the encoder */
AttEncoder * encoder;

/* the decoder */
AttDecoder * decoder;

/* output layer */
T2TOutput * outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
......@@ -55,6 +55,9 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* number of heads in the attention model */
int nhead;
public:
/* constructor */
T2TModel();
......@@ -63,15 +66,30 @@ public:
~T2TModel();
/* initialize the model */
void InitModel(int argc, char ** argv);
/* make the encoding network */
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);

/* make the decoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, XTensor &MaskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* get parameter matrics */
void GetParams(XList &list);
/* dump the parameters */
void Dump(const char * fn);
/* read the parameters */
void Read(const char * fn);
};
}
#endif
......@@ -49,7 +49,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......@@ -66,6 +66,9 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
DTYPE v = 1.0F/(float)sqrt((float)hSize);
w.SetDataRandn(0, v);
}
/*
......@@ -91,6 +94,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
XTensor &x = input;
output = LogSoftmax(MMul(x, w), -1);
//output = Softmax(MMul(x, w), -1);
}
}
......@@ -59,7 +59,7 @@ public:
~T2TOutput();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input);
......
......@@ -37,21 +37,27 @@ namespace transformer
class T2TTrainer
{
public:
/* parameter number */
int argNum;

/* parameter array */
char ** argArray;
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* buffer size */
int bufSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
......@@ -73,8 +79,14 @@ public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* sentence batch size */
int sBatchSize;
......@@ -88,6 +100,51 @@ public:
/* training step number */
int nstep;
/* indicates whether we use adam */
bool useAdam;
/* hyper parameters of adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
/* list of the moment of the parameter matrices */
XList moments;
/* list of the 2nd order moment of the parameter matrices */
XList moments2nd;
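For reference, these fields line up with the standard Adam update; assuming adamDelta plays the role of the usual smoothing epsilon and adamBeta1T/adamBeta2T cache the running powers $\beta_1^t$ and $\beta_2^t$ used for bias correction, the per-parameter update is

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2,$$
$$\theta_t = \theta_{t-1} - \eta \cdot \frac{m_t/(1-\beta_1^t)}{\sqrt{v_t/(1-\beta_2^t)} + \delta}.$$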
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
DTYPE labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we double the </s> symbol for the output of language models */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* indicates whether we use small memory footprint for backward process */
bool isSmallFootprint;
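To make the two batching criteria concrete (illustrative numbers): for three sentences of lengths 5, 7 and 9, the word-number criterion counts 5 + 7 + 9 = 21 words, while the max * sc criterion used when isSmallBatch is set counts 9 * 3 = 27, i.e., the padded size of the batch tensor.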
public:
/* constructor */
T2TTrainer();
......@@ -96,22 +153,74 @@ public:
~T2TTrainer();
/* initialize the trainer */
void Init(int argc, const char ** argv);
void Init(int argc, char ** argv);
/* train the model */
void Train(const char * fn, T2TModel * model);
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* load data to buffer */
int LoadBuf(FILE * file);
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount);
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel * model);
/* do padding on the output */
void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* rescale the output and gold tensors for normalized loss */
void RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* perform label smoothing */
void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
};
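On LabelSmooth above: a common convention, and presumably the one intended here, mixes the one-hot gold distribution with a uniform one, smoothed = (1 - p) * gold + p / V for vocabulary size V and p = labelSmoothingP, so the correct class keeps 1 - p + p/V of the probability mass and every other class receives p/V.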
......
......@@ -27,8 +27,10 @@ namespace transformer
{
FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -45,7 +47,7 @@ void LoadParamString(int argc, const char ** argv, const char * name, char * p,
strcpy(p, defaultP);
}
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP)
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -62,7 +64,7 @@ void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int
*p = defaultP;
}
void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP)
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -79,7 +81,7 @@ void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bo
*p = defaultP;
}
void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP)
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -96,11 +98,13 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
*p = defaultP;
}
void ShowParams(int argc, const char ** argv)
void ShowParams(int argc, char ** argv)
{
fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){
if(argv[i][0] == '-'){
if(argv[i][1] == 0)
continue;
if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
if(i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
......
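The test added above keeps negative numeric arguments from being printed as option names. A hedged re-statement of the condition (the helper name is illustrative):

```cpp
// an argument is shown as an option only if it starts with '-', is not a
// bare "-", and the next character is not a digit 1-9 (so "-5" or "-1e-8"
// is read as a numeric value rather than an option name)
bool LooksLikeOption(const char * arg)
{
    if (arg[0] != '-' || arg[1] == 0)
        return false;
    return arg[1] < '1' || arg[1] > '9';
}
```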
......@@ -30,13 +30,16 @@ namespace transformer
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP);
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
/* show arguments */
void ShowParams(int argc, const char ** argv);
void ShowParams(int argc, char ** argv);
extern int llnum;
extern FILE * tf;
}
......
......@@ -19,11 +19,15 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include <time.h>
#include "Transformer.h"
#include "T2TModel.h"
#include "T2TUtility.h"
#include "T2TTrainer.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
namespace transformer
{
......@@ -32,30 +36,67 @@ int TransformerMain(int argc, const char ** argv)
{
if(argc == 0)
return 1;
fprintf(stderr, "%e\n", log(1e-8F));
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
tmpFILE = fopen("tmp.txt", "wb");
ShowParams(argc, argv);
ShowParams(argc, args);
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
LoadParamString(argc, args, "train", trainFN, "");
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
LoadParamString(argc, argv, "train", trainFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer;
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
/* load the model if neccessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
model.InitModel(argc, argv);
T2TTrainer tester;
tester.Init(argc, args);
if(strcmp(trainFN, "")){
T2TTrainer trainer;
trainer.Init(argc, argv);
trainer.Train(trainFN, &model);
}
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, ""))
tester.Test(testFN, outputFN, &model);
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
for(int i = 0; i < argc; i++)
delete[] args[i];
delete[] args;
fclose(tmpFILE);
return 0;
}
}
\ No newline at end of file
}
......@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is the entrance of the low-level tensor library : NiuTrans.Tensor
......@@ -39,13 +39,24 @@ using namespace nts;
void SmallTest();
void TransposeTest();
void LittleTest();
void T2TTest();
void T2TTest2();
void PowerTest();
int main( int argc, const char ** argv )
{
//PowerTest();
//LittleTest();
//T2TTest();
//T2TTest2();
//return 0;
//_CrtSetBreakAlloc(123);
/* a tiny test */
SmallTest();
//SmallTest();
//_CrtDumpMemoryLeaks();
//return 0;
......@@ -63,6 +74,34 @@ int main( int argc, const char ** argv )
return 0;
}
void myRead(XTensor * tensor, const char * filename, const char * label)
{
FILE * file = fopen(filename, "rb");
if(file == NULL){
fprintf(stderr, "cannot open %s\n", filename);
return;
}
tensor->Read(file, label);
fclose(file);
}
void myDump(XTensor * tensor, const char * filename, const char * label)
{
FILE * file = fopen(filename, "wb");
if(file == NULL){
fprintf(stderr, "cannot open %s\n", filename);
return;
}
tensor->Dump(file, label);
fclose(file);
}
void PowerTest()
{
XTensor input;
XTensor output;
InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
myRead(&input, "1.txt", "");
_Power(&input, &output, 2);
output.Dump(stderr, "", 200);
}
void SmallTest()
{
XTensor a;
......@@ -87,7 +126,7 @@ void SmallTest()
d = a + b + c.Lin(0.5F);
XLink::CheckNetwork(&d);
XLink::ShowNetwork(stderr, &d);
//XLink::ShowNetwork(stderr, &d);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
......@@ -126,3 +165,128 @@ void TransposeTest()
delete[] data;
}
void LittleTest()
{
int a = 5000;
int b = 100000;
int c = a*b;
printf("%d\n", c);
exit(1);
}
void T2TTest()
{
XTensor * input;
XTensor * weight;
XTensor * output;
XTensor * gold;
XTensor * dedy;
XTensor * dedx;
XTensor * dedxTmp;
XTensor * dedw;
XTensor * padding;
DTYPE loss;
int * dimSize = new int[2];
dimSize[0] = 256;
dimSize[1] = 10001;
int * dimSize2 = new int[3];
dimSize2[0] = 2;
dimSize2[1] = 31;
dimSize2[2] = 256;
int * dimSize3 = new int[3];
dimSize3[0] = 2;
dimSize3[1] = 31;
dimSize3[2] = 10001;
int * dimSize4 = new int[2];
dimSize4[0] = 2;
dimSize4[1] = 31;
input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//weight = NewTensor(2, dimSize);
//dedw = NewTensor(2, dimSize);
//input = NewTensor(3, dimSize2);
//gold = NewTensor(3, dimSize3);
//output = NewTensor(3, dimSize3);
//dedy = NewTensor(3, dimSize3);
//dedx = NewTensor(3, dimSize3);
//dedxTmp = NewTensor(3, dimSize3);
//padding = NewTensor(2, dimSize4);
myRead(input, "x.txt", "x");
myRead(weight, "w.txt", "w");
myRead(gold, "gold.txt", "gold");
myRead(padding, "padding.txt", "padding");
XTensor inter;
inter = MMul(*input, *weight);
_Softmax(&inter, output, 2);
//_LogMe(output);
loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
printf("loss: %f\n", loss);
_CrossEntropyBackward(dedy, output, gold, NULL);
//_CrossEntropyBackward(dedy, output, gold, NULL, padding);
myDump(dedy, "dedy.txt", "dedy");
_SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
_Sub(output, gold, dedxTmp);
myDump(dedx, "dedx.txt", "dedx");
dedx->Dump(stderr, "dedx", 200);
dedxTmp->Dump(stderr, "dedxTmp", 200);
input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
_MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
myDump(dedw, "dedw.txt", "dedw");
}
void T2TTest2()
{
int dimSize[3];
dimSize[0] = 161;
dimSize[1] = 47;
dimSize[2] = 10001;
XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//myRead(probs, "probs.txt", " ");
_SetDataFixedFloat(probs, 1.0F);
probs->Reshape(1, probs->unitNum);
DTYPE sum = _ReduceSumAll(probs);
printf("%e\n", sum);
//XTensor tmp;
//tmp = IsNonZero(*probs);
//DTYPE nonZeroNum = ReduceSumAll(tmp);
//printf("%f\n", nonZeroNum);
//
//DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//printf("%e\n", gpu);
}
......@@ -50,8 +50,8 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#ifdef USE_CUDA
void CudaConvertDataType(int devID,
......
......@@ -41,6 +41,7 @@ XDevManager GDevs;
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
#ifdef USE_CUDA
......@@ -126,6 +127,7 @@ void XDevice::Init(int myDevID)
#endif
}
isInitialized = true;
}
/* clear it */
......@@ -152,11 +154,14 @@ void XDevice::Clear()
/* get cublas handle */
cublasHandle_t * XDevice::GetCublasHandle()
{
if (!isInitialized)
Init(devID);
if(!isHandleReady){
MUTEX_LOCK(cublasMutex);
int devIDBackup = 0;
ProtectCudaDev(devID, devIDBackup);
CheckNTErrors(cublasCreate(&cublasHandle) == cudaSuccess,
CheckNTErrors(cublasCreate(&cublasHandle) == CUBLAS_STATUS_SUCCESS,
"Cannot create the cublas handle.");
isHandleReady = true;
BacktoCudaDev(devID, devIDBackup);
......@@ -169,6 +174,9 @@ cublasHandle_t * XDevice::GetCublasHandle()
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
......@@ -279,33 +287,13 @@ void XDevManager::Init()
exit(1);
}
cudaDeviceProp prop[64];
for(int i = 0; i < GPUCount; i++){
GPUs[i].Init(i);
cudaGetDeviceProperties(&prop[i], i);
GPUs[i].devID = i;
//GPUs[i].Init(i);
}
#ifdef USA_CUDA_P2P
for(int i = 0; i < GPUCount; i++){
cudaSetDevice(i);
for(int j = 0; j < GPUCount; j++){
if(i == j)
continue;
int access;
cudaDeviceCanAccessPeer(&access, i, j);
bool hasUVA = (prop[i].unifiedAddressing && prop[j].unifiedAddressing);
fprintf(stderr, "device %d -> device %d access:%d UVA:%d\n", i, j, access, hasUVA ? 1 : 0);
if(access != 0){
CheckNTErrors((hasUVA == true), "at least one GPU does not support UVA.")
CheckNTErrors((cudaDeviceEnablePeerAccess(j, 0)==cudaSuccess), "cannot set cuda p2p mode!");
}
}
}
#endif
#endif
nGPU = GPUCount;
}
......@@ -351,6 +339,9 @@ into blocks
*/
int XDevManager::GetCudaThread(const int devID, const int n, int * gridSize, int * blockSize)
{
if (!GPUs[devID].isInitialized)
GPUs[devID].Init(devID);
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......@@ -402,6 +393,9 @@ into blocks
*/
int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int nLimit, int * gridSize, int * blockSize)
{
if (!GPUs[devID].isInitialized)
GPUs[devID].Init(devID);
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......@@ -452,7 +446,7 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
CheckNTErrors((!(b & (b-1))), "Block size (x-axis) must be in 2^x");
CheckNTErrors((gXSize <= GPUs[devID].GPUMaxGridSize[0] &&
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
gYSize <= GPUs[devID].GPUMaxGridSize[1]), "A too large grid size.");
blockSize[0] = bXSize;
blockSize[1] = bYSize;
......@@ -530,7 +524,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
for(int i = 0; i < terms->count; i++){
int devC, devID;
char dev[32];
char dev[32] = "";
char * curDevInfo = (char*)terms->GetItem(i);
if(sscanf(curDevInfo, "%d:%s", &devC, dev) < 2){
......
......@@ -67,6 +67,9 @@ public:
/* warp size of an (Nvidia) GPU */
int GPUWarpSize;
/* indicates whether the device class has been initialized */
bool isInitialized;
/*
max grid size (or number of blocks) of an (Nvidia) GPU
NOTE: the grid size is along three dimensions (x, y, z)
......
......@@ -43,7 +43,7 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#define _XINLINE_ inline
#define _XINLINE_
//#define DOUBELPRICSION
......@@ -55,6 +55,9 @@ namespace nts {
#define DTYPE_MIN (DTYPE)-3.40E+38
#endif
#define LOGPROB_MIN (DTYPE)-2E+1
#define GRAD_MAX (DTYPE)1E+5
#if WIN32
#define DELIMITER '\\'
#else
......@@ -62,10 +65,10 @@ namespace nts {
#endif
#ifndef MIN
#define MIN(a,b) ((a < b) ? a : b)
#define MIN(a,b) ((a) < (b) ? a : b)
#endif
#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#define MAX(a,b) ((a) > (b) ? a : b)
#endif
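The added parentheses matter as soon as a macro argument is an expression with lower precedence than '<'. A minimal illustration (hypothetical values):

```cpp
#include <cstdio>

#define MIN_OLD(a,b) ((a < b) ? a : b)     // before the fix
#define MIN_NEW(a,b) ((a) < (b) ? a : b)   // after the fix

int main()
{
    int a = 2, b = 5;
    // MIN_OLD(a & 3, b) expands to ((a & 3 < b) ? a & 3 : b); since '<'
    // binds tighter than '&', the test becomes a & (3 < b) = 2 & 1 = 0,
    // so the macro wrongly returns b.
    printf("%d\n", MIN_OLD(a & 3, b));   // prints 5
    printf("%d\n", MIN_NEW(a & 3, b));   // prints 2
    return 0;
}
```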
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
......@@ -147,6 +150,8 @@ extern bool useCUDA;
#define XPRINT4(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4);FFLUSH(FILEH);}}
#define XPRINT5(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5);FFLUSH(FILEH);}}
#define XPRINT6(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6);FFLUSH(FILEH);}}
#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
#define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}
#define B2I(V) V==0?false:true
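These macros gate output on the global verboseLevel declared alongside them; a minimal usage sketch (the variables are illustrative):

```cpp
// printed only when the requested level (0 here) <= verboseLevel
XPRINT4(0, stderr, "step %d: loss=%.3f ppl=%.2f time=%.1fs\n",
        step, loss, ppl, elapsed);
```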
......
......@@ -263,6 +263,18 @@ int XLink::GetParamInt(int i)
char * p = (char*)params + i * paramSize;
return *(int*)p;
}
/*
get a parameter in pointer
>> i - id of the parameter
<< return - the parameter in pointer
*/
void * XLink::GetParamPointer(int i)
{
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(int **)p;
}
/*
get a parameter in MATRIX_TRANS_TYPE
......@@ -401,8 +413,7 @@ add a boolean parameter
*/
void XLink::AddParamToHeadBool(XTensor * h, bool param)
{
if(h != NULL)
return;
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(bool));
}
......@@ -413,8 +424,7 @@ add a pointer parameter
*/
void XLink::AddParamToHeadPointer(XTensor * h, void * param)
{
if(h != NULL)
return;
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(param));
}
......@@ -583,15 +593,15 @@ void XLink::CheckNetwork(XTensor * root)
}
/*
show the network encoded in a root node (tensor)
show a node
>> file - file to dump information
>> root - pointer to the root node
>> root - pointer to the node
*/
void XLink::ShowNetwork(FILE * file, XTensor * root)
void XLink::ShowNode(FILE * file, XTensor * node)
{
fprintf(file, "node %d - ", root->id);
fprintf(file, "node %d - ", node->id);
XLink &income = root->income;
XLink &income = node->income;
if(income.head == NULL){
fprintf(file, "income[%d]: null ", income.tailNum);
}
......@@ -607,7 +617,7 @@ void XLink::ShowNetwork(FILE * file, XTensor * root)
}
fprintf(stderr, ", ");
XLink &outgo = root->outgo;
XLink &outgo = node->outgo;
if(outgo.head == NULL || outgo.tailNum == 0){
fprintf(file, "outgo[%d]: null ", outgo.tailNum);
}
......@@ -623,11 +633,6 @@ void XLink::ShowNetwork(FILE * file, XTensor * root)
}
fprintf(stderr, "\n");
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
ShowNetwork(file, child);
}
}
} // namespace nts(NiuTrans.Tensor)
......
......@@ -127,6 +127,9 @@ struct XLink
/* get a parameter in integer */
int GetParamInt(int i);
/* get a parameter in pointer */
void * GetParamPointer(int i);
/* get a parameter in MATRIX_TRANS_TYPE */
MATRIX_TRANS_TYPE GetParamTrans(int i);
......@@ -175,9 +178,9 @@ struct XLink
static
void CheckNetwork(XTensor * root);
/* show the network encoded in a root node (tensor) */
/* show a node */
static
void ShowNetwork(FILE * file, XTensor * root);
void ShowNode(FILE * file, XTensor * node);
};
} // namespace nts(NiuTrans.Tensor)
......
......@@ -53,28 +53,14 @@ typedef long long INT_64;
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
/* memory block */
struct XMemBlock
{
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
};
/*
mode of running a memory pool
- UNI_FREE: free all memory space at once when the pool is no longer in use
- FREE_ON_THE_FLY: allocate and free memory pieces on demand, in the ordinary "malloc"/"free" fashion
*/
enum MEMPOOL_MODE {UNI_FREE, FREE_ON_THE_FLY};
struct MPieceNode;
/* header of a memory piece (FREE_ON_THE_FLY) */
struct MHeader
......@@ -96,6 +82,9 @@ struct MHeader
/* id of the memory block */
int blockID;
/* pointer to the index node */
MPieceNode * indexNode;
};
/* index of memory piece */
......@@ -112,6 +101,31 @@ struct MPieceNode
/* pointer to the head of a memory piece */
void * p;
/* pointer to the head of the memory that is returned to the user */
void * pReal;
/* header of the memory piece */
MHeader head;
};
/* memory block */
struct XMemBlock
{
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
/* first head of the block */
MHeader * head;
};
/*
......@@ -138,6 +152,9 @@ public:
/* mode of running the memory pool */
MEMPOOL_MODE mode;
/* signature */
MTYPE signature;
/* indicates whether the memory allocation is static */
bool isStatic;
......@@ -194,13 +211,16 @@ public:
public:
/* index of the free memory pieces */
MPieceNode * freeMemIndex;
MPieceNode * memIndex;
/* for double buffering */
MPieceNode * memIndex2;
/* maximum number of index nodes */
INT_64 indexNodeNum;
INT_64 nodeNum;
/* count of the used nodes */
INT_64 indexNodeNumUsed;
INT_64 nodeNumUsed;
/* minimal size allocation for each index entry */
MTYPE * minSizeIndex;
......@@ -211,6 +231,9 @@ public:
/* index offset */
int indexOffset;
/* indicates whether we merge free memory pieces on the fly */
bool mergeFreeOTF;
public:
/* constructor */
......@@ -235,6 +258,9 @@ public:
/* free a piece of memory */
void Free(int myDevID, void * mem);
/* get signature */
MTYPE GetSignature();
/* use string as the name of the memory pool */
void SetName(const char * myName);
......@@ -282,10 +308,10 @@ public:
void * AllocBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
/* release a piece of memory */
void Release(void * p);
void Release(void * p, MTYPE size, MTYPE code);
/* release a piece of memory */
void Release(int myDevID, void * p);
void Release(int myDevID, void * p, MTYPE size);
/* release a piece of memory in the buffer */
void ReleaseBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
......@@ -302,14 +328,20 @@ public:
/* find the index entry for allocation query */
int FindIndexEntry(MTYPE mySize);
/* remove an index node */
/* remove an index node for available memory pieces */
void RemoveIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node */
void AddIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for available memory pieces */
void AddFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* remove an index node for memory pieces in use */
void RemoveAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for memory pieces in use */
void AddAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* release a piece of memory as "free" */
void ReleaseStandard(int myDevID, void * p);
void ReleaseStandard(int myDevID, void * p, MTYPE size);
/* rebuild the index to merge small memory fragments and free blocks that are no longer used */
void RebuildIndex();
......@@ -379,6 +411,9 @@ public:
extern XMem * GMem;
extern int testxmemid;
extern void * recordp;
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -29,10 +29,22 @@ const char * GetOPName(int type)
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if (type == MATH_CEIL)
return "M_CEIL";
else if (type == MATH_EXP)
return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_ISNONZERO)
return "M_ISNONZERO";
else if (type == MATH_ISZERO)
return "M_ISZERO";
else if (type == MATH_LOG)
return "M_LOG";
else if (type == MATH_SQRT)
return "M_SQRT";
else if (type == MATH_SQUARE)
return "M_SQUARE";
else if (type == MATH_SIN)
return "M_SIN";
else if (type == MATH_COS)
......@@ -45,12 +57,16 @@ const char * GetOPName(int type)
return "M_CLIP";
else if (type == MATH_DIV)
return "M_DIV";
else if (type == MATH_DIVDIM)
return "M_DIVDIM";
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if (type == MATH_MULTIPLYDIM)
return "M_MULTIPLYDIM";
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if (type == MATH_NORMALIZE)
......@@ -61,10 +77,12 @@ const char * GetOPName(int type)
return "M_SCALEANDSHIFT";
else if (type == MATH_SIGN)
return "M_SIGN";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_SUB)
return "M_SUB";
else if (type == MATH_SUBDIM)
return "M_SUBDIM";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_SUMDIM)
return "M_SUMDIM";
else if (type == REDUCE_REDUCEMAX)
......@@ -79,12 +97,20 @@ const char * GetOPName(int type)
return "R_REDUCEVARIANCE";
}
else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT)
if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE";
else if (type == GETANDSET_INDEXTOONEHOT)
return "G_INDEXTOONEHOT";
else if (type == GETANDSET_ONEHOTTOINDEX)
return "G_ONEHOTTOINDEX";
else if (type == GETANDSET_SELECT)
return "G_SELECT";
else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if (type == MOVEMENT_GATHER)
return "M_GATHER";
else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if (type == SHAPE_MERGE)
......@@ -93,10 +119,14 @@ const char * GetOPName(int type)
return "S_MERGE_LIST";
else if (type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if (type == SHAPE_RESHAPE)
return "S_RESHAPE";
else if (type == SHAPE_SPLIT)
return "S_SPLIT";
else if (type == SHAPE_SPLIT_LIST)
return "S_SPLIT_LIST";
else if (type == SHAPE_SQUEEZE)
return "S_SQUEEZE";
else if (type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if (type == SHAPE_UNSQUEEZE)
......@@ -107,7 +137,9 @@ const char * GetOPName(int type)
return "S_TOPK";
}
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
if (type == FUNC_DROPOUT)
return "F_DROPOUT";
else if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
......
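The returned strings mirror the operation ids declared in XName.h, e.g.:

```cpp
const char * opName = GetOPName(MATH_DIVDIM);   // yields "M_DIVDIM"
```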
......@@ -32,26 +32,35 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_BASE 0x00001000
#define MATH_ABSOLUTE MATH_BASE + 1
#define MATH_EXP MATH_ABSOLUTE + 1
#define MATH_LOG MATH_EXP + 1
#define MATH_SIN MATH_LOG + 1
#define MATH_CEIL MATH_ABSOLUTE + 1
#define MATH_EXP MATH_CEIL + 1
#define MATH_FLOOR MATH_EXP + 1
#define MATH_ISNONZERO MATH_FLOOR + 1
#define MATH_ISZERO MATH_ISNONZERO + 1
#define MATH_LOG MATH_ISZERO + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_SQUARE MATH_SQRT + 1
#define MATH_SIN MATH_SQUARE + 1
#define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1
#define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_MATRIXMUL MATH_DIV + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_NEGATE MATH_MULTIPLY + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1
#define MATH_SUM MATH_SIGN + 1
#define MATH_SUB MATH_SUM + 1
#define MATH_SUMDIM MATH_SUB + 1
#define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define REDUCE MATH_SUMDIM + 1
#define REDUCE_REDUCEMAX REDUCE + 1
......@@ -63,20 +72,26 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* data and shape related operations */
#define DATA_BASE MATH_BASE * 2
#define GETANDSET DATA_BASE + 1
#define GETANDSET_SELECT GETANDSET + 1
#define GETANDSET_CONVERTDATATYPE GETANDSET + 1
#define GETANDSET_INDEXTOONEHOT GETANDSET_CONVERTDATATYPE + 1
#define GETANDSET_ONEHOTTOINDEX GETANDSET_INDEXTOONEHOT + 1
#define GETANDSET_SELECT GETANDSET_ONEHOTTOINDEX + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_GATHER + 1
#define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1
#define SHAPE_PERMUTE SHAPE_MERGE_LIST + 1
#define SHAPE_SPLIT SHAPE_PERMUTE + 1
#define SHAPE_RESHAPE SHAPE_PERMUTE + 1
#define SHAPE_SPLIT SHAPE_RESHAPE + 1
#define SHAPE_SPLIT_LIST SHAPE_SPLIT + 1
#define SHAPE_TRANSPOSE SHAPE_SPLIT_LIST + 1
#define SHAPE_SQUEEZE SHAPE_SPLIT_LIST + 1
#define SHAPE_TRANSPOSE SHAPE_SQUEEZE + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
#define SORT SHAPE_UNSQUEEZE + 1
......@@ -85,7 +100,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* activation functions */
#define FUNCTION_BASE DATA_BASE * 2
#define FUNC_HARDTANH FUNCTION_BASE + 1
#define FUNC_DROPOUT FUNCTION_BASE + 1
#define FUNC_HARDTANH FUNC_DROPOUT + 1
#define FUNC_IDENTITY FUNC_HARDTANH + 1
#define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1
#define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1
......
......@@ -49,9 +49,10 @@ struct XLink;
#define USE_BATCHED_STRIDED_MAT_MUL
#define MIN_TENSOR_SPLIT_NUM 0
#define MIN_TENSOR_SPLIT_LIST_NUM 1024
#define MIN_TENSOR_MERGE_NUM 0
#define MIN_TENSOR_MERGE_LIST_NUM 1024
#define MIN_TENSOR_CAT_NUM 8
/* computation flags */
#define UNSAFE_BUT_FAST_MEM
#define FAST_MATRIX
......@@ -66,6 +67,9 @@ public:
/* memory pool */
XMem * mem;
/* signature of the memory pool */
MTYPE signature;
/* data array to keep the elements */
void * data;
......@@ -143,6 +147,9 @@ public:
/* indicates whether the tensor keeps the gradient when used as model parameters */
bool isGrad;
/* indicates whether the tensor is used as parameters (or variables) */
bool isVar;
/* mark for traversing the graph */
unsigned int visitMark;
......@@ -199,15 +206,27 @@ public:
/* overloading of the plus-sign */
XTensor operator+ (const XTensor &tensor);
/* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift);
/* overloading of the multiply-sign */
XTensor operator* (const XTensor &tensor);
/* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale);
/* overloading of the minus-sign */
XTensor operator- (const XTensor &tensor);
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift);
/* overloading of the division-sign */
XTensor operator/ (const XTensor &tensor);
/* overloading of the division-sign */
XTensor operator/ (const DTYPE scale);
/* linear transformation */
XTensor Lin(DTYPE scale, DTYPE shift = 0);
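With the new scalar overloads, linear expressions can be written directly on tensors, in the same style as the Lin() call used elsewhere in this commit; a minimal sketch:

```cpp
XTensor x, y, z;
InitTensor2D(&x, 2, 2);
x.SetDataRand(-1.0F, 1.0F);

y = x * 0.5F + 1.0F;    // scale then shift via the scalar overloads
z = (x - 1.0F) / 2.0F;  // shift then scale
```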
......@@ -218,13 +237,13 @@ public:
/* judge whether the three matrices are in the same type and size */
static
bool IsSameShaped(XTensor * a, XTensor * b, XTensor * c);
bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* set the size of each dimension */
void SetDim(int * myDimSize);
/* get the size of a given dimension */
int GetDim(const int dim);
int GetDim(const int dim) const;
/* reshape the tensor */
void Reshape(const int order, const int * myDimSize);
......@@ -244,18 +263,30 @@ public:
/* get unit size in terms of "dataType" */
int GetUnitSize(TENSOR_DATA_TYPE myDataType);
/* get offset (2D) */
MTYPE GetOffset2D(int row, int col);
/* get offset (3D) */
MTYPE GetOffset3D(int d0, int d1, int d2);
/* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL);
/* set the tensor with an data array */
void SetData(const void * d, int num, int beg = 0);
/* set the tensor items by a uniform distribution */
void SetDataRand(DTYPE lower, DTYPE upper);
/* set tensor items by a uniform distribution */
void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);
/* set the tensor items by a normal distribution */
/* set tensor items by a normal distribution */
void SetDataRandn(DTYPE mean, DTYPE standardDeviation);
/* set tensor items with an array of offsets */
void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
/* set tensor items with an array of values */
void SetDataBatchedWithValues(MTYPE * offsets, void * values, int num);
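Read together with GetOffset2D/GetOffset3D above, the batched setters appear intended for scattered writes at precomputed flat offsets; a hedged usage sketch:

```cpp
XTensor t;
InitTensor2D(&t, 4, 5);
t.SetZeroAll();

MTYPE offsets[2] = { t.GetOffset2D(0, 1), t.GetOffset2D(2, 3) };
t.SetDataBatched(offsets, 1.0F, 2);   // write 1.0 at the two addressed cells
```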
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);
......@@ -309,6 +340,18 @@ public:
/* set the value of a cell in a 3d tensor */
bool Set3D(DTYPE value, int d0, int d1, int d2);
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i);
/* set the integer value of a cell in a 2d tensor */
bool Set2DInt(int value, int ni, int mi);
/* set the integer value of a cell in a 3d tensor */
bool Set3DInt(int value, int d0, int d1, int d2);
/* increase the value of a cell in a 2d */
bool Add2D(DTYPE value, int ni, int mi);
......@@ -317,21 +360,19 @@ public:
int GetNonzeroSize();
/* set the tensor as "temporary" */
void SetTMP(bool myIsTmp = true);
void SetTMPFlag(bool myIsTmp = true);
/* set the tensor as "keep-gradient" */
void SetGrad(bool myIsGrad = true);
void SetGradFlag(bool myIsGrad = true);
/* set the tensor as "variable" */
void SetVarFlag(bool myIsVar = true);
/* resize a matrix with a specified matrix size */
bool Resize(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
const float myDenseRatio = 1.0F);
/* resize a matrix with a specified matrix size (with no data filled) */
bool ResizeWithNoData(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
const float myDenseRatio = 1.0F);
/* resize a matrix by another one */
bool Resize(const XTensor * myTensor);
......@@ -339,11 +380,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* read data from a file */
void Read(FILE * file, const char * label = NULL);
......@@ -435,7 +476,7 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
const int myDevID = -1, XMem * myMem = NULL);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(XTensor * a, bool isFilledData = true);
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
/* free the data space of a given tensor */
void DelTensor(XTensor * tensor);
......
......@@ -491,6 +491,21 @@ bool SetToDevice(int devID, void * p, DTYPE value)
return true;
}
/* assign an integer number to a variable that is kept on a specified device */
bool SetToDeviceInt(int devID, void * p, int value)
{
if(p == NULL)
return false;
if(devID < 0)
*(int*)p = value;
else{
XMemCopy(p, devID, &value, -1, sizeof(int));
}
return true;
}
/* get the next power of 2 */
unsigned int GetNextPower2(unsigned int n)
{
......
......@@ -50,6 +50,7 @@ extern void XMemFreeOnDev(int devID, void * p);
extern DTYPE ToCPU(int devID, void * value);
extern int ToCPUInt(int devID, void * value);
extern bool SetToDevice(int devID, void * p, DTYPE value);
extern bool SetToDeviceInt(int devID, void * p, int value);
extern unsigned int GetNextPower2(unsigned int n);
extern void XSleep(int sleepTime);
extern double GetClock();
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
/* this is a header to include all functions in the "core" workspace */
......@@ -27,15 +27,18 @@
#include "../XTensor.h"
#include "arithmetic/Div.h"
#include "arithmetic/DivDim.h"
#include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h"
#include "arithmetic/Sign.h"
#include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
......@@ -43,16 +46,17 @@
#include "arithmetic/XTensorBLAS.h"
#include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h"
#include "getandset/SetData.h"
#include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h"
#include "math/Unary.h"
#include "movement/CopyBlocks.h"
#include "movement/CopyBlocksInGrid.h"
#include "movement/CopyBlocksOnSite.h"
......@@ -60,11 +64,14 @@
#include "movement/CopyIndexed.h"
#include "movement/CopyInGrid.h"
#include "movement/CopyValues.h"
#include "movement/Gather.h"
#include "movement/Spread.h"
#include "reduce/ReduceMax.h"
#include "reduce/ReduceMean.h"
#include "reduce/ReduceStandardVariance.h"
#include "reduce/ReduceSum.h"
#include "reduce/ReduceSumAll.h"
#include "reduce/ReduceSumSquared.h"
#include "reduce/ReduceVariance.h"
......@@ -74,8 +81,10 @@
#include "shape/MakeSplitBlockIndex.h"
#include "shape/Merge.h"
#include "shape/MergeBlockLists.h"
#include "shape/Reshape.h"
#include "shape/Permute.h"
#include "shape/Split.h"
#include "shape/Squeeze.h"
#include "shape/Transpose.h"
#include "shape/Unsqueeze.h"
......@@ -85,4 +94,4 @@
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#endif // __CHEADER_H__
\ No newline at end of file
#endif // __CHEADER_H__
......@@ -23,6 +23,7 @@
#include "../../XName.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -137,8 +138,37 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Div(a, b, a, alpha, leadingDim);
}
/*
return the dimension index if the division can be performed as DivDim (see DivDim.h for details)
>> a - a tensor
>> b - another tensor for division
*/
int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
/*
element-wise division of two tensors (return a XTensor structure)
element-wise division of two tensors (return an XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)/b(i)
......@@ -146,23 +176,41 @@ where i is the index of the item
>> a - tensor a
>> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
*/
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim)
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
XTensor c(&a);
c.SetTMP();
/* call _Multiply function */
_Div(&a, &b, &c, 0, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHeadInt(&c, leadingDim);
c.SetTMPFlag();
int n = GetDivDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, alpha, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
else if(n >= 0 && n < a.order){
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
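A usage sketch of the new dispatch path (shapes are illustrative):

```cpp
XTensor a, b, c;
InitTensor2D(&a, 4, 8);
InitTensor1D(&b, 8);
a.SetDataRand(1.0F, 2.0F);
b.SetDataRand(1.0F, 2.0F);

// GetDivDimIndex(a, b) returns 1 here (b matches dimension 1 of a), so
// Div takes the _DivDim branch and broadcasts b over the rows of a.
c = Div(a, b);
```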
......
......@@ -31,7 +31,7 @@ element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (do it on site)
......@@ -39,15 +39,15 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the element
*/
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (return a XTensor structure)
element-wise division of two tensors (return an XTensor structure)
make a new tensor to keep the result and return it
c(i) = a(i)/b(i)
where i is the index of the element
*/
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim = 0);
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put result. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in addition!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaDivDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++){
if(alpha == 0.0F)
cp[k] = ap[k] / bv;
else
cp[k] = ap[k] / bv + alpha * cp[k];
}
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
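For orientation, the loop decomposition above works out as follows on a concrete shape (illustrative numbers):

```cpp
// a of shape (2, 3, 4) with n = 1, so b holds 3 elements:
//   stride    = 4   (product of the dimensions after n)
//   blockSize = 3   (a->dimSize[n])
//   blockNum  = 2   (product of the dimensions before n)
// The stride > 1 branch then walks 2 * 3 = 6 chunks of 4 contiguous
// elements, dividing each chunk by its matching element of b.
```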
/*
tensor division of two tensors (do it on site)
keep the result in the input tensor and return nothing
a = a/b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_DivDim(a, b, a, n, alpha);
}
/*
tensor division of two tensors (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor division
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "DivDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division of a tensor and a row vector
c = a / b + alpha * c
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if(col >= colNum || row >= rowNum)
return;
if(threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.x];
}
/*
tensor division of a tensor and a column vector
c = a / b + alpha * c
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if(row >= rowNum || block >= blockNum)
return;
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.y];
}
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a / b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in division!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE){
if(stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelDivWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
}
else if(stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
KernelDivWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_CUH__
#define __DIVDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting (cuda version)
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_H__
#define __DIVDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we keep the result in the input tensor a and return nothing
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we make a new tensor c to keep the result and return it
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_H__
......@@ -203,7 +203,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
......@@ -249,7 +249,7 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
......@@ -299,7 +299,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
......
......@@ -44,7 +44,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor c to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
......
......@@ -314,7 +314,7 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMulBatched function */
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
......@@ -370,7 +370,7 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMulBatched function */
_MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
......
......@@ -62,7 +62,7 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
......@@ -74,7 +74,7 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication of the two tensors (return a XTensor structure) c = a * b * alpha
matrix multiplication of the two tensors (return an XTensor structure) c = a * b * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
......
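A corresponding sketch for the batched variant. This is hypothetical and hedged: it assumes the last two dimensions are the matrix dimensions and the leading ones index the batch; check the full header for the exact convention in this version.

void MatrixMulBatchedExample()
{
    /* a: 8 x 2 x 3, b: 8 x 3 x 4 -> c: 8 x 2 x 4,
       one 2 x 3 by 3 x 4 matrix product per batch index */
    XTensor a, b;
    InitTensor3D(&a, 8, 2, 3);
    InitTensor3D(&b, 8, 3, 4);
    _SetDataRand(&a, -1.0F, 1.0F);
    _SetDataRand(&b, -1.0F, 1.0F);

    XTensor c = MatrixMulBatched(a, b);
}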
......@@ -23,6 +23,7 @@
#include "../../XName.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -65,8 +66,8 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
"Unmatched tensors!");
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
......@@ -138,8 +139,37 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Multiply(a, b, a, alpha, leadingDim);
}
/*
return the dimension index if the multiplication can be performed as MultiplyDim (see MultiplyDim.h for details)
>> a - a tensor
>> b - another tensor for multiplication
<< return - the matched dimension index, or -1 if the operation cannot be performed as MultiplyDim
*/
int GetMultiplyDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
/* the matched dimension of a, counted from the last dimension;
   (a.order - b.order + i) is only correct when i indexes b's
   dimensions in forward order, which the loop above does not */
hitDim = a.order - 1 - i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
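For instance (a hypothetical check, not part of the commit, assuming GetMultiplyDimIndex is visible at the call site): with a of shape 8 x 4 and a 1-D b of length 4, the loop matches b against the last dimension of a and returns 1.

void GetMultiplyDimIndexExample()
{
    XTensor a, b;
    InitTensor2D(&a, 8, 4);
    InitTensor1D(&b, 4);

    /* b matches dimension 1 of a, so Multiply can dispatch to _MultiplyDim */
    int n = GetMultiplyDimIndex(a, b);   /* n == 1 */
}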
/*
element-wise product of two tensors (return a XTensor structure)
element-wise product of two tensors (return an XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)*b(i)
......@@ -150,20 +180,38 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
*/
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
int n = GetMultiplyDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "Unmatched tensors in multiplication!");
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHeadInt(&c, leadingDim);
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
else if(n >= 0 && n < a.order){
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
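Taken together, the dispatch above lets one call site cover both the element-wise and the broadcast case. A minimal sketch with hypothetical values (assuming SetDataFixed as used elsewhere in this project):

void MultiplyExample()
{
    XTensor a, b1, b2;
    InitTensor2D(&a, 2, 3);
    InitTensor2D(&b1, 2, 3);   /* same shape as a: element-wise path */
    InitTensor1D(&b2, 3);      /* matches dimension 1 of a: MultiplyDim path */
    SetDataFixed(a, 2.0F);
    SetDataFixed(b1, 3.0F);
    SetDataFixed(b2, 4.0F);

    XTensor c1 = Multiply(a, b1);   /* c1(i, j) = a(i, j) * b1(i, j) = 6 */
    XTensor c2 = Multiply(a, b2);   /* c2(i, j) = a(i, j) * b2(j) = 8 */
}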
......
......@@ -77,7 +77,7 @@ where |a_lead| means the size of the leading dimension of a
*/
template<int nonZeroAlpha> __global__
void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha,
int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
{
__shared__ DTYPE* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......@@ -171,14 +171,12 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
if (alpha == 0) {
KernelMulElementWiseTensorDynamic<0> << <blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC,
blockNum);
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
else {
KernelMulElementWiseTensorDynamic<1> << <blocks, threads >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC,
blockNum);
stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
}
}
}
......
......@@ -31,7 +31,7 @@ element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (do it on site)
......@@ -39,15 +39,15 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (return a XTensor structure)
element-wise product of two tensors (return an XTensor structure)
make a new tensor to keep the result and return it
c(i) = a(i)*b(i)
where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied by b via broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c (c must not be NULL; the checks below reject empty input)
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in multiplication!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in multiplication!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaMultiplyDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if(a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++)
if(alpha == 0.0F)
cp[k] = ap[k] * bv;
else
cp[k] = ap[k] * bv + alpha * cp[k];
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
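To make the stride arithmetic above concrete, here is a small standalone check with hypothetical shapes: for a of shape 2 x 3 x 4 and n = 1, element (i0, i1, k) of a is scaled by b(i1).

void MultiplyDimStrideExample()
{
    int dimSize[3] = {2, 3, 4};
    int order = 3;
    int n = 1;

    int stride = 1;              /* product of the dimensions after n  */
    int blockSize = dimSize[n];
    int blockNum = 1;            /* product of the dimensions before n */
    for (int i = order - 1; i >= 0; i--) {
        if (i > n)
            stride *= dimSize[i];
        else if (i < n)
            blockNum *= dimSize[i];
    }
    /* stride == 4, blockSize == 3, blockNum == 2: the i loop in _MultiplyDim
       walks a in runs of 4 contiguous elements, and bv = b(j % 3) selects
       the matching entry of b for each run */
}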
/*
tensor multiplication (do it on site)
keep the result in the input tensor a and return nothing
a = a * b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied by b via broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_MultiplyDim(a, b, a, n, alpha);
}
/*
tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied by b via broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
*/
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
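Finally, a minimal usage sketch of MultiplyDim itself (hypothetical example; alpha is passed explicitly here in case the header does not default it):

void MultiplyDimExample()
{
    XTensor a, b;
    InitTensor2D(&a, 2, 3);
    InitTensor1D(&b, 3);
    SetDataFixed(a, 2.0F);
    SetDataFixed(b, 3.0F);

    /* c(i, j) = a(i, j) * b(j) = 6, with no accumulation (alpha = 0) */
    XTensor c = MultiplyDim(a, b, 1, 0.0F);
}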