Commit 27635638 by xiaotong

new code for ScaleAndShift

parent 43331674
......@@ -28,8 +28,7 @@
#include <time.h>
#include "XTensor.h"
#include "XDevice.h"
#include "./sample/fnnlm/FNNLM.h"
#include "sample/fnnlm/FNNLM.h"
#include "test/Test.h"
//#define CRTDBG_MAP_ALLOC
......@@ -39,26 +38,15 @@
using namespace nts;
using namespace samplefnnlm;
void SmallTest();
int main( int argc, const char ** argv )
{
//_CrtSetBreakAlloc(78);
/* a tiny test */
if(0){
XTensor a;
XTensor b;
InitTensor2D(&a, 2, 2);
a.SetZeroAll();
a.Set2D(1.0F, 0, 0);
a.Set2D(1.0F, 1, 1);
b = Sum(a, Sum(a, a));
XTensor c = b;
a.Dump(stderr, "a: ");
b.Dump(stderr, "b: ");
}
if(1)
SmallTest();
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
......@@ -75,3 +63,25 @@ int main( int argc, const char ** argv )
return 0;
}
/* a tiny self-contained smoke test of the tensor API: builds a 2x2 tensor,
   exercises Sum/Multiply, the overloaded '+' and Linear (ScaleAndShift),
   and dumps every intermediate result to stderr for visual inspection */
void SmallTest()
{
XTensor a;
XTensor b;
/* a is 2x2, zeroed, then given a diagonal: a = [[1,0],[0,2]] */
InitTensor2D(&a, 2, 2);
a.SetZeroAll();
a.Set2D(1.0F, 0, 0);
a.Set2D(2.0F, 1, 1);
/* b = a + a*a (element-wise multiply, then sum) */
b = Sum(a, Multiply(a, a));
/* copy via assignment, then combine with operator+ and Linear(c, 0.5) = c*0.5 */
XTensor c = b;
XTensor d = b + a + Linear(c, 0.5F);
a.Dump(stderr, "a: ");
b.Dump(stderr, "b: ");
c.Dump(stderr, "c: ");
d.Dump(stderr, "d: ");
}
......@@ -74,7 +74,7 @@ namespace nts {
{ \
if(!(x)) \
{ \
fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
exit(1); \
} \
} \
......@@ -83,7 +83,7 @@ namespace nts {
{ \
if(!(x)) \
{ \
fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
exit(1); \
} \
} \
......@@ -91,7 +91,7 @@ namespace nts {
#define ShowNTErrors(msg) \
{ \
{ \
fprintf(stderr, "Error! (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
exit(1); \
} \
} \
......
......@@ -19,15 +19,10 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-05
*/
#ifndef __XNAME_H__
#define __XNAME_H__
#include "XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_ARITHMETIC 0x00001000
#define MATH_SUM MATH_ARITHMETIC + 1
#define MATH_MULTIPLY MATH_SUM + 1
/* get operator name */
const char * GetOPName(int type)
{
......@@ -36,6 +31,8 @@ const char * GetOPName(int type)
return "M_SUM";
else if(type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if(type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT";
}
return "NULL";
......@@ -43,4 +40,3 @@ const char * GetOPName(int type)
} // namespace nts(NiuTrans.Tensor)
#endif // __XNAME_H__
......@@ -31,6 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_ARITHMETIC 10000
#define MATH_SUM MATH_ARITHMETIC + 1
#define MATH_MULTIPLY MATH_SUM + 1
#define MATH_SCALEANDSHIFT MATH_MULTIPLY + 1
/* get operator name */
const char * GetOPName(int type);
......
......@@ -40,6 +40,7 @@
#include "XBLAS.h"
#include "core/shape/MergeBlockLists.h"
#include "core/movement/CopyValues.h"
#include "core/arithmetic/Sum.h"
#ifdef USE_CUDA
......@@ -308,6 +309,12 @@ XTensor& XTensor::operator= (const XTensor& tensor)
return *this;
}
/* overloading of the plus-sign: returns the tensor sum of *this and
   the argument by delegating to Sum() (see core/arithmetic/Sum.h);
   neither operand is modified */
XTensor XTensor::operator+ (const XTensor& tensor)
{
return Sum(*this, tensor);
}
/*
judge whether the two matrices are in the same type and size
>> a - input tensor
......
......@@ -184,6 +184,9 @@ public:
/* overloading of the equal-sign */
XTensor& operator= (const XTensor &tensor);
/* overloading of the plus-sign */
XTensor operator+ (const XTensor &tensor);
/* judge whether the two matrices are in the same type and size */
static
bool IsIdentical(XTensor * a, XTensor * b);
......
......@@ -19,24 +19,28 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "ScaleAndShift.h"
#include "ScaleAndShift.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
scale and shift all tensor entires
p = p * scale + shift
>> a - the tensor
scale and shift all tensor entries: b = a * scale + shift
b = a * scale + shift
>> a - the input tensor
>> b - the output tensor
>> scale - the scale factor
>> shift - the shift factor
*/
void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
{
#ifdef USE_CUDA
/* run it on GPUs */
if(a->devID >= 0){
CudaScaleAndShift(a, scale, shift);
_CudaScaleAndShift(a, b, scale, shift);
return;
}
#endif
......@@ -46,7 +50,7 @@ void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
/* sparse tensor */
if(a->isSparse){
int num = a->GetNonzeroSize();
int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
for(int i = 0; i < num; i++){
......@@ -65,4 +69,40 @@ void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
}
}
/*
scale and shift all tensor entries in place
a = a * scale + shift
(wrapper around _ScaleAndShift with the input tensor also used as output)
>> a - the input/output tensor
>> scale - the scale factor
>> shift - the shift factor
*/
void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
{
_ScaleAndShift(a, a, scale, shift);
}
/*
scale and shift all tensor entries (returns a new result tensor)
b = a * scale + shift
>> a - the input tensor
>> scale - the scale factor
>> shift - the shift factor
<< return - the result tensor b (marked as temporary)
*/
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
{
XTensor b(&a);
b.SetTMP();

/* computation */
_ScaleAndShift(&a, &b, scale, shift);

/* tensor connections: record the operation as MATH_SCALEANDSHIFT
   (this commit introduces that op id and its GetOPName() entry);
   the original code wrongly linked with MATH_SUM, which would make
   the network graph report a sum node here */
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale);
XLink::AddParamToHead(&b, shift);

return b;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -19,7 +19,6 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "ScaleAndShift.h"
#include "ScaleAndShift.cuh"
#include "../../XDevice.h"
......@@ -28,68 +27,71 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
scale and shift all tensor entires p = p * scale + shift (CUDA Kernel)
>> d - the data array
scale and shift all tensor entires b = a * scale + shift (CUDA Kernel)
>> a - the input data array
>> b - the output data array
>> size - the size of d
>> scale - how much we want to scale it
>> shift - how much we want to shift it
*/
template<bool isUnitScale, bool isZeroShift>
__global__
void KernelScaleAndShift(DTYPE * d, int size, DTYPE scale, DTYPE shift)
void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
if (isUnitScale && !isZeroShift){
d[i] = d[i] + shift;
b[i] = a[i] + shift;
}
else if (isUnitScale && isZeroShift) {
d[i] = d[i];
b[i] = a[i];
}
else if (!isUnitScale && isZeroShift) {
d[i] = d[i] * scale;
b[i] = a[i] * scale;
}
else {
d[i] = d[i] * scale + shift;
b[i] = a[i] * scale + shift;
}
}
}
/*
scale and shift all matrix entires p = p * scale + shift (CUDA Kernel)
scale and shift all tensor entires p = p * scale + shift (CUDA Kernel)
This is for float16 computation
>> d - the data array
>> a - the input data array
>> b - the output data array
>> size - the size of d
>> scale - how much we want to scale it
>> shift - how much we want to shift it
*/
__global__
void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)
void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if(i < size)
d[i] = __hadd(__hmul(d[i], scale), shift);
b[i] = __hadd(__hmul(a[i], scale), shift);
#else
if (i < size)
d[i] = __float2half(__half2float(d[i]) * __half2float(scale) + __half2float(shift));
b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
#endif
}
/*
scale and shift all matrix entires
scale and shift all tensor entries
p = p * scale + shift
>> a - the tensor
>> a - the input tensor
>> b - the output tensor
>> scale - the scaler factor
>> shift - the shift factor
*/
void CudaScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
{
/* sparse tensor */
if(a->isSparse){
// TODO
ShowNTErrors("TODO!");
}
/* dense tensor */
else{
......@@ -106,20 +108,20 @@ void CudaScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
if(a->dataType == DEFAULT_DTYPE){
if(scale == 1.0F && shift == 0)
KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, a->unitNum, scale, shift);
KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, a->unitNum, scale, shift);
KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if(scale != 1.0F && shift == 0)
KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, a->unitNum, scale, shift);
KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else
KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, a->unitNum, scale, shift);
KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
}
else if(a->dataType == X_FLOAT16){
unsigned short scale2 = FloatToFloat16(scale);
unsigned short shift2 = FloatToFloat16(shift);
__half * scaleft16p = (__half*)&scale2;
__half * shiftft16p = (__half*)&shift2;
KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, a->unitNum, *scaleft16p, *shiftft16p);
KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, *scaleft16p, *shiftft16p);
}
else{
ShowNTErrors("TODO!");
......
......@@ -22,23 +22,22 @@
#ifndef __SCALEANDSHIFT_CUH__
#define __SCALEANDSHIFT_CUH__
#include "../../XTensor.h"
#include "ScaleAndShift.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* scale and shift all tensor entires p = p * scale + shift (CUDA Kernel) */
/* scale and shift all tensor entires b = a * scale + shift (CUDA Kernel) */
__global__
void KernelScaleAndShift(DTYPE * d, int size, DTYPE scale, DTYPE shift);
void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift);
/* scale and shift all tensor entires p = p * scale + shift (CUDA Kernel) with float16 data type */
/* scale and shift all tensor entires b = a * scale + shift (CUDA Kernel) with float16 data type */
__global__
void KernelScaleAndShift(__half * d, int size, __half scale, __half shift);
void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift);
/* scale and shift all tensor entires (cuda version) */
extern "C"
void CudaScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift);
/* scale and shift all tensor entries b = a * scale + shift (cuda version) */
void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift);
#endif // USE_CUDA
......
......@@ -26,9 +26,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* scale and shift all tensor entires */
extern "C"
void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift);
#define _Linear _ScaleAndShift
#define _LinearMe _ScaleAndShiftMe
#define Linear ScaleAndShift
/* scale and shift all tensor entries b = a * scale + shift */
void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift = 0);
/* scale and shift all tensor entries in place: a = a * scale + shift */
void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);
/* scale and shift all tensor entries b = a * scale + shift, and return the result tensor b */
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -40,7 +40,7 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
int num = input->dimSizeRDI[dimRDI];
ReduceSum(input, output, dim);
ScaleAndShift(output, (DTYPE)1/num, 0);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -39,7 +39,7 @@ void ReduceVariance(XTensor * input, XTensor * output, int dim, XTensor * mean)
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
ReduceSum(input, output, dim, mean, 2.0F);
ScaleAndShift(output, (DTYPE)1 / num, 0);
_ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
}
} // namespace nts(NiuTrans.Tensor)
......
......@@ -288,7 +288,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
beta->data = mem->AllocBuf(mem->devID, beta->unitNum * beta->unitSize);
/* \beta = \sum_i (dE/dy_i * y_i) */
Multiply(dedy, y, ytmp, 0);
_Multiply(dedy, y, ytmp, 0, 0);
ReduceSum(ytmp, beta, leadDim);
/* ytmp = dE/dy_j - \beta */
......@@ -296,7 +296,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
_Sum(dedy, ytmp, ytmp, -1.0F);
/* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */
Multiply(y, ytmp, dedx, 0);
_Multiply(y, ytmp, dedx, 0, 0);
mem->ReleaseBuf(mem->devID, y->unitNum * y->unitSize);
mem->ReleaseBuf(mem->devID, beta->unitNum * beta->unitSize);
......
......@@ -485,7 +485,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
InitTensor(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */
Multiply(&output, &gold, &probs, 0);
_Multiply(&output, &gold, &probs);
/* probability of each word */
XTensor wprobs;
......
......@@ -54,8 +54,8 @@ bool TestLoss1()
/* initialize variables */
output->SetZeroAll();
gold->SetZeroAll();
ScaleAndShift(output, 1, 1);
ScaleAndShift(gold, 1, 2);
_ScaleAndShiftMe(output, 1, 1);
_ScaleAndShiftMe(gold, 1, 2);
DTYPE error;
error = LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
......@@ -74,8 +74,8 @@ bool TestLoss1()
/* Initialize variables */
outputGPU->SetZeroAll();
goldGPU->SetZeroAll();
ScaleAndShift(outputGPU, 1, 1);
ScaleAndShift(goldGPU, 1, 2);
_ScaleAndShiftMe(outputGPU, 1, 1);
_ScaleAndShiftMe(goldGPU, 1, 2);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
......@@ -131,8 +131,8 @@ bool TestLoss2()
/* initialize variables */
output->SetZeroAll();
gold->SetZeroAll();
ScaleAndShift(output, 1, 1);
ScaleAndShift(gold, 1, 2);
_ScaleAndShiftMe(output, 1, 1);
_ScaleAndShiftMe(gold, 1, 2);
DTYPE error;
error = LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
......@@ -151,8 +151,8 @@ bool TestLoss2()
/* Initialize variables */
outputGPU->SetZeroAll();
goldGPU->SetZeroAll();
ScaleAndShift(outputGPU, 1, 1);
ScaleAndShift(goldGPU, 1, 2);
_ScaleAndShiftMe(outputGPU, 1, 1);
_ScaleAndShiftMe(goldGPU, 1, 2);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
......
......@@ -81,7 +81,7 @@ bool TestMultiply1()
t->SetZeroAll();
/* call MultiplyElementWise function */
Multiply(s1, s2, t, 0);
_Multiply(s1, s2, t, 0, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -101,7 +101,7 @@ bool TestMultiply1()
tGPU->SetZeroAll();
/* call MultiplyElementWise function */
Multiply(sGPU1, sGPU2, tGPU, 0);
_Multiply(sGPU1, sGPU2, tGPU, 0, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -189,7 +189,7 @@ bool TestMultiply2()
t->SetZeroAll();
/* call MultiplyElementWise function */
Multiply(s1, s2, t, 0);
_Multiply(s1, s2, t, 0, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -209,7 +209,7 @@ bool TestMultiply2()
tGPU->SetZeroAll();
/* call MultiplyElementWise function */
Multiply(sGPU1, sGPU2, tGPU, 0);
_Multiply(sGPU1, sGPU2, tGPU, 0, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -296,7 +296,7 @@ bool TestMultiply3()
t->SetZeroAll();
/* call MultiplyElementWise function */
Multiply(s1, s2, t, 1);
_Multiply(s1, s2, t, 0, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -316,7 +316,7 @@ bool TestMultiply3()
tGPU->SetZeroAll();
/* call MultiplyElementWise function */
Multiply(sGPU1, sGPU2, tGPU, 1);
_Multiply(sGPU1, sGPU2, tGPU, 0, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
......@@ -57,7 +57,7 @@ bool TestScaleAndShift1()
s->SetData(sData, sUnitNum);
/* call ScaleAndShift function */
ScaleAndShift(s, scaleFactor, shiftFactor);
_ScaleAndShift(s, s, scaleFactor, shiftFactor);
/* check results */
cpuTest = s->CheckData(answer, sUnitNum);
......@@ -73,7 +73,7 @@ bool TestScaleAndShift1()
sGPU->SetData(sData, sUnitNum);
/* call ScaleAndShift function */
ScaleAndShift(sGPU, scaleFactor, shiftFactor);
_ScaleAndShift(sGPU, sGPU, scaleFactor, shiftFactor);
/* check results */
gpuTest = sGPU->CheckData(answer, sUnitNum);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论