Commit a73f8e42 by ltb

merge into xiao: clip/scaleandshift (float16/int/int8), logsoftmax/hardtanh (float16), modify XGlobal __int8
parent 3501c0fa
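Most of the changes below follow one pattern: the separate DTYPE and __half kernels are folded into a single kernel templated on the element type, and the host wrapper dispatches on the tensor's dataType (DEFAULT_DTYPE, X_FLOAT16, X_INT, X_INT8) to pick an instantiation and convert the scalar arguments. A minimal standalone sketch of that pattern follows; ClipKernel and ClipOnDevice are illustrative names, not the NiuTrans.Tensor API.

#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

/* same clipping rule as KernelClip in the diff, written as a free-standing kernel */
template <class T>
__global__ void ClipKernel(const T * a, T * b, T lower, T upper, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        b[i] = a[i] > upper ? upper : (a[i] < lower ? lower : a[i]);
}

/* host-side dispatch: one templated kernel serves float, int and int8_t,
   mirroring the DEFAULT_DTYPE / X_INT / X_INT8 branches added to _CudaClip */
template <class T>
void ClipOnDevice(const T * hostIn, T * hostOut, int size, T lower, T upper)
{
    T * dIn = nullptr, * dOut = nullptr;
    cudaMalloc(&dIn, size * sizeof(T));
    cudaMalloc(&dOut, size * sizeof(T));
    cudaMemcpy(dIn, hostIn, size * sizeof(T), cudaMemcpyHostToDevice);

    int threads = 256;
    int blocks = (size + threads - 1) / threads;
    ClipKernel<T><<<blocks, threads>>>(dIn, dOut, lower, upper, size);

    cudaMemcpy(hostOut, dOut, size * sizeof(T), cudaMemcpyDeviceToHost);
    cudaFree(dIn);
    cudaFree(dOut);
}

int main()
{
    float  fin[4] = { -3.5F, -0.5F, 0.5F, 3.5F }, fout[4];
    int8_t iin[4] = { -100, -1, 1, 100 },         iout[4];

    ClipOnDevice<float>(fin, fout, 4, -1.0F, 1.0F);
    ClipOnDevice<int8_t>(iin, iout, 4, (int8_t)-1, (int8_t)1);

    for (int i = 0; i < 4; i++)
        printf("%g -> %g   %d -> %d\n", fin[i], fout[i], iin[i], iout[i]);
    return 0;
}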
@@ -32,6 +32,8 @@
 #ifndef WIN32
 #include <sys/time.h>
 #include <unistd.h>
+#include <stdint.h>
+typedef int8_t __int8;
 #endif
 
 // the CUDA stuff
@@ -43,6 +45,10 @@
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts {
 
+#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
+#define USE_CPP11
+#endif
+
 #define _XINLINE_
 
 //#define DOUBELPRICSION
...
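For context, a hedged note on the new typedef: __int8 is a Microsoft-specific sized-integer keyword, so it needs no declaration under MSVC; mapping it to int8_t on other compilers keeps the X_INT8 code below portable. The static_assert line is illustrative, not part of the commit.

#ifndef WIN32               /* MSVC already provides __int8 as a keyword */
#include <stdint.h>
typedef int8_t __int8;      /* give other compilers the same 8-bit signed type */
#endif

static_assert(sizeof(__int8) == 1, "X_INT8 tensors assume 1-byte elements");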
@@ -17,6 +17,7 @@
 /*
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
+* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int/int8 added
 */
 
 #include "../../XDevice.h"
@@ -35,34 +36,20 @@ set each entry to its clip value (CUDA Kernel)
 >> upper - the upper border
 >> size - size of the data array
 */
-__global__
-void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
+template <class T>
+__global__
+void KernelClip(T * a, T * b, T lower, T upper, int size)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
     if (i < size) {
         if (a[i] > upper)
             b[i] = upper;
         else if (a[i] < lower)
             b[i] = lower;
         else
             b[i] = a[i];
     }
 }
 
-/*
-set each entry to its clip value with float16 data type value (CUDA Kernel)
-This is for float16 computation
->> a - pointer to input data array
->> b - pointer to output data array
->> lower - the lower border
->> upper - the upper border
->> size - size of the data array
-*/
-__global__
-void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
-{
-    return;
-}
 
 /*
@@ -88,12 +75,27 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
     int devIDBackup;
     ProtectCudaDev(a->devID, devIDBackup);
 
     if (a->dataType == DEFAULT_DTYPE) {
         KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
     }
     else if (a->dataType == X_FLOAT16) {
-        KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
+        half lower1 = __float2half(lower);
+        half upper1 = __float2half(upper);
+
+        KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower1, upper1, a->unitNum);
+    }
+    else if (a->dataType == X_INT) {
+        int lower1 = (int)lower;
+        int upper1 = (int)upper;
+
+        KernelClip << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
+    }
+    else if (a->dataType == X_INT8) {
+        __int8 lower1 = (__int8)lower;
+        __int8 upper1 = (__int8)upper;
+
+        KernelClip << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, lower1, upper1, a->unitNum);
     }
     else {
         ShowNTErrors("TODO!");
     }
...
@@ -29,12 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 
 /* set each entry to its clip value (CUDA Kernel) */
+template <class T>
 __global__
-void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
-
-/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
-__global__
-void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size);
+void KernelClip(T * a, T * b, T lower, T upper, int size);
 
 /* set each entry to its clip value */
 void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
...
@@ -17,6 +17,7 @@
 /*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int added
 */
 
 #include "ScaleAndShift.cuh"
@@ -34,9 +35,9 @@ scale and shift all tensor entires b = a * scale + shift (CUDA Kernel)
 >> scale - how much we want to scale it
 >> shift - how much we want to shift it
 */
-template<bool isUnitScale, bool isZeroShift>
+template<class T, bool isUnitScale, bool isZeroShift>
 __global__
-void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
+void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -56,28 +57,6 @@ void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
     }
 }
 
-/*
-scale and shift all tensor entires p = p * scale + shift (CUDA Kernel)
-This is for float16 computation
->> a - the input data array
->> b - the output data array
->> size - the size of d
->> scale - how much we want to scale it
->> shift - how much we want to shift it
-*/
-__global__
-void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-    if(i < size)
-        b[i] = __hadd(__hmul(a[i], scale), shift);
-#else
-    if (i < size)
-        b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
-#endif
-}
 /*
 scale and shift all tensor entires
@@ -108,20 +87,52 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
     if(a->dataType == DEFAULT_DTYPE){
         if(scale == 1.0F && shift == 0)
-            KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else if(scale != 1.0F && shift == 0)
-            KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else
-            KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
     }
     else if(a->dataType == X_FLOAT16){
-        unsigned short scale2 = FloatToFloat16(scale);
-        unsigned short shift2 = FloatToFloat16(shift);
-        __half * scaleft16p = (__half*)&scale2;
-        __half * shiftft16p = (__half*)&shift2;
-        KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, *scaleft16p, *shiftft16p);
+        half scale1 = __float2half(scale);
+        half shift1 = __float2half(shift);
+
+        if (scale == 1.0F && shift == 0)
+            KernelScaleAndShift<__half, true, true><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
+        else if (scale == 1.0F && shift != 0)
+            KernelScaleAndShift<__half, true, false><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
+        else if (scale != 1.0F && shift == 0)
+            KernelScaleAndShift<__half, false, true><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
+        else
+            KernelScaleAndShift<__half, false, false> << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
+    }
+    else if (a->dataType == X_INT){
+        int scale2 = int(scale);
+        int shift2 = int(shift);
+
+        if (scale == 1.0F && shift == 0)
+            KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+        else if (scale == 1.0F && shift != 0)
+            KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+        else if (scale != 1.0F && shift == 0)
+            KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+        else
+            KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+    }
+    else if (a->dataType == X_INT8){
+        __int8 scale2 = __int8(scale);
+        __int8 shift2 = __int8(shift);
+
+        if (scale == 1.0F && shift == 0)
+            KernelScaleAndShift<__int8, true, true> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
+        else if (scale == 1.0F && shift != 0)
+            KernelScaleAndShift<__int8, true, false> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
+        else if (scale != 1.0F && shift == 0)
+            KernelScaleAndShift<__int8, false, true> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
+        else
+            KernelScaleAndShift<__int8, false, false> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
     }
     else{
         ShowNTErrors("TODO!");
...
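A note on the four host-side branches per data type above: isUnitScale and isZeroShift are compile-time template arguments, so each instantiation compiles to a kernel containing only the arithmetic it needs (no multiply when scale is 1, no add when shift is 0). A self-contained sketch of the idea, using hypothetical names rather than the library's:

#include <cstdio>
#include <cuda_runtime.h>

/* the bools are resolved at compile time, so dead branches are removed per instantiation */
template <class T, bool isUnitScale, bool isZeroShift>
__global__ void ScaleShiftKernel(const T * a, T * b, int size, T scale, T shift)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= size)
        return;

    if (isUnitScale && isZeroShift)         /* b = a                 */
        b[i] = a[i];
    else if (isUnitScale && !isZeroShift)   /* b = a + shift         */
        b[i] = a[i] + shift;
    else if (!isUnitScale && isZeroShift)   /* b = a * scale         */
        b[i] = a[i] * scale;
    else                                    /* b = a * scale + shift */
        b[i] = a[i] * scale + shift;
}

int main()
{
    const int size = 8;
    float hIn[size], hOut[size];
    for (int i = 0; i < size; i++)
        hIn[i] = (float)i;

    float *dIn, *dOut;
    cudaMalloc(&dIn, size * sizeof(float));
    cudaMalloc(&dOut, size * sizeof(float));
    cudaMemcpy(dIn, hIn, size * sizeof(float), cudaMemcpyHostToDevice);

    /* the host picks the instantiation, just like the four branches above */
    ScaleShiftKernel<float, false, false><<<1, size>>>(dIn, dOut, size, 2.0F, 0.5F);

    cudaMemcpy(hOut, dOut, size * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++)
        printf("%g -> %g\n", hIn[i], hOut[i]);

    cudaFree(dIn);
    cudaFree(dOut);
    return 0;
}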
@@ -17,6 +17,7 @@
 /*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
+* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-12 float16 added
 */
 
 #include "HardTanH.h"
@@ -38,17 +39,18 @@ y = 1 if x > 1
 >> y - output data array
 >> size - size of input/output
 */
+template <class T>
 __global__
-void KernelHardtanhCompute(DTYPE * x, DTYPE * y, int size)
+void KernelHardtanhCompute(T * x, T * y, int size)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
-    if (i < size){
-        DTYPE p = x[i];
-        if(p > (DTYPE)1.0)
-            p = (DTYPE)1.0;
-        else if(p < (DTYPE)-1.0)
-            p = (DTYPE)-1.0;
+    if (i < size) {
+        T p = x[i];
+        if (p > (T)1.0)
+            p = (T)1.0;
+        else if (p < (T)-1.0)
+            p = (T)-1.0;
         y[i] = p;
     }
 }
@@ -63,25 +65,31 @@ y = 1 if x > 1
 */
 void _CudaHardTanH(const XTensor * x, XTensor * y)
 {
-    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
-
-        CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
-        CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
+    CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
+    CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
+    CheckNTErrors((x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) ||
+                  (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16),
+                  "The hard tanh activation function does not support this datatype.");
 
     int gridSize[3], blockSize[3];
     GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
 
     int devIDBackup;
     ProtectCudaDev(x->devID, devIDBackup);
 
+    if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
         KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
-
-        BacktoCudaDev(x->devID, devIDBackup);
     }
-    else{
+    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
+        KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((__half *)x->data, (__half *)y->data, x->unitNum);
+    }
+    else {
+        //TODO!
         ShowNTErrors("TODO!");
     }
+
+    BacktoCudaDev(x->devID, devIDBackup);
 }
 
 /*
@@ -97,14 +105,15 @@ dy/dx = 1 if -1 <= x <= 1
 >> x - x of the function
 >> size - size of y/x
 */
+template <class T>
 __global__
-void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
+void KernelHardtanhBackward(T * dedy, T * dedx, T * gold, T * y, T * x, int size)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
     if (i < size){
-        DTYPE s = x[i];
-        if(s > (DTYPE)1.0 || s < (DTYPE)-1.0)
+        T s = x[i];
+        if(s > (T)1.0 || s < (T)-1.0)
             dedx[i] = 0;
         else
             dedx[i] = dedy[i];
@@ -134,21 +143,24 @@ void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
                            XTensor * dedy, XTensor * dedx,
                            LOSS_FUNCTION_NAME lossName)
 {
-    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
+    CheckNTErrors(((x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) ||
+                   (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16)),
+                  "Input vectors are not in default type.");
 
     /* calculate dE/dy */
-    if(lossName == CROSSENTROPY)
+    if (lossName == CROSSENTROPY)
         _CudaCrossEntropyBackward(dedy, y, gold);
-    else if(lossName != NOLOSS)
+    else if (lossName != NOLOSS)
         _CudaLossBackward(dedy, gold, y, lossName);
 
     int gridSize[3], blockSize[3];
     GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
 
     int devIDBackup;
     ProtectCudaDev(x->devID, devIDBackup);
 
+    if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
         /* dE/dx = dE/dy * dy/dx */
         KernelHardtanhBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
                                ((DTYPE*)dedy->data,
@@ -156,11 +168,18 @@ void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
                                 gold == NULL ? NULL : (DTYPE*)gold->data,
                                 (DTYPE*)y->data, (DTYPE*)x->data,
                                 x->unitNum);
-
-    BacktoCudaDev(x->devID, devIDBackup);
     }
-    else
-        ShowNTErrors("TODO!");
+    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
+        /* dE/dx = dE/dy * dy/dx */
+        KernelHardtanhBackward<<<dim3(gridSize[0]), dim3(blockSize[0])>>>
+                               ((half*)dedy->data,
+                                (half*)dedx->data,
+                                gold == NULL ? NULL : (half*)gold->data,
+                                (half*)y->data, (half*)x->data,
+                                x->unitNum);
+    }
+
+    BacktoCudaDev(x->devID, devIDBackup);
 }
 
 #endif
...
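For reference, the function and gradient that KernelHardtanhCompute and KernelHardtanhBackward implement, restated from the comments in this file:

y = \mathrm{hardtanh}(x) =
\begin{cases}
 1  & x > 1 \\
 x  & -1 \le x \le 1 \\
 -1 & x < -1
\end{cases}
\qquad
\frac{dy}{dx} =
\begin{cases}
 1 & -1 \le x \le 1 \\
 0 & \text{otherwise}
\end{cases}
\qquad
\frac{\partial E}{\partial x} = \frac{\partial E}{\partial y}\cdot\frac{dy}{dx}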
@@ -50,121 +50,136 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
     }
 
     int leadDimRDI = x->order - leadDim - 1;
-    if (!x->isSparse && !y->isSparse &&
-        x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
-    {
-        int * dimSize = new int[x->order - 1];
-        for (int i = 0; i < x->order; i++) {
-            if (i < leadDim)
-                dimSize[i] = -x->dimSize[i];
-            else if (i > leadDim)
-                dimSize[i - 1] = -x->dimSize[i];
-        }
+
+    int * dimSize = new int[x->order - 1];
+    for (int i = 0; i < x->order; i++) {
+        if (i < leadDim)
+            dimSize[i] = -x->dimSize[i];
+        else if (i > leadDim)
+            dimSize[i - 1] = -x->dimSize[i];
+    }
 
     XMem * mem = x->mem;
     XTensor * max = NULL;
     XTensor * sum = NULL;
     XTensor * blockx = NULL;
     XTensor * blocky = NULL;
     XTensor * blockMax = NULL;
     XTensor * blockSum = NULL;
 
     int dimensionSize = y->dimSizeRDI[leadDimRDI];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;
 
     for (int i = 0; i < leadDimRDI; i++)
         stride *= y->dimSizeRDI[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
 
     max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
     sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
 
     _ReduceMax(x, max, leadDim);
     _ReduceSum(x, sum, leadDim, max, 1.0F, true);
 
     if (x->devID >= 0) {
         if(leadDimRDI == 0){
             blockSize = y->unitNum;
             blockNum = 1;
             blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
             blocky = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
             blockMax = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
             blockSum = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
         }
         else{
             blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
             blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
             blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
             blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
         }
     }
 
     for (int k = 0; k < blockNum; k++) {
         int m = stride;
         int n = dimensionSize;
 
-        DTYPE * ip = (DTYPE*)x->data + k * blockSize;
-        DTYPE * op = (DTYPE*)y->data + k * blockSize;
-        DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
-        DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
-
         if (x->devID < 0) {
+            DTYPE * ip = (DTYPE*)x->data + k * blockSize;
+            DTYPE * op = (DTYPE*)y->data + k * blockSize;
+            DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
+            DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
+
             for (int j = 0; j < m; j++) {
                 DTYPE sumValue = sp[j];
                 if (sumValue == 0) {
                     for (int i = 0; i < n; i++)
                         op[i * m + j] = 0;
                 }
                 else {
                     for (int i = 0; i < n; i++) {
                         DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
                         if (IsNAN(r))
                             r = LOGPROB_MIN;
                         if (IsINF(r))
                             r = LOGPROB_MIN;
 
                         op[i * m + j] = MAX(r, LOGPROB_MIN);
                     }
                 }
             }
         }
         else {
-            blockx->data = ip;
-            blocky->data = op;
-            blockMax->data = mp;
-            blockSum->data = sp;
+            if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
+                DTYPE * ip = (DTYPE*)x->data + k * blockSize;
+                DTYPE * op = (DTYPE*)y->data + k * blockSize;
+                DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
+                DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
+                blockx->data = ip;
+                blocky->data = op;
+                blockMax->data = mp;
+                blockSum->data = sp;
+            }
+            else {
+                half * ip = (half*)x->data + k * blockSize;
+                half * op = (half*)y->data + k * blockSize;
+                half * mp = (half*)max->data + k * blockSize / dimensionSize;
+                half * sp = (half*)sum->data + k * blockSize / dimensionSize;
+                blockx->data = ip;
+                blocky->data = op;
+                blockMax->data = mp;
+                blockSum->data = sp;
+            }
 #ifdef USE_CUDA
-            if(leadDimRDI == 0)
+            if (leadDimRDI == 0)
                 _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
             else
                 _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
 #else
             ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif
             blockx->data = NULL;
             blocky->data = NULL;
             blockMax->data = NULL;
             blockSum->data = NULL;
         }
     }
 
     DelTensorBuf(max);
     DelTensorBuf(sum);
 
     if (x->devID >= 0) {
         delete blockx;
         delete blocky;
         delete blockMax;
         delete blockSum;
     }
 
     delete[] dimSize;
-    }
-    else
-        ShowNTErrors("TODO!");
 }
 
 /*
...
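For reference, both the CPU loop above and the CUDA kernels below compute the max-shifted form of log-softmax; subtracting the per-slice maximum (from _ReduceMax) before exponentiating keeps every exponent at most 1 and avoids overflow, and the shifted, exponentiated sum appears to be what the _ReduceSum call with the max argument supplies:

y_{i,j} \;=\; \log\frac{e^{\,x_{i,j}-\max_j}}{\sum_{k} e^{\,x_{k,j}-\max_j}}
\;=\; x_{i,j} \;-\; \max_j \;-\; \log\!\sum_{k} e^{\,x_{k,j}-\max_j}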
@@ -17,6 +17,7 @@
 /*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-26
+* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-01 float16 added
 */
 
 #include "LogSoftmax.h"
@@ -26,6 +27,7 @@
 #include "../core/reduce/ReduceSum.cuh"
 #include "../core/reduce/ReduceMax.cuh"
 #include "../XDevice.h"
+#include <device_launch_parameters.h>
 
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -57,11 +59,12 @@ y_{i,j} = log(e^x_{i,j} / \sum_{i} e^{x_{i,j})
 >> rowNum - row number of the matrix
 >> colNum - column number of the matrix
 */
+template <class T, TENSOR_DATA_TYPE dataType>
 __global__
-void KernelLogSoftmaxComputeByRow(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y, int rowNum, int colNum)
+void KernelLogSoftmaxComputeByRow(T * x, T * max, T * sum, T * y, int rowNum, int colNum)
 {
-    __shared__ DTYPE inputSum[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    __shared__ DTYPE inputMax[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ T inputSum[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ T inputMax[MAX_CUDA_THREAD_NUM_PER_BLOCK];
 
     int i = blockDim.y * blockIdx.y + threadIdx.y;
     int j = blockDim.x * blockIdx.x + threadIdx.x;
@@ -78,14 +81,21 @@ void KernelLogSoftmaxComputeByRow(T * x, T * max, T * sum, T * y, int rowNum, int colNum)
     /* y_{i,j} = log(e^(s_{i,j} - max_{j}) / \sum_{k} e^{s_{k,j} - max_{j}}) */
     if (i < rowNum && j < colNum) {
         int key = i * colNum + j;
-        DTYPE r = log(exp(x[key] - inputMax[threadIdx.x]) / inputSum[threadIdx.x]);
 
-        if (isnan(r))
-            r = LOGPROB_MIN;
-        if (isinf(r))
-            r = LOGPROB_MIN;
-
-        y[key] = MAX(r, LOGPROB_MIN);
+        if (dataType == X_FLOAT) {
+            DTYPE r = log((DTYPE)exp((DTYPE)(x[key] - inputMax[threadIdx.x])) / (DTYPE)inputSum[threadIdx.x]);
+
+            if (isnan(r))
+                r = LOGPROB_MIN;
+            if (isinf(r))
+                r = LOGPROB_MIN;
+
+            y[key] = MAX(r, LOGPROB_MIN);
+        }
+        else if (dataType == X_FLOAT16) {
+            half r = hlog((half)hexp(x[key] - inputMax[threadIdx.y]) / (half)inputSum[threadIdx.y]);
+            y[key] = r;
+        }
     }
 }
@@ -104,11 +114,12 @@ y_{i,j} = log(e^x_{i,j} / \sum_{j} e^{x_{i,j})
 >> rowNum - row number of the matrix
 >> colNum - column number of the matrix
 */
+template <class T, TENSOR_DATA_TYPE dataType>
 __global__
-void KernelLogSoftmaxComputeByCol(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y, int rowNum, int colNum)
+void KernelLogSoftmaxComputeByCol(T * x, T * max, T * sum, T * y, int rowNum, int colNum)
 {
-    __shared__ DTYPE inputSum[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    __shared__ DTYPE inputMax[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ T inputSum[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ T inputMax[MAX_CUDA_THREAD_NUM_PER_BLOCK];
 
     int i = blockDim.y * blockIdx.y + threadIdx.y;
     int j = blockDim.x * blockIdx.x + threadIdx.x;
@@ -125,19 +136,20 @@ void KernelLogSoftmaxComputeByCol(T * x, T * max, T * sum, T * y, int rowNum, int colNum)
     /* y_{i,j} = log(e^(s_{i,j} - max_{i}) / \sum_{k} e^{s_{i,k} - max_{i}}) */
     if (i < rowNum && j < colNum) {
         int key = i * colNum + j;
-        DTYPE r = log(exp(x[key] - inputMax[threadIdx.y]) / inputSum[threadIdx.y]);
-
-        /*if (r < LOGPROB_MIN)
-        {
-            printf("min %e %e, %e %e, %e %e\n", r, x[key] - inputMax[threadIdx.y], x[key], inputMax[threadIdx.y], exp(x[key] - inputMax[threadIdx.y]), inputSum[threadIdx.y]);
-        }*/
-
-        if (isnan(r))
-            r = LOGPROB_MIN;
-        if (isinf(r))
-            r = LOGPROB_MIN;
-
-        y[key] = MAX(r, LOGPROB_MIN);
+        if (dataType == X_FLOAT) {
+            DTYPE r = log((DTYPE)exp((DTYPE)(x[key] - inputMax[threadIdx.y])) / (DTYPE)inputSum[threadIdx.y]);
+
+            if (isnan(r))
+                r = LOGPROB_MIN;
+            if (isinf(r))
+                r = LOGPROB_MIN;
+
+            y[key] = MAX(r, LOGPROB_MIN);
+        }
+        else if (dataType == X_FLOAT16) {
+            half r = hlog((half)hexp(x[key] - inputMax[threadIdx.y]) / (half)inputSum[threadIdx.y]);
+            y[key] = r;
+        }
     }
 }
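The X_FLOAT16 branches above rely on the half-precision intrinsics hexp and hlog, which CUDA provides only for devices of compute capability 5.3 and higher. A small standalone sketch (illustrative names, not the library's kernels) of how such a kernel can fall back to float math on older targets:

#include <cstdio>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

/* compute log(exp(x - max) / sum) per element; half intrinsics need sm_53+ */
__global__ void HalfLogSoftmaxTerm(const __half * x, __half * y, __half maxv, __half sumv, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= size)
        return;
#if __CUDA_ARCH__ >= 530
    y[i] = hlog(__hdiv(hexp(__hsub(x[i], maxv)), sumv));
#else
    /* float detour for older architectures */
    float r = logf(expf(__half2float(x[i]) - __half2float(maxv)) / __half2float(sumv));
    y[i] = __float2half(r);
#endif
}

int main()
{
    const int size = 4;
    __half hx[size], hy[size];
    float maxv = 3.0F, sum = 0.0F;
    for (int i = 0; i < size; i++) {
        hx[i] = __float2half((float)i);   /* host-side conversion assumes a recent CUDA toolkit */
        sum += expf((float)i - maxv);
    }

    __half *dx, *dy;
    cudaMalloc(&dx, size * sizeof(__half));
    cudaMalloc(&dy, size * sizeof(__half));
    cudaMemcpy(dx, hx, size * sizeof(__half), cudaMemcpyHostToDevice);

    HalfLogSoftmaxTerm<<<1, size>>>(dx, dy, __float2half(maxv), __float2half(sum), size);

    cudaMemcpy(hy, dy, size * sizeof(__half), cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++)
        printf("log-softmax term %d = %f\n", i, __half2float(hy[i]));

    cudaFree(dx);
    cudaFree(dy);
    return 0;
}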
@@ -173,16 +185,42 @@ void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max)
             GDevs.GetCudaThread2D(x->devID, n, m, MAX_INT, gridSize, blockSize);
 
             /* y_{i,j} = log(e^(s_{i,j} - max_{j}) / \sum_{k} e^{s_{k,j} - max_{j}}) */
-            KernelLogSoftmaxComputeByRow << <dim3(gridSize[1], gridSize[0]), dim3(blockSize[1], blockSize[0]) >> >
-                                          ((DTYPE*)x->data, maxData, sumData, (DTYPE*)y->data, n, m);
+            KernelLogSoftmaxComputeByRow<DTYPE, DEFAULT_DTYPE> <<<dim3(gridSize[1], gridSize[0]), dim3(blockSize[1], blockSize[0])>>>
+                                        ((DTYPE*)x->data, maxData, sumData, (DTYPE*)y->data, n, m);
+        }
+        else {
+            GDevs.GetCudaThread2D(x->devID, m, n, MAX_INT, gridSize, blockSize);
+
+            /* y_{i,j} = log(e^(s_{i,j} - max_{i}) / \sum_{k} e^{s_{i,k} - max_{i}}) */
+            KernelLogSoftmaxComputeByCol<DTYPE, DEFAULT_DTYPE> <<<dim3(gridSize[0], gridSize[1]), dim3(blockSize[0], blockSize[1])>>>
+                                        ((DTYPE*)x->data, maxData, sumData, (DTYPE*)y->data, n, m);
+        }
+    }
+    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
+        int gridSize[3], blockSize[3];
+
+        int n = x->dimSize[0];
+        int m = x->dimSize[1];
+
+        /* allocate the buffer */
+        __half * maxData = (half*)max->data;
+        __half * sumData = (half*)sum->data;
+
+        if (leadDim == 0) {
+            GDevs.GetCudaThread2D(x->devID, n, m, MAX_INT, gridSize, blockSize);
+
+            /* y_{i,j} = log(e^(s_{i,j} - max_{j}) / \sum_{k} e^{s_{k,j} - max_{j}}) */
+            KernelLogSoftmaxComputeByRow<half, X_FLOAT16> <<<dim3(gridSize[1], gridSize[0]), dim3(blockSize[1], blockSize[0])>>>
+                                        ((half*)x->data, maxData, sumData, (half *)y->data, n, m);
         }
         else {
             GDevs.GetCudaThread2D(x->devID, m, n, MAX_INT, gridSize, blockSize);
 
             /* y_{i,j} = log(e^(s_{i,j} - max_{i}) / \sum_{k} e^{s_{i,k} - max_{i}}) */
-            KernelLogSoftmaxComputeByCol << <dim3(gridSize[0], gridSize[1]), dim3(blockSize[0], blockSize[1]) >> >
-                                          ((DTYPE*)x->data, maxData, sumData, (DTYPE*)y->data, n, m);
+            KernelLogSoftmaxComputeByCol<half, X_FLOAT16> <<<dim3(gridSize[0], gridSize[1]), dim3(blockSize[0], blockSize[1])>>>
+                                        ((half*)x->data, maxData, sumData, (half*)y->data, n, m);
         }
     }
     else {
         ShowNTErrors("TODO!");
@@ -200,18 +238,19 @@ set dE/dx = exp(y)
 >> size - size of output
 >> lossName - name of the loss function
 */
+template <class T>
 __global__
-void KernelExpLoss(DTYPE * dedy, DTYPE * dedx, DTYPE * y, int size, LOSS_FUNCTION_NAME lossName)
+void KernelExpLoss(T * dedy, T * dedx, T * y, int size, LOSS_FUNCTION_NAME lossName)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
     if (i < size) {
         /* dE/dx_j = exp(y_j) */
         if (lossName == CROSSENTROPY)
-            dedx[i] = exp(y[i]);
+            dedx[i] = exp(((DTYPE)y[i]));
         /* dE/dx_j = exp(y_j) */
         else if (lossName == SQUAREDERROR)
-            dedx[i] = exp(y[i]);
+            dedx[i] = exp(((DTYPE)y[i]));
         else if (lossName == ONEHOTERROR)
             dedx[i] = 0;
         else
@@ -232,36 +271,59 @@ dE/dx = dE/dy * dy/dx
 >> size - size of input/output
 >> lossName - name of the loss function
 */
+template <class T, TENSOR_DATA_TYPE dataType>
 __global__
-void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
+void KernelLogSoftmaxBackwardDEDS(T * dedy, T * dedx, T * gold, T * y, T * x,
                                   int size, LOSS_FUNCTION_NAME lossName)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
     if (i < size) {
-        DTYPE r = 0;
-
-        /* dE/ds_j = exp(y_j) */
-        if (lossName == CROSSENTROPY)
-            r = -gold[i] + exp(y[i]);
-        /* dE/ds_j = exp(y_j) */
-        else if (lossName == SQUAREDERROR)
-            r = -gold[i] + exp(y[i]);
-        else if (lossName == ONEHOTERROR) {
-            if (gold[i] == 1.0F)
-                r = -gold[i] + exp(y[i]);
-            else
-                r = 0;
-        }
-        else {
-            r = dedy[i];
-        }
-
-        if (isnan(r))
-            r = 0;
-        if (isinf(r))
-            r = 0;
-
-        dedx[i] = r;
+        if (dataType == X_FLOAT) {
+            DTYPE r = 0;
+
+            /* dE/ds_j = exp(y_j) */
+            if (lossName == CROSSENTROPY)
+                r = -(DTYPE)gold[i] + (DTYPE)exp(((DTYPE)y[i]));
+            /* dE/ds_j = exp(y_j) */
+            else if (lossName == SQUAREDERROR)
+                r = -(DTYPE)gold[i] + (DTYPE)exp(((DTYPE)y[i]));
+            else if (lossName == ONEHOTERROR) {
+                if ((DTYPE)gold[i] == 1.0)
+                    r = -(DTYPE)gold[i] + (DTYPE)exp(((DTYPE)y[i]));
+                else
+                    r = 0;
+            }
+            else {
+                r = dedy[i];
+            }
+
+            if (isnan(r))
+                r = 0;
+            if (isinf(r))
+                r = 0;
+
+            dedx[i] = r;
+        }
+        else if (dataType == X_FLOAT16) {
+            half r = 0;
+
+            /* dE/ds_j = exp(y_j) */
+            if (lossName == CROSSENTROPY)
+                r = -(half)gold[i] + (half)hexp(y[i]);
+            /* dE/ds_j = exp(y_j) */
+            else if (lossName == SQUAREDERROR)
+                r = -(half)gold[i] + (half)hexp(y[i]);
+            else if (lossName == ONEHOTERROR) {
+                if ((half)gold[i] == (half)1.0)
+                    r = -(half)gold[i] + (half)hexp(y[i]);
+                else
+                    r = 0;
+            }
+            else {
+                r = dedy[i];
+            }
+
+            dedx[i] = r;
+        }
     }
 }
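The formula in the comments, dE/ds_j = -gold_j + exp(y_j), is the usual gradient of cross-entropy taken through log-softmax; assuming the gold distribution sums to 1:

E = -\sum_j gold_j\, y_j,\qquad y_j = s_j - \log\sum_k e^{s_k}
\;\Rightarrow\;
\frac{\partial E}{\partial s_j}
 = -gold_j + \Big(\sum_k gold_k\Big)\frac{e^{s_j}}{\sum_k e^{s_k}}
 = -gold_j + e^{y_j}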
@@ -282,11 +344,12 @@ dE/dx_j += -gold_j
 >> gNonZeroNum -
 >> lossName - name of the loss function
 */
+template <class T>
 __global__
-void KernelLogSoftmaxBackwardDEDSSparseByRow(DTYPE * dedy, DTYPE * dedx, void * gold, DTYPE * y, DTYPE * x,
+void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T * y, T * x,
                                              int rowNum, int colNum, int gNonZeroNum, LOSS_FUNCTION_NAME lossName)
 {
-    int tupleSize = sizeof(int) + sizeof(DTYPE);
+    int tupleSize = sizeof(int) + sizeof(T);
     int k = blockDim.x * blockIdx.x + threadIdx.x;
 
     if (k < gNonZeroNum) {
@@ -294,7 +357,7 @@ void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T * y, T * x,
         int key = *(int*)((char*)gold + tupleSize * k);
         int ni = key / colNum;
         int mi = key % colNum;
-        int value = *(DTYPE*)((char*)gold + tupleSize * k + sizeof(int));
+        int value = *(T*)((char*)gold + tupleSize * k + sizeof(int));
 
         if (lossName == CROSSENTROPY)
             dedx[colNum * ni + mi] += -value;
@@ -303,7 +366,7 @@ void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T * y, T * x,
         else if (lossName == ONEHOTERROR) {
             int offset = colNum * ni + mi;
             if (value == 1.0F)
-                dedx[offset] += (-value + exp(y[offset]));
+                dedx[offset] += (-value + exp(((DTYPE)y[offset])));
             //dedx[offset] += -value * 0.005;
         }
     }
@@ -383,6 +446,8 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
                   "Tensors used in log softmax are not on the same GPU.");
     CheckNTErrors((gold != NULL), "No x gold standard is found!");
+    CheckNTErrors((lossName == CROSSENTROPY || lossName == SQUAREDERROR || lossName == NOLOSS),
+                  "Unknown loss function.");
 
     int leadDimRDI = y->order - leadDim - 1;
     int dimensionSize = y->dimSizeRDI[leadDimRDI];
@@ -397,10 +462,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     int devIDBackup;
     ProtectCudaDev(x->devID, devIDBackup);
 
     if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
-        CheckNTErrors((lossName == CROSSENTROPY || lossName == SQUAREDERROR || lossName == NOLOSS),
-                      "Unknown loss function.");
-
         int cudaGridSize[3], cudaBlockSize[3];
@@ -411,7 +473,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             GDevs.GetCudaThread(x->devID, x->unitNum, cudaGridSize, cudaBlockSize);
 
             /* dE/ds_j = exp(y_j) */
-            KernelExpLoss <<<dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
+            KernelExpLoss <DTYPE> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
                            (NULL,
                             (DTYPE*)dedx->data,
                             (DTYPE*)y->data,
@@ -421,7 +483,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             GDevs.GetCudaThread(x->devID, gold->unitNumNonZero, cudaGridSize, cudaBlockSize);
 
             /* dE/ds_j += -gold_j */
-            KernelLogSoftmaxBackwardDEDSSparseByRow <<<dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
+            KernelLogSoftmaxBackwardDEDSSparseByRow <DTYPE> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
                                                      (NULL,
                                                       (DTYPE*)dedx->data,
                                                       (char*)gold->data + sizeof(int),
@@ -436,7 +498,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                 GDevs.GetCudaThread(x->devID, blockSize, cudaGridSize, cudaBlockSize);
 
                 /* dE/ds_j = -gold_j + exp(y_j) */
-                KernelLogSoftmaxBackwardDEDS <<<dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
+                KernelLogSoftmaxBackwardDEDS <DTYPE, X_FLOAT> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
                                               (NULL,
                                                (DTYPE*)dedx->data + k * blockSize,
                                                (DTYPE*)gold->data + k * blockSize,
@@ -470,6 +532,76 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             ShowNTErrors("TODO!");
         }
     }
+    else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
+        int cudaGridSize[3], cudaBlockSize[3];
+
+        if (lossName == CROSSENTROPY || lossName == SQUAREDERROR) {
+            if (gold->isSparse) {
+                CheckNTErrors((gold->order == 2), "TODO!")
+                CheckNTErrors((leadDim == 0), "TODO!");
+                GDevs.GetCudaThread(x->devID, x->unitNum, cudaGridSize, cudaBlockSize);
+
+                /* dE/ds_j = exp(y_j) */
+                KernelExpLoss <__half> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
+                               (NULL,
+                                (__half*)dedx->data,
+                                (__half*)y->data,
+                                dimensionSize * stride,
+                                lossName);
+
+                GDevs.GetCudaThread(x->devID, gold->unitNumNonZero, cudaGridSize, cudaBlockSize);
+
+                /* dE/ds_j += -gold_j */
+                KernelLogSoftmaxBackwardDEDSSparseByRow <__half> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
+                                                         (NULL,
+                                                          (__half*)dedx->data,
+                                                          (char*)gold->data + sizeof(int),
+                                                          (__half*)y->data,
+                                                          (__half*)x->data,
+                                                          dedx->dimSize[0], dedx->dimSize[1], gold->unitNumNonZero, lossName);
+            }
+            else {
+                CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
+
+                for (int k = 0; k < blockNum; k++) {
+                    GDevs.GetCudaThread(x->devID, blockSize, cudaGridSize, cudaBlockSize);
+
+                    /* dE/ds_j = -gold_j + exp(y_j) */
+                    KernelLogSoftmaxBackwardDEDS <__half, X_FLOAT16> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
+                                                  (NULL,
+                                                   (__half*)dedx->data + k * blockSize,
+                                                   (__half*)gold->data + k * blockSize,
+                                                   (__half*)y->data + k * blockSize,
+                                                   (__half*)x->data + k * blockSize,
+                                                   dimensionSize * stride, lossName);
+                }
+            }
+
+            if (padding != NULL) {
+                int n = leadDim;
+                int paddingOrder = padding->order;
+                int * paddingDims = new int[paddingOrder];
+                memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
+                padding->Reshape(padding->unitNum);
+
+                int order = dedx->order;
+                int * dims = new int[order];
+                memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
+                dedx->Reshape(dedx->unitNum / dedx->GetDim(n), dedx->GetDim(n));
+                _MultiplyDimMe(dedx, padding, 0);
+
+                padding->Reshape(paddingOrder, paddingDims);
+                dedx->Reshape(order, dims);
+
+                delete[] paddingDims;
+                delete[] dims;
+            }
+        }
+        else {
+            ShowNTErrors("TODO!");
+        }
+    }
     else{
         ShowNTErrors("TODO!");
     }
...