Commit 6a3d713a by linye

no message

parent 30217de4
......@@ -169,6 +169,35 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
unsigned short temp = FloatToFloat16(alpha);
half alpha1 = *((half *)&temp);
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == 0.0F)
KernelMultiplyWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
else
KernelMultiplyWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == 0.0F)
KernelMultiplyWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
else
KernelMultiplyWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
......
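The float16 branch above converts alpha to its binary16 bit pattern with FloatToFloat16, reinterprets it as a half, and launches the same templated kernels as the float path. As a rough sketch of the kernel shape those launches imply (the parameter names and the simplified 1D indexing are assumptions for illustration, not NiuTensor's actual KernelMultiplyWithCol), a column-broadcast multiply could look like this; note that half arithmetic requires compute capability 5.3+:

#include <cuda_fp16.h>

/* sketch only: c = a * b, with b broadcast along one dimension of a,
   optionally accumulating alpha * c when alphaFired is true */
template <class T, bool alphaFired>
__global__
void SketchMultiplyWithCol(T * a, T * b, T * c,
                           int blockSize, int stride,
                           int stepSize, int blockNum, T alpha)
{
    /* stepSize == blockSize * stride; one thread per element of a and c */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= stepSize * blockNum)
        return;

    /* element i of a pairs with element (i / stride) % blockSize of b */
    int j = (i / stride) % blockSize;

    if (alphaFired)
        c[i] = a[i] * b[j] + alpha * c[i];
    else
        c[i] = a[i] * b[j];
}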
......@@ -33,8 +33,9 @@ set each entry to its negative value (CUDA Kernel)
>> b - pointer to the output data array
>> size - size of the data array
*/
template <class T>
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size)
void KernelNegate(T * a, T * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -42,26 +43,26 @@ void KernelNegate(DTYPE * a, DTYPE * b, int size)
b[i] = -a[i];
}
/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
b[i] = __hsub(__float2half(0), a[i]);
#else
if (i < size)
b[i] = __float2half(-__half2float(a[i]));
#endif
}
///*
//set each entry to its negative value (CUDA Kernel)
//This is for float16 computation
//>> a - pointer to the input data array
//>> b - pointer to the output data array
//>> size - size of the data array
//*/
//__global__
//void KernelNegate(__half * a, __half * b, int size)
//{
// int i = blockDim.x * blockIdx.x + threadIdx.x;
//
//#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
// if (i < size)
// b[i] = __hsub(__float2half(0), a[i]);
//#else
// if (i < size)
// b[i] = __float2half(-__half2float(a[i]));
//#endif
//}
/*
set each entry to its negative value
......
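The change above replaces the dedicated __half overload of KernelNegate (now commented out) with a single template. For __half, the statement b[i] = -a[i] relies on the half arithmetic operators that cuda_fp16.h only defines for compute capability 5.3 and up, which is exactly the case the retired overload guarded with __CUDA_ARCH__ >= 530; its pre-5.3 fallback through __float2half(-__half2float(a[i])) is not reproduced by the plain template. A minimal launch sketch for the float16 instantiation (the wrapper, grid size, and block size are placeholders of my own, not NiuTensor code):

#include <cuda_fp16.h>

/* templated kernel as reconstructed from the hunks above */
template <class T>
__global__
void KernelNegate(T * a, T * b, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        b[i] = -a[i];    /* for __half this needs sm_53+ half operators */
}

/* hypothetical host-side wrapper, for illustration only */
void NegateHalfSketch(__half * a, __half * b, int size)
{
    int threads = 256;
    int blocks = (size + threads - 1) / threads;
    KernelNegate<__half> <<<blocks, threads>>> (a, b, size);
}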
......@@ -238,8 +238,9 @@ set dE/dx = exp(y)
>> size - size of output
>> lossName - name of the loss function
*/
template <class T>
__global__
void KernelExpLoss(DTYPE * dedy, DTYPE * dedx, DTYPE * y, int size, LOSS_FUNCTION_NAME lossName)
void KernelExpLoss(T * dedy, T * dedx, T * y, int size, LOSS_FUNCTION_NAME lossName)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -270,36 +271,59 @@ dE/dx = dE/dy * dy/dx
>> size - size of input/output
>> lossName - name of the loss function
*/
template <class T, TENSOR_DATA_TYPE dataType>
__global__
void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x,
void KernelLogSoftmaxBackwardDEDS(T * dedy, T * dedx, T * gold, T * y, T * x,
int size, LOSS_FUNCTION_NAME lossName)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
DTYPE r = 0;
/* dE/ds_j = exp(y_j) */
if (lossName == CROSSENTROPY)
r = -gold[i] + exp(y[i]);
/* dE/ds_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
r = -gold[i] + exp(y[i]);
else if (lossName == ONEHOTERROR) {
if (gold[i] == 1.0F)
r = -gold[i] + exp(y[i]);
else
if (dataType == X_FLOAT) {
DTYPE r = 0;
/* dE/ds_j = exp(y_j) */
if (lossName == CROSSENTROPY)
r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
/* dE/ds_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
else if (lossName == ONEHOTERROR) {
if ((DTYPE)gold[i] == 1.0)
r = -(DTYPE)gold[i] + (DTYPE)exp(y[i]);
else
r = 0;
}
else {
r = dedy[i];
}
if (isnan(r))
r = 0;
if (isinf(r))
r = 0;
}
else {
r = dedy[i];
}
if (isnan(r))
r = 0;
if (isinf(r))
r = 0;
dedx[i] = r;
}
else if (dataType == X_FLOAT16) {
half r = 0;
/* dE/ds_j = exp(y_j) */
if (lossName == CROSSENTROPY)
r = -(half)gold[i] + (half)hexp(y[i]);
/* dE/ds_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
r = -(half)gold[i] + (half)hexp(y[i]);
else if (lossName == ONEHOTERROR) {
if ((half)gold[i] == (half)1.0)
r = -(half)gold[i] + (half)hexp(y[i]);
else
r = 0;
}
else {
r = dedy[i];
}
dedx[i] = r;
dedx[i] = r;
}
}
}
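Both data-type branches write the same quantity, r = exp(y_i) - gold_i, when no special case applies. For reference, that is the gradient of cross entropy taken through log-softmax, assuming the gold distribution sums to one:

% y is the log-softmax output, E the cross-entropy loss, g the gold distribution
y_j = x_j - \log \sum_k e^{x_k}, \qquad E = -\sum_j g_j \, y_j
\frac{\partial E}{\partial x_i}
    = -\sum_j g_j \left( \delta_{ij} - e^{y_i} \right)
    = -g_i + e^{y_i} \sum_j g_j
    = e^{y_i} - g_i

Note that, unlike the float branch, the float16 branch does not zero out NaN or Inf results; a guard there would need the half-precision classification intrinsics (for example __hisnan) rather than isnan/isinf.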
......@@ -320,11 +344,12 @@ dE/dx_j += -gold_j
>> gNonZeroNum - number of non-zero items in the sparse gold tensor
>> lossName - name of the loss function
*/
template <class T>
__global__
void KernelLogSoftmaxBackwardDEDSSparseByRow(DTYPE * dedy, DTYPE * dedx, void * gold, DTYPE * y, DTYPE * x,
void KernelLogSoftmaxBackwardDEDSSparseByRow(T * dedy, T * dedx, void * gold, T * y, T * x,
int rowNum, int colNum, int gNonZeroNum, LOSS_FUNCTION_NAME lossName)
{
int tupleSize = sizeof(int) + sizeof(DTYPE);
int tupleSize = sizeof(int) + sizeof(T);
int k = blockDim.x * blockIdx.x + threadIdx.x;
if (k < gNonZeroNum) {
......@@ -332,7 +357,7 @@ void KernelLogSoftmaxBackwardDEDSSparseByRow(DTYPE * dedy, DTYPE * dedx, void *
int key = *(int*)((char*)gold + tupleSize * k);
int ni = key / colNum;
int mi = key % colNum;
int value = *(DTYPE*)((char*)gold + tupleSize * k + sizeof(int));
T value = *(T*)((char*)gold + tupleSize * k + sizeof(int));
if (lossName == CROSSENTROPY)
dedx[colNum * ni + mi] += -value;
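The sparse gold tensor is consumed as a packed array of (key, value) tuples: each tuple occupies sizeof(int) + sizeof(T) bytes, the key encodes the row-major position (key = ni * colNum + mi), and the launches later in this diff skip one leading int before the tuples ((char*)gold->data + sizeof(int)). A hedged host-side sketch of that layout, with helper and variable names of my own choosing and the leading int assumed to be the non-zero count:

#include <cstring>
#include <vector>

/* packs [int header][int key | T value] x num; illustration only,
   not NiuTensor's actual sparse serializer */
template <class T>
std::vector<char> PackSparseGoldSketch(const int * keys, const T * values, int num)
{
    const int tupleSize = sizeof(int) + sizeof(T);
    std::vector<char> buf(sizeof(int) + num * tupleSize);
    std::memcpy(buf.data(), &num, sizeof(int));                 /* assumed count header */
    for (int k = 0; k < num; k++) {
        char * t = buf.data() + sizeof(int) + k * tupleSize;
        std::memcpy(t, &keys[k], sizeof(int));                  /* key = ni * colNum + mi */
        std::memcpy(t + sizeof(int), &values[k], sizeof(T));    /* gold value */
    }
    return buf;
}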
......@@ -421,6 +446,8 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
CheckNTErrors((x->devID == y->devID && gold->devID == y->devID),
"Tensors used in log softmax are not on the same GPU.");
CheckNTErrors((gold != NULL), "No x gold standard is found!");
CheckNTErrors((lossName == CROSSENTROPY || lossName == SQUAREDERROR || lossName == NOLOSS),
"Unknown loss function.");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
......@@ -435,10 +462,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
CheckNTErrors((lossName == CROSSENTROPY || lossName == SQUAREDERROR || lossName == NOLOSS),
"Unknown loss function.");
if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
int cudaGridSize[3], cudaBlockSize[3];
......@@ -449,7 +473,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
GDevs.GetCudaThread(x->devID, x->unitNum, cudaGridSize, cudaBlockSize);
/* dE/ds_j = exp(y_j) */
KernelExpLoss <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
KernelExpLoss <DTYPE> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(DTYPE*)dedx->data,
(DTYPE*)y->data,
......@@ -459,7 +483,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
GDevs.GetCudaThread(x->devID, gold->unitNumNonZero, cudaGridSize, cudaBlockSize);
/* dE/ds_j += -gold_j */
KernelLogSoftmaxBackwardDEDSSparseByRow <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
KernelLogSoftmaxBackwardDEDSSparseByRow <DTYPE> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(DTYPE*)dedx->data,
(char*)gold->data + sizeof(int),
......@@ -474,7 +498,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
GDevs.GetCudaThread(x->devID, blockSize, cudaGridSize, cudaBlockSize);
/* dE/ds_j = -gold_j + exp(y_j) */
KernelLogSoftmaxBackwardDEDS <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
KernelLogSoftmaxBackwardDEDS <DTYPE, X_FLOAT> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(DTYPE*)dedx->data + k * blockSize,
(DTYPE*)gold->data + k * blockSize,
......@@ -508,6 +532,76 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
ShowNTErrors("TODO!");
}
}
else if (x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16) {
int cudaGridSize[3], cudaBlockSize[3];
if (lossName == CROSSENTROPY || lossName == SQUAREDERROR) {
if (gold->isSparse) {
CheckNTErrors((gold->order == 2), "TODO!")
CheckNTErrors((leadDim == 0), "TODO!");
GDevs.GetCudaThread(x->devID, x->unitNum, cudaGridSize, cudaBlockSize);
/* dE/ds_j = exp(y_j) */
KernelExpLoss <__half> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(__half*)dedx->data,
(__half*)y->data,
dimensionSize * stride,
lossName);
GDevs.GetCudaThread(x->devID, gold->unitNumNonZero, cudaGridSize, cudaBlockSize);
/* dE/ds_j += -gold_j */
KernelLogSoftmaxBackwardDEDSSparseByRow <__half> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(__half*)dedx->data,
(char*)gold->data + sizeof(int),
(__half*)y->data,
(__half*)x->data,
dedx->dimSize[0], dedx->dimSize[1], gold->unitNumNonZero, lossName);
}
else {
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
for (int k = 0; k < blockNum; k++) {
GDevs.GetCudaThread(x->devID, blockSize, cudaGridSize, cudaBlockSize);
/* dE/ds_j = -gold_j + exp(y_j) */
KernelLogSoftmaxBackwardDEDS <__half, X_FLOAT16> <<< dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >>>
(NULL,
(__half*)dedx->data + k * blockSize,
(__half*)gold->data + k * blockSize,
(__half*)y->data + k * blockSize,
(__half*)x->data + k * blockSize,
dimensionSize * stride, lossName);
}
}
if (padding != NULL) {
int n = leadDim;
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedx->order;
int * dims = new int[order];
memcpy(dims, dedx->dimSize, dedx->order * sizeof(int));
dedx->Reshape(dedx->unitNum / dedx->GetDim(n), dedx->GetDim(n));
_MultiplyDimMe(dedx, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedx->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
}
else {
ShowNTErrors("TODO!");
}
}
else {
ShowNTErrors("TODO!");
}
......
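At the end of the float16 branch, gradients of padded positions are zeroed by temporarily flattening dedx to a 2D view of [unitNum / GetDim(n), GetDim(n)] and broadcasting the padding tensor over dimension 0 with _MultiplyDimMe, after which both shapes are restored. A conceptual host-side sketch of that masking step, written in plain C++ with float for brevity and assuming padding holds one 0/1 entry per row of the flattened view:

/* illustration only: zero the gradient rows that correspond to padding */
void MaskPaddedGradientsSketch(float * dedx, const float * padding,
                               int rowNum, int rowSize)
{
    for (int i = 0; i < rowNum; i++)
        for (int j = 0; j < rowSize; j++)
            dedx[i * rowSize + j] *= padding[i];
}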
......@@ -84,7 +84,7 @@ bool Test()
//wrong = !TestDropout() || wrong;
//wrong = !TestHardTanH() || wrong;
//wrong = !TestIdentity() || wrong;
//wrong = !TestLogSoftmax() || wrong;
wrong = !TestLogSoftmax() || wrong;
//wrong = !TestLoss() || wrong;
//wrong = !TestRectify() || wrong;
//wrong = !TestSigmoid() || wrong;
......