Commit 229db8c6 by xiaotong

improve the implementation of Unsqueeze

parent fe90b454
......@@ -1323,8 +1323,8 @@ gradient for reduceSumSquared
for
c = \sum_i (a_i - b)^2
we have
dE/da = Unsqueeze(dE/dc) * 2a
dE/db = dE/dc * -2 * n * b
dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b)
dE/db = dE/dc * -2 * \sum_i (a_i - b)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
......@@ -1352,12 +1352,12 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
_Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da = Unsqueeze(dE/dc) * 2(a-b) */
/* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) */
_ScaleAndShiftMe(d, 2.0F);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(d, e, a->grad, 1.0F);
/* dE/db = dE/dc * -2 * (a-b*n) */
/* dE/db = dE/dc * -2 * \sum_i (a_i - b) */
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
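A quick derivation sketch for the formulas above: with
c = \sum_i (a_i - b)^2
we get
dc/da_i = 2 * (a_i - b)
dc/db = -2 * \sum_i (a_i - b)
so dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b), computed by scaling d (= a_i - b) by 2.0F and multiplying it by the unsqueezed node->grad, and dE/db = dE/dc * -2 * \sum_i (a_i - b), computed by scaling f (the ReduceSum of a_i - b over dim) by -2.0F.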
......@@ -1375,8 +1375,8 @@ for
c = (\sum_i (a_i - b)^2) / n
where b is the mean, and n is the size of a
we have
dE/da = Unsqueeze(dE/dc) * 2a/n
dE/db = dE/dc * -2 * b
dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b)/n
dE/db = dE/dc * -2 * \sum_i (a_i - b) / n
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
......@@ -1404,12 +1404,12 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
_Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da = Unsqueeze(dE/dc) * 2 (a-b) / n */
/* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) / n */
_ScaleAndShiftMe(d, 2.0F / n);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(d, e, a->grad, 1.0F);
/* dE/db = dE/dc * -2 * (a-b) */
/* dE/db = dE/dc * -2 * \sum_i (a_i - b) / n */
_ScaleAndShiftMe(f, -2.0F / n);
_Multiply(node->grad, f, b->grad, 1.0F);
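The same sketch for the variance case: with
c = (\sum_i (a_i - b)^2) / n
we get
dc/da_i = 2 * (a_i - b) / n
dc/db = -2 * \sum_i (a_i - b) / n
which matches the scale factors 2.0F / n and -2.0F / n applied to d and f above.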
......
......@@ -127,7 +127,7 @@ insert a dimension by copying the blocks for n times (where n is the size of the
>> s - pointer to the source data array
>> blockSize - size of a block
>> blockNum - number of the blocks
>> totalSize - total size of the blocks (i.e., blockSIze * n)
>> totalSize - total size of the blocks (i.e., blockSize * n)
>> t - pointer to the target data array
>> n - number of blocks to copy data
*/
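For reference, a serial sketch of what this kernel computes (the function name and float element type here are illustrative, not part of the library):
void UnsqueezeBlocksCPU(const float * s, int blockSize, int blockNum, int n, float * t)
{
    /* each source block of blockSize elements is written n times in a row,
       so the target advances by totalSize = blockSize * n per source block */
    for (int b = 0; b < blockNum; b++) {
        const float * src = s + b * blockSize;
        float * dst = t + b * blockSize * n;
        for (int k = 0; k < n; k++)
            for (int i = 0; i < blockSize; i++)
                dst[k * blockSize + i] = src[i];
    }
}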
......@@ -155,6 +155,75 @@ void KernelUnsqueeze(void * s, int blockSize, int blockNum, int totalSize, void
}
/*
insert a dimension by copying the blocks for n times (where n is the size of the inserted dimension)
This is a special case where we actually copy a v-dimensional column vector n times to form a v * n matrix
>> s - pointer to the source data array
>> rowNum - number of rows (i.e., dimension size of s)
>> colNum - number of columns (i.e., number of copies)
>> t - pointer to the target data array
*/
template<class T>
__global__
void KernelUnsqueezeByCol(void * s, int rowNum, int colNum, void * t)
{
__shared__ T values[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ T * ts[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* column index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* row index */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= colNum || j >= rowNum)
return;
if(threadIdx.x == 0){
values[threadIdx.y] = ((T*)s)[j];
ts[threadIdx.y] = (T*)t + colNum * j;
}
__syncthreads();
ts[threadIdx.y][i] = values[threadIdx.y];
}
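A serial reference for this column case (again with an illustrative name and float type):
void UnsqueezeByColCPU(const float * s, int rowNum, int colNum, float * t)
{
    /* each source element s[j] fills row j of the rowNum * colNum target */
    for (int j = 0; j < rowNum; j++)
        for (int i = 0; i < colNum; i++)
            t[j * colNum + i] = s[j];
}
The kernel above parallelizes this by letting thread (i, j) write one target element; s[j] and the row pointer are staged in shared memory so each row value is read from global memory once per thread block rather than once per thread.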
/*
insert a dimension by copying the blocks for n times (where n is the size of the inserted dimension)
This is a special case where we actually copy a v-dimensional column vector n times to form a v * n matrix,
and a row is so large that it occupies all the CUDA threads of a block
>> s - pointer to the source data array
>> rowNum - number of rows (i.e., dimension size of s)
>> colNum - number of columns (i.e., number of copies)
>> t - pointer to the target data array
*/
template<class T>
__global__
void KernelUnsqueezeByColBigRow(void * s, int rowNum, int colNum, void * t)
{
__shared__ T value;
__shared__ T * tData;
/* column index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* row index */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= colNum || j >= rowNum)
return;
if (threadIdx.x == 0) {
value = ((T*)s)[j];
tData = (T*)t + colNum * j;
}
__syncthreads();
tData[i] = value;
}
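When a row is wide enough that a whole thread block maps onto a single row (the dispatch below launches this variant when cudaBlocks[1] == 1), only one value needs to be staged per block; the serial analogue is a plain row fill (illustrative name):
void FillRowCPU(const float * s, int j, int colNum, float * t)
{
    /* write s[j] across all colNum columns of row j */
    float value = s[j];
    float * row = t + j * colNum;
    for (int i = 0; i < colNum; i++)
        row[i] = value;
}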
/*
insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension)
>> a - input tensor
>> b - output tensor
......@@ -181,14 +250,39 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if(blockNumA > 1){
if (dimRDI == 0) {
GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
if (cudaBlocks[1] == 1)
KernelUnsqueezeByColBigRow<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockNumA, dSize, b->data);
else
KernelUnsqueezeByCol<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockNumA, dSize, b->data);
}
else if (a->dataType == X_INT && b->dataType == X_INT) {
if (cudaBlocks[1] == 1)
KernelUnsqueezeByColBigRow<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockNumA, dSize, b->data);
else
KernelUnsqueezeByCol<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockNumA, dSize, b->data);
}
else {
ShowNTErrors("TODO!");
}
}
else if(blockNumA > 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
KernelUnsqueeze<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockNumA, blockSize * dSize, b->data, dSize);
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
else if (a->dataType == X_INT && b->dataType == X_INT) {
KernelUnsqueeze<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockNumA, blockSize * dSize, b->data, dSize);
}
......@@ -199,11 +293,11 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
else if(blockNumA == 1 && blockSize < MAX_CUDA_THREAD_NUM_PER_BLOCK){
GDevs.GetCudaThread2D(a->devID, blockSize, dSize, MAX_CUDA_THREAD_NUM_PER_BLOCK/4, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
KernelUnsqueezeFlat2D<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
else if (a->dataType == X_INT && b->dataType == X_INT) {
KernelUnsqueezeFlat2D<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
......@@ -214,11 +308,11 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
else if(blockNumA == 1 && blockSize % 2 == 0){
GDevs.GetCudaThread(a->devID, blockSize/2, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
KernelUnsqueezeFlatBigram<float> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
else if (a->dataType == X_INT && b->dataType == X_INT) {
KernelUnsqueezeFlatBigram<int> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
......@@ -229,11 +323,11 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
else if(blockNumA == 1){
GDevs.GetCudaThread(a->devID, blockSize, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
KernelUnsqueezeFlat<float> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
else if (a->dataType == X_INT && b->dataType == X_INT) {
KernelUnsqueezeFlat<int> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
......