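Clean up code: remove the Sub CUDA sources (Sub.cu, Sub.cuh), the SubDim operator sources, and the SubDim tests.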

42543c27 · liyinqiao · eb325d83 · 42543c27 · 42543c27 · eb325d83
Commit 42543c27 authored Feb 11, 2020 by liyinqiao
--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -37,7 +37,6 @@
 #include "arithmetic/Multiply.h"
 #include "arithmetic/MultiplyDim.h"
 #include "arithmetic/Sub.h"
-#include "arithmetic/SubDim.h"
 #include "arithmetic/Sum.h"
 #include "arithmetic/SumDim.h"
 #include "arithmetic/XTensorBLAS.h"

--- a/source/tensor/core/arithmetic/Mask.cu
+++ b/source/tensor/core/arithmetic/Mask.cu
@@ -23,7 +23,6 @@

 #include "../../XDevice.h"
 #include "../../XUtility.h"
-#include "Sub.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -39,7 +38,7 @@ c = a - b * \beta
 >> alpha - value
 */
 __global__
-    void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
+void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;


--- a/source/tensor/core/arithmetic/Sub.cu
+++ b/source/tensor/core/arithmetic/Sub.cu
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
- */
-
-#include "../../XDevice.h"
-#include "../../XUtility.h"
-#include "Sub.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/*
-subtraction of data arrays (CUDA Kernel)
-c = a - b * \beta
->> a - A matrix
->> b - another matrix
->> c - where we put a-b
->> size - the size of a/b/c
->> beta - the coefficient
-*/
-__global__
-void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size)
-        c[i] = a[i] - b[i] * beta;
-}
-
-/*
-tensor subtraction c = a - b * \beta (cuda version)
->> a - a tensor
->> b - another tensor
->> c - where we put a-b*\beta.
->> beta - the scaling factor
-*/
-void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
-{
-    CheckNTErrors(a && b && c, "Empty tensor input!");
-    CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
-                  "Unmatched tensors in addition!");
-    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
-                  "Unmatched tensors in addition!");
-    CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
-                  "The tensors must be on the same!");
-
-    int devIDBackup = XDevice::GetGPUDevice();
-    XDevice::SetGPUDevice(a->devID);
-
-    if (!a->isSparse && !b->isSparse) {
-        CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in addition!");
-
-        if (a->dataType == DEFAULT_DTYPE &&
-            b->dataType == DEFAULT_DTYPE &&
-            c->dataType == DEFAULT_DTYPE)
-        {
-            int gridSize[3], blockSize[3];
-
-            GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
-            dim3 blocks(gridSize[0]);
-            dim3 threads(blockSize[0]);
-            KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
-        }
-        else {
-            // TODO!!
-            ShowNTErrors("TODO!");
-        }
-    }
-    else {
-        // TODO!!
-        ShowNTErrors("TODO!");
-    }
-
-    XDevice::SetGPUDevice(devIDBackup);
-}
-
-/* subtraction over arrays
-tensor subtraction c = a - b * \beta (cuda version) with an input handle
->> devID - device ID (MUST >= 0)
->> handle - cuda handle
->> a - an array
->> b - another array
->> c - where we put a-b
->> size - size of the array
->> beta - the coefficient
-*/
-void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
-{
-    if (size == 0)
-        return;
-
-    if (c == NULL)
-        c = a;
-
-    CheckNTErrors((a && b && c), "Empty arrays in addition!");
-
-    int devIDBackup;
-    ProtectCudaDev(devID, devIDBackup);
-
-    if (c == a) {
-#ifdef DOUBELPRICSION
-        cublasDaxpy(*handle, size, &beta, b, 1, a, 1);
-#else
-        cublasSaxpy(*handle, size, &beta, b, 1, a, 1);
-#endif
-    }
-    else {
-        int gridSize[3], blockSize[3];
-
-        GDevs.GetCudaThread(devID, size, gridSize, blockSize);
-
-        dim3 blocks(gridSize[0]);
-        dim3 threads(blockSize[0]);
-
-        KernelSUB<<<blocks, threads>>>((DTYPE*)a, (DTYPE*)b, (DTYPE*)c, size, beta);
-    }
-
-    BacktoCudaDev(devID, devIDBackup);
-}
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Sub.cuh
+++ b/source/tensor/core/arithmetic/Sub.cuh
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
- */
-
-#ifndef __SUB_CUH__
-#define __SUB_CUH__
-
-#include "Sub.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/* subtraction of data arrays (CUDA Kernel) */
-__global__
-void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
-
-/* tensor subtraction c = a - b * \beta (cuda version) */
-void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
-
-/*  tensor subtraction c = a - b * \beta (cuda version) with an input handle */
-void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SUB_CUH__
--- a/source/tensor/core/arithmetic/SubDim.cpp
+++ b/source/tensor/core/arithmetic/SubDim.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
-*/
-
-#include <math.h>
-#include "Sub.h"
-#include "SubDim.h"
-#include "SubDim.cuh"
-#include "../../XName.h"
-#include "../../XUtility.h"
-#include "../movement/CopyValues.h"
-#include "../shape/IsSameShaped.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-tensor subtraction
-
-c = a - b * \beta
-where the size of b is equal to the n-th dimension of a,
-i.e., a is subtracted with b by broadcasting
-
->> a - a tensor
->> b - another tensor whose size is equal to that of dimension n of a
->> c - where we put a-b*\beta. we save it in a if c is NULL
->> n - the dimension index
->> beta - the scaling factor
-*/
-void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
-{
-    n = MODX(n, a->order);
-
-    CheckNTErrors(a && b && c, "Empty tensor input!");
-    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
-    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
-                  "Unmatched data types in subtraction!");
-    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
-    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
-    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
-
-    CheckDev(a->devID, b->devID);
-
-    if (beta == 0) {
-        _CopyValues(a, c);
-        return;
-    }
-
-    if (_IsSameShaped(a, b)) {
-        _Sub(a, b, c, beta);
-        return;
-    }
-
-    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
-#ifdef USE_CUDA
-        _CudaSubDim(a, b, c, n, beta);
-#else
-        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
-#endif
-    }
-    else {
-        int stride = 1;
-        int blockSize = a->dimSize[n];
-        int blockNum = 1;
-
-        for (int i = a->order - 1; i >= 0; i--) {
-            if (i > n)
-                stride *= a->dimSize[i];
-            else if (i < n)
-                blockNum *= a->dimSize[i];
-        }
-
-        if (a->dataType == DEFAULT_DTYPE) {
-            int num = a->unitNum;
-            if (stride > 1) {
-                for (int i = 0, j = 0; i < num; i += stride, j++) {
-                    DTYPE * ap = (DTYPE*)a->data + i;
-                    DTYPE   bv = *((DTYPE*)b->data + j % blockSize) * beta;
-                    DTYPE * cp = (DTYPE*)c->data + i;
-                    for (int k = 0; k < stride; k++)
-                        cp[k] = ap[k] - bv;
-                }
-            }
-            else if (stride == 1) {
-                DTYPE * bp = (DTYPE*)b->data;
-                for (int i = 0; i < num; i += blockSize) {
-                    DTYPE * ap = (DTYPE*)a->data + i;
-                    DTYPE * cp = (DTYPE*)c->data + i;
-                    if (beta == 1.0F) {
-                        for (int j = 0; j < blockSize; j++)
-                            cp[j] = ap[j] - bp[j];
-                    }
-                    else {
-                        for (int j = 0; j < blockSize; j++)
-                            cp[j] = ap[j] - bp[j] * beta;
-                    }
-                }
-            }
-            else {
-                ShowNTErrors("Something is wrong!");
-            }
-        }
-        else {
-            ShowNTErrors("TODO!");
-        }
-    }
-}
-
-/*
-tensor subtraction (do it on site)
-keep the result in the input tensor and return nothing
-
-c = a - b * \beta
-where the size of b is equal to the n-th dimension of a,
-i.e., a is subtracted with b by broadcasting
-
->> a - a tensor
->> b - another tensor whose size is equal to that of dimension n of a
->> n - the dimension index
->> beta - the scaling factor
-*/
-void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
-{
-    _SubDim(a, b, a, n, beta);
-}
-
-/*
-tensor subtraction (return an XTensor structure and make tensor connections)
-make a new tensor to keep the result and return it
-
-c = a - b * \beta
-where the size of b is equal to the n-th dimension of a,
-i.e., a is subtracted with b by broadcasting
-
->> a - a tensor
->> b - another tensor whose size is equal to that of dimension n of a
->> n - the dimension index
->> beta - the scaling factor
-<< return - the result tensor by tensor subtraction
-*/
-XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
-{
-    XTensor c(&a);
-    c.SetTMPFlag();
-
-    n = MODX(n, a.order);
-
-    /* call _Sub function */
-    _SubDim(&a, &b, &c, n, beta);
-
-    /* tensor connections */
-    if (a.enableGrad && b.enableGrad) {
-        XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
-        XLink::AddParamToHeadInt(&c, n);
-        XLink::AddParamToHead(&c, beta);
-    }
-
-    return c;
-}
-
-/*
-tensor subtraction
-
-c = a - b * \beta
-where the size of b is equal to the n-th dimension of a,
-i.e., a is subtracted with b by broadcasting
-
->> a - a tensor
->> b - another tensor whose size is equal to that of dimension n of a
->> c - where we put a-b*\beta. we save it in a if c is NULL
->> n - the dimension index
->> beta - the scaling factor
-*/
-void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
-{
-    if (!c.isInit || !IsSameShaped(a, c)) {
-        InitTensorV2(&c, &a);
-    }
-
-    /* call _Sub function */
-    _SubDim(&a, &b, &c, n, beta);
-
-    if (a.enableGrad && b.enableGrad) {
-        /* tensor connections */
-        XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
-        XLink::AddParamToHeadInt(&c, n);
-        XLink::AddParamToHead(&c, beta);
-    }
-}
-
-}
--- a/source/tensor/core/arithmetic/SubDim.cu
+++ b/source/tensor/core/arithmetic/SubDim.cu
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
-*/
-
-#include "SubDim.cuh"
-#include "../../XDevice.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/*
-tensor subtraction of a tensor and a row vector
-c = a - b * \beta
-where a is a tensor and b is a row vector
->> a - pointer to the data array of a
->> b - pointer to the data array of b
->> c - pointer to the data array of c
->> rowNum - number of rows of a and c
->> colNum - number of columns of a and c (i.e., the size of b)
->> beta - the scaling factor
-*/
-template <class T, bool betaFired>
-__global__
-void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
-{
-    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    int col = blockDim.x * blockIdx.x + threadIdx.x;
-    int row = blockDim.y * blockIdx.y + threadIdx.y;
-
-    if (col >= colNum || row >= rowNum)
-        return;
-
-    if (threadIdx.y == 0)
-        bv[threadIdx.x] = b[col];
-
-    __syncthreads();
-
-    int offset = colNum * row + col;
-    if (betaFired)
-        c[offset] = a[offset] - bv[threadIdx.x] * beta;
-    else
-        c[offset] = a[offset] - bv[threadIdx.x];
-}
-
-/*
-tensor subtraction of a tensor and a colum vector
-c = a - b * \beta
-where a is a tensor and b is a colum vector
->> a - pointer to the data array of a
->> b - pointer to the data array of b
->> c - pointer to the data array of c
->> rowNum - number of rows of a and c (i.e., the size of b)
->> colNum - number of columns of a and c
->> blockNum - size of a block (matrix), i.e., rowNum * colNum
->> blockNum - number of matrics
->> beta - the scaling factor
-*/
-template <class T, bool betaFired>
-__global__
-void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
-{
-    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-
-    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int row = blockDim.y * blockIdx.y + threadIdx.y;
-
-    int col = colIndex % colNum;
-    int block = colIndex / colNum;
-
-    if (row >= rowNum || block >= blockNum)
-        return;
-
-    if (threadIdx.x == 0)
-        bv[threadIdx.y] = b[row];
-
-    __syncthreads();
-
-    int offset = block * blockSize + row * colNum + col;
-
-    if (betaFired)
-        c[offset] = a[offset] - bv[threadIdx.y] * beta;
-    else
-        c[offset] = a[offset] - bv[threadIdx.y];
-}
-
-/*
-tensor subtraction (cuda version)
-
-c = a - b * \beta
-where the size of b is equal to the n-th dimension of a,
-i.e., a is subtracted with b by broadcasting
-
->> a - a tensor
->> b - another tensor whose size is equal to that of dimension n of a
->> c - where we put a+b*\beta. we save it in a if c is NULL
->> n - the dimension index
->> beta - the scaling factor
-*/
-void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
-{
-    CheckNTErrors(a && b && c, "Empty tensor input!");
-    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
-    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
-                  "Unmatched data types in subtraction!");
-    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
-    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
-    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
-
-    int stride = 1;
-    int blockSize = a->dimSize[n];
-    int blockNum = 1;
-
-    for (int i = a->order - 1; i >= 0; i--) {
-        if (i > n)
-            stride *= a->dimSize[i];
-        else if (i < n)
-            blockNum *= a->dimSize[i];
-    }
-
-    int cudaGrids[3];
-    int cudaBlocks[3];
-
-    int devIDBackup = 0;
-    ProtectCudaDev(a->devID, devIDBackup);
-
-    if (a->dataType == DEFAULT_DTYPE) {
-        if (stride > 1) {
-            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
-            if (beta == (DTYPE)1.0F)
-                KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
-                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-                                                  blockSize, stride, blockSize * stride, blockNum, beta);
-            else
-                KernelSubWithCol<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
-                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-                                                  blockSize, stride, blockSize * stride, blockNum, beta);
-        }
-        else if (stride == 1) {
-            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-            if (beta == (DTYPE)1.0F)
-                KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-                                                  blockNum, blockSize, beta);
-            else
-                KernelSubWithRow<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-                                                  blockNum, blockSize, beta);
-        }
-        else {
-            ShowNTErrors("Something is wrong!");
-        }
-    }
-    else {
-        ShowNTErrors("TODO!");
-    }
-
-    BacktoCudaDev(a->devID, devIDBackup);
-}
-
-#endif
-
-} // namespace nts(NiuTrans.Tensor)
-
--- a/source/tensor/core/arithmetic/SubDim.cuh
+++ b/source/tensor/core/arithmetic/SubDim.cuh
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
-*/
-
-#ifndef __SUBDIM_CUH__
-#define __SUBDIM_CUH__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
-   i.e., a is subtracted with b by broadcasting (cuda version) */
-void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
-
-#endif
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SUBDIM_CUH__
--- a/source/tensor/core/arithmetic/SubDim.h
+++ b/source/tensor/core/arithmetic/SubDim.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
-*/
-
-#ifndef __SUBDIM_H__
-#define __SUBDIM_H__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a, 
-   i.e., a is subtracted with b by broadcasting*/
-void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
-
-/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a, 
-   i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
-void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
-
-/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
-   i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
-XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
-
-/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a, 
-   i.e., a is subtracted with b by broadcasting*/
-void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SUBDIM_H__
--- a/source/tensor/test/TSubDim.cpp
+++ b/source/tensor/test/TSubDim.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
-*/
-
-#include "../core/utilities/CheckData.h"
-#include "../core/arithmetic/SubDim.h"
-#include "../XTensor.h"
-#include "TSubDim.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-case 1: tensor subtraction c = a - b * \beta
-where the size of b is equal to the n-th dimension of a,
-i.e., a is subtracted with b by broadcasting
-*/
-bool TestSubDim1()
-{
-    /* a tensor of size (2, 4) */
-    int aOrder = 2;
-    int * aDimSize = new int[aOrder];
-    aDimSize[0] = 2;
-    aDimSize[1] = 4;
-
-    int aUnitNum = 1;
-    for (int i = 0; i < aOrder; i++)
-        aUnitNum *= aDimSize[i];
-
-    /* a tensor of size (2) */
-    int bOrder = 1;
-    int * bDimSize = new int[bOrder];
-    bDimSize[0] = 2;
-
-    int bUnitNum = 1;
-    for (int i = 0; i < bOrder; i++)
-        bUnitNum *= bDimSize[i];
-
-    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                          {4.0F, 5.0F, 6.0F, 7.0F} };
-    DTYPE bData[2] = {1.0F, -1.0F};
-    DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
-                           {5.0F, 6.0F, 7.0F, 8.0F} };
-
-    /* CPU test */
-    bool cpuTest = true;
-
-    /* create tensors */
-    XTensor * a = NewTensorV2(aOrder, aDimSize);
-    XTensor * b = NewTensorV2(bOrder, bDimSize);
-    XTensor * c = NewTensorV2(aOrder, aDimSize);
-    XTensor * cMe = NewTensorV2(aOrder, aDimSize);
-    XTensor cUser;
-
-    /* initialize variables */
-    a->SetData(aData, aUnitNum);
-    cMe->SetData(aData, aUnitNum);
-    b->SetData(bData, bUnitNum);
-    c->SetZeroAll();
-
-    /* call SubDim function */
-    _SubDim(a, b, c, 0);
-    _SubDim(cMe, b, 0);
-    cUser = SubDim(*a, *b, 0);
-
-    /* check results */
-    cpuTest = _CheckData(c, answer, aUnitNum) &&
-              _CheckData(cMe, answer, aUnitNum) &&
-              _CheckData(&cUser, answer, aUnitNum);
-
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-
-    /* create tensor */
-    XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor cUserGPU;
-
-    /* Initialize variables */
-    aGPU->SetData(aData, aUnitNum);
-    cMeGPU->SetData(aData, aUnitNum);
-    bGPU->SetData(bData, bUnitNum);
-    cGPU->SetZeroAll();
-
-    /* call sub function */
-    _SubDim(aGPU, bGPU, cGPU, 0);
-    _SubDim(cMeGPU, bGPU, 0);
-    cUserGPU = SubDim(*aGPU, *bGPU, 0);
-
-    /* check results */
-    gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
-              _CheckData(cMeGPU, answer, aUnitNum) &&
-              _CheckData(&cUserGPU, answer, aUnitNum);
-
-    /* destroy variables */
-    delete a;
-    delete b;
-    delete c;
-    delete cMe;
-    delete aGPU;
-    delete bGPU;
-    delete cGPU;
-    delete cMeGPU;
-    delete[] aDimSize;
-    delete[] bDimSize;
-
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete a;
-    delete b;
-    delete c;
-    delete cMe;
-    delete[] aDimSize;
-    delete[] bDimSize;
-
-    return cpuTest;
-#endif // USE_CUDA
-}
-
-/*
-case 2: tensor subtraction c = a - b * \beta
-where the size of b is equal to the n-th dimension of a,
-i.e., a is subtracted with b by broadcasting
-*/
-bool TestSubDim2()
-{
-    /* a tensor of size (2, 4) */
-    int aOrder = 2;
-    int * aDimSize = new int[aOrder];
-    aDimSize[0] = 2;
-    aDimSize[1] = 4;
-
-    int aUnitNum = 1;
-    for (int i = 0; i < aOrder; i++)
-        aUnitNum *= aDimSize[i];
-
-    /* a tensor of size (2, 2) */
-    int bOrder = 2;
-    int * bDimSize = new int[bOrder];
-    bDimSize[0] = 2;
-    bDimSize[1] = 2;
-
-    int bUnitNum = 1;
-    for (int i = 0; i < bOrder; i++)
-        bUnitNum *= bDimSize[i];
-
-    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                          {4.0F, 5.0F, 6.0F, 7.0F} };
-    DTYPE bData[2][2] = { {1.0F, -1.0F},
-                          {-1.0F, 1.0F} };
-    DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
-                           {3.0F, 6.0F, 7.0F, 6.0F} };
-
-    /* CPU test */
-    bool cpuTest = true;
-
-    /* create tensors */
-    XTensor * a = NewTensorV2(aOrder, aDimSize);
-    XTensor * b = NewTensorV2(bOrder, bDimSize);
-    XTensor * c = NewTensorV2(aOrder, aDimSize);
-    XTensor * cMe = NewTensorV2(aOrder, aDimSize);
-    XTensor cUser;
-
-    /* initialize variables */
-    a->SetData(aData, aUnitNum);
-    cMe->SetData(aData, aUnitNum);
-    b->SetData(bData, bUnitNum);
-    c->SetZeroAll();
-
-    /* call SubDim function */
-    _SubDim(a, b, c, 1);
-    _SubDim(cMe, b, 1);
-    cUser = SubDim(*a, *b, 1);
-
-    /* check results */
-    cpuTest = _CheckData(c, answer, aUnitNum) &&
-              _CheckData(cMe, answer, aUnitNum) &&
-              _CheckData(&cUser, answer, aUnitNum);
-
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-
-    /* create tensor */
-    XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor cUserGPU;
-
-    /* Initialize variables */
-    aGPU->SetData(aData, aUnitNum);
-    cMeGPU->SetData(aData, aUnitNum);
-    bGPU->SetData(bData, bUnitNum);
-    cGPU->SetZeroAll();
-
-    /* call sub function */
-    _SubDim(aGPU, bGPU, cGPU, 1);
-    _SubDim(cMeGPU, bGPU, 1);
-    cUserGPU = SubDim(*aGPU, *bGPU, 1);
-
-    /* check results */
-    gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
-              _CheckData(cMeGPU, answer, aUnitNum) &&
-              _CheckData(&cUserGPU, answer, aUnitNum);
-
-    /* destroy variables */
-    delete a;
-    delete b;
-    delete c;
-    delete cMe;
-    delete aGPU;
-    delete bGPU;
-    delete cGPU;
-    delete cMeGPU;
-    delete[] aDimSize;
-    delete[] bDimSize;
-
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete a;
-    delete b;
-    delete c;
-    delete cMe;
-    delete[] aDimSize;
-    delete[] bDimSize;
-
-    return cpuTest;
-#endif // USE_CUDA
-}
-
-/* other cases */
-/*
-TODO!!
-*/
-
-/* test for SubDim Function */
-bool TestSubDim()
-{
-    XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
-    bool returnFlag = true, caseFlag = true;
-
-    /* case 1 test */
-    caseFlag = TestSubDim1();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 1 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 1 passed!\n");
-
-    /* case 2 test */
-    caseFlag = TestSubDim2();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 2 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 2 passed!\n");
-
-    /* other cases test */
-    /*
-    TODO!!
-    */
-
-    if (returnFlag) {
-        XPRINT(0, stdout, ">> All Passed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> Failed!\n");
-
-    XPRINT(0, stdout, "\n");
-
-    return returnFlag;
-}
-
-} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TSubDim.h
+++ b/source/tensor/test/TSubDim.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
-*/
-
-#ifndef __TEST_SUBDIM_H__
-#define __TEST_SUBDIM_H__
-
-#include "../core/arithmetic/SubDim.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* test for SubDim Function */
-bool TestSubDim();
-
-} // namespace nts(NiuTrans.Tensor)
-#endif // __TEST_SUBDIM_H__