rename SumFilled as SumDim and add the new code

454bd870 · xiaotong · dcba416c · 454bd870 · 454bd870 · 454bd870
Commit 454bd870 authored Jul 29, 2018 by xiaotong
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
@@ -31,6 +31,7 @@
 //#include <crtdbg.h>

 void TransposeTest();
+void SumDimTest();

 using namespace nts;
 using namespace samplefnnlm;
@@ -40,6 +41,9 @@ int main( int argc, const char ** argv )
    //TransposeTest();
    //return 0;

+    SumDimTest();
+    return 0;
+
    if(argc > 1 && !strcmp(argv[1], "-test"))
        1;//Test();
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
@@ -94,6 +98,7 @@ void TransposeTest()
    //XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
    XTensor x;
    XTensor y;
+    XTensor z;

    int loops = 2000;

@@ -106,19 +111,25 @@ void TransposeTest()

    InitTensor3D(&x, B, N, H, X_FLOAT, 0);
    InitTensor4D(&y, K, B, N, H/K, X_FLOAT, 0);
+    InitTensor3D(&z, B, N, H, X_FLOAT, 0);

    cudaEvent_t ctime0;
    cudaEvent_t ctime1;
    cudaEvent_t ctime2;
    cudaEvent_t ctime3;
+    cudaEvent_t ctime4;
+    cudaEvent_t ctime5;

    float elapsedSplit = 0.0;
    float elapsedMerge = 0.0;
+    float elapsedSum = 0.0;

    cudaEventCreate(&ctime0);
    cudaEventCreate(&ctime1);
    cudaEventCreate(&ctime2);
    cudaEventCreate(&ctime3);
+    cudaEventCreate(&ctime4);
+    cudaEventCreate(&ctime5);

    cudaEventRecord(ctime0, 0);

@@ -142,7 +153,57 @@ void TransposeTest()
    cudaEventSynchronize(ctime3);
    cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);

-    fprintf(stderr, "split:%f merge:%f\n", time1 - time0, time3 - time2);
-    fprintf(stderr, "split:%f merge:%f\n", elapsedSplit, elapsedMerge);
+    cudaEventRecord(ctime4, 0);
+
+    double time4 = GetClock();
+    for(int i = 0; i < loops; i++)
+        _Sum(&x, &z, &x);
+    double time5 = GetClock();
+
+    cudaEventRecord(ctime5, 0);
+    cudaEventSynchronize(ctime5);
+    cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);
+
+    fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
+    fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);
 #endif
 }
+
+/*
+a quick sanity check of _SumDim: a vector y of size c is summed with a
+3D tensor x of size (a, b, c) along dimension 2 (the last one). the
+result is kept in z and dumped to stderr
+*/
+void SumDimTest()
+{
+    XTensor x;
+    XTensor y;
+    XTensor z;
+
+    /* dimension sizes of x and z. y has the size of the last dimension */
+    int a = 5;
+    int b = 7;
+    int c = 3;
+
+    XDevice::SetGPUDevice(0);
+
+    /* NOTE(review): the trailing 0 is presumably the device id - confirm against InitTensor3D */
+    InitTensor3D(&x, a, b, c, X_FLOAT, 0);
+    InitTensor1D(&y, c, X_FLOAT, 0);
+    InitTensor3D(&z, a, b, c, X_FLOAT, 0);
+
+    x.SetZeroAll();
+    y.SetZeroAll();
+    z.SetZeroAll();
+
+    /* one host buffer is reused to fill both x and y (y.unitNum <= x.unitNum here).
+       it is a float buffer because the tensors are created as X_FLOAT */
+    float * data = new float[x.unitNum];
+
+    /* x = 0, 1, 2, ... */
+    for(int i = 0; i < x.unitNum; i++)
+        data[i] = (float)i;
+    x.SetData(data, x.unitNum);
+
+    /* y = 0, -1, -2, ... */
+    for(int i = 0; i < y.unitNum; i++)
+        data[i] = -(float)i;
+    y.SetData(data, y.unitNum);
+
+    /* z = x + y, where y is broadcast along dimension 2 of x */
+    _SumDim(&x, &y, &z, 2);
+
+    z.Dump(stderr, "z:");
+
+    delete[] data;
+}
--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -45,7 +45,7 @@ namespace nts{
 struct XLink;

 /* define the maximum number of dimensions in a tensor */
-#define MAX_TENSOR_DIM_NUM 6
+#define MAX_TENSOR_DIM_NUM 8
 #define USE_BATCHED_STRIDED_MAT_MUL
 #define MIN_TENSOR_SPLIT_NUM 0
 #define MIN_TENSOR_SPLIT_LIST_NUM 1024

--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
@@ -175,29 +175,38 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
        return;
    }
 #ifdef USE_CUDA
-    else if(devIDT >= 0 && devIDS < 0){
-        cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyHostToDevice);
-        if(error != cudaSuccess){
-            ShowNTErrors("cudaMemcpy error (cudaMemcpyHostToDevice)");
-        }
-    }
-    else if(devIDT < 0 && devIDS >= 0){
-        cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost);
-        if(error != cudaSuccess){
-            ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
-        }
-    }
    else{
-        //if(devIDT == devIDS){
-            cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToDevice);
+        int devID = devIDT < 0 ? devIDS : devIDT;
+        int devIDBackup = 0;
+        cudaGetDevice(&devIDBackup);
+        cudaSetDevice(devID);
+
+        if(devIDT >= 0 && devIDS < 0){
+            cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyHostToDevice);
            if(error != cudaSuccess){
-                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyHostToDevice)");
+            }
+        }
+        else if(devIDT < 0 && devIDS >= 0){
+            cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
            }
-        /*}
+        }
        else{
-            CheckNTErrors((cudaMemcpyPeer(t, devIDT, s, devIDS, size) == cudaSuccess),
-                                "cudaMemcpy error (cudaMemcpyDeviceToDevice)");
-        }*/
+            //if(devIDT == devIDS){
+                cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToDevice);
+                if(error != cudaSuccess){
+                    ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+                }
+            /*}
+            else{
+                CheckNTErrors((cudaMemcpyPeer(t, devIDT, s, devIDS, size) == cudaSuccess),
+                                    "cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+            }*/
+        }
+
+        cudaSetDevice(devIDBackup);
    }
 #else
    ShowNTErrors("Please specify USE_CUDA and recompile the code!");
@@ -270,23 +279,32 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
        return;
    }
 #ifdef USE_CUDA
-    else if (devIDT >= 0 && devIDS < 0) {
-        cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice);
-        if(error != cudaSuccess){
-            ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
+    else{
+        int devID = devIDT < 0 ? devIDS : devIDT;
+        int devIDBackup = 0;
+        cudaGetDevice(&devIDBackup);
+        cudaSetDevice(devID);
+
+        if (devIDT >= 0 && devIDS < 0) {
+            cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
+            }
        }
-    }
-    else if (devIDT < 0 && devIDS >= 0) {
-        cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost);
-        if(error != cudaSuccess){
-            ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
+        else if (devIDT < 0 && devIDS >= 0) {
+            cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
+            }
        }
-    }
-    else {
-        cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice);
-        if (error != cudaSuccess) {
-            ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+        else {
+            cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice);
+            if (error != cudaSuccess) {
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+            }
        }
+
+        cudaSetDevice(devIDBackup);
    }
 #else
    ShowNTErrors("Please specify USE_CUDA and recompile the code!");

--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -64,7 +64,7 @@
 #include "arithmetic/Sum.h"
 #include "arithmetic/SumByColumnTV.h"
 #include "arithmetic/SumByColumnVT.h"
-#include "arithmetic/SumFilled.h"
+#include "arithmetic/SumDim.h"
 #include "sort/TopK.h"
 #include "shape/Transpose.h"
 #include "shape/Unsqueeze.h"

--- a/source/tensor/core/arithmetic/Sum.cpp
+++ b/source/tensor/core/arithmetic/Sum.cpp
@@ -67,7 +67,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
    }
    else {
        if (!a->isSparse && !b->isSparse) {
-            CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in addition!");
+            CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
    
            if (a->dataType == DEFAULT_DTYPE &&
                b->dataType == DEFAULT_DTYPE &&

--- a/source/tensor/core/arithmetic/SumDim.cpp
+++ b/source/tensor/core/arithmetic/SumDim.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
+ */
+
+#include "Sum.h"
+#include "SumDim.h"
+#include "SumDim.cuh"
+#include "../movement/CopyValues.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+tensor summation
+
+c = a + b * \beta
+where the size of b is equal to the n-th dimension of a,
+i.e., a is summed with b by broadcasting
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> c - where we put a+b*\beta. it must not be NULL and must have the
+       same order and the same number of items as a
+>> n - the dimension index, 0 <= n < a->order
+>> beta - the scaling factor
+*/
+void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
+{
+    CheckNTErrors(a && b && c, "Empty tensor input!");
+    /* n is used to index a->dimSize below, so it has to be checked first */
+    CheckNTErrors(n >= 0 && n < a->order, "Illegal dimension index!");
+    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                  "Unmatched data types in addition!");
+    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in addition!");
+    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
+    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
+
+    /* b does not contribute: c is just a copy of a */
+    if(beta == 0){
+        _CopyValues(a, c);
+        return;
+    }
+
+    /* no broadcasting is needed if a and b are of the same shape */
+    if(XTensor::IsSameShaped(a, b)){
+        _Sum(a, b, c, beta);
+        return;
+    }
+
+    if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
+#ifdef USE_CUDA
+        _CudaSumDim(a, b, c, n, beta);
+#else
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+    }
+    else{
+        /* number of consecutive items that share one entry of b, i.e.,
+           the product of the sizes of the dimensions behind n */
+        int stride = 1;
+        for(int i = n + 1; i < a->order; i++)
+            stride *= a->dimSize[i];
+
+        /* number of entries of b */
+        int blockSize = a->dimSize[n];
+
+        if (a->dataType == DEFAULT_DTYPE){
+            int num = a->unitNum;
+            DTYPE * ad = (DTYPE*)a->data;
+            DTYPE * bd = (DTYPE*)b->data;
+            DTYPE * cd = (DTYPE*)c->data;
+
+            if(stride > 1){
+                /* the j-th segment (of "stride" items) is summed with
+                   entry j % blockSize of b */
+                for(int i = 0, j = 0; i < num; i += stride, j++){
+                    DTYPE * ap = ad + i;
+                    DTYPE * cp = cd + i;
+                    DTYPE   bv = bd[j % blockSize] * beta;
+                    for(int k = 0; k < stride; k++)
+                        cp[k] = ap[k] + bv;
+                }
+            }
+            else if(stride == 1){
+                /* n is the last dimension: every run of blockSize items
+                   is summed with b item by item */
+                for(int i = 0; i < num; i += blockSize){
+                    DTYPE * ap = ad + i;
+                    DTYPE * cp = cd + i;
+                    if(beta == 1.0F){
+                        /* skip the multiplication when b is not scaled */
+                        for(int j = 0; j < blockSize; j++)
+                            cp[j] = ap[j] + bd[j];
+                    }
+                    else{
+                        for(int j = 0; j < blockSize; j++)
+                            cp[j] = ap[j] + bd[j] * beta;
+                    }
+                }
+            }
+            else{
+                /* stride < 1 means a non-positive dimension size */
+                ShowNTErrors("Something is wrong!");
+            }
+        }
+        else {
+            ShowNTErrors("TODO!");
+        }
+    }
+}
+    
+}
--- a/source/tensor/core/arithmetic/SumDim.cu
+++ b/source/tensor/core/arithmetic/SumDim.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
+*/
+
+#include "SumDim.cuh"
+#include "../../XDevice.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+tensor summation of a tensor and a row vector
+c = a + b * \beta
+where a is a tensor and b is a row vector
+
+the kernel is written for a 2D launch: x indexes the columns and y indexes
+the rows. each thread block keeps its slice of b in shared memory (one
+entry per threadIdx.x), so blockDim.x must not exceed
+MAX_CUDA_THREAD_NUM_PER_BLOCK
+
+>> T (template) - data type of the items
+>> betaFired (template) - b is scaled by beta if true. beta is ignored otherwise
+>> a - pointer to the data array of a
+>> b - pointer to the data array of b
+>> c - pointer to the data array of c
+>> rowNum - number of rows of a and c
+>> colNum - number of columns of a and c (i.e., the size of b)
+>> beta - the scaling factor
+*/
+template <class T, bool betaFired>
+__global__
+void KernelAddWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
+{
+    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    int col = blockDim.x * blockIdx.x + threadIdx.x;
+    int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+    /* out-of-range threads must not leave before the barrier below because
+       __syncthreads() has to be reached by every thread of the block */
+    bool inRange = (col < colNum && row < rowNum);
+
+    /* the first row of threads loads the entries of b used by the block */
+    if(inRange && threadIdx.y == 0)
+        bv[threadIdx.x] = b[col];
+
+    __syncthreads();
+
+    if(!inRange)
+        return;
+
+    int offset = colNum * row + col;
+    if(betaFired)
+        c[offset] = a[offset] + bv[threadIdx.x] * beta;
+    else
+        c[offset] = a[offset] + bv[threadIdx.x];
+}
+
+/*
+tensor summation of a tensor and a column vector
+c = a + b * \beta
+where a is a tensor and b is a column vector
+
+the kernel is written for a 2D launch: x indexes the columns of all blocks
+(blockNum * colNum of them in total) and y indexes the rows. each thread
+block keeps its slice of b in shared memory (one entry per threadIdx.y),
+so blockDim.y must not exceed MAX_CUDA_THREAD_NUM_PER_BLOCK
+
+>> T (template) - data type of the items
+>> betaFired (template) - b is scaled by beta if true. beta is ignored otherwise
+>> a - pointer to the data array of a
+>> b - pointer to the data array of b
+>> c - pointer to the data array of c
+>> rowNum - number of rows of a and c (i.e., the size of b)
+>> colNum - number of columns of a and c
+>> blockSize - size of a block (matrix), i.e., rowNum * colNum
+>> blockNum - number of matrices
+>> beta - the scaling factor
+*/
+template <class T, bool betaFired>
+__global__
+void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
+{
+    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+
+    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+    /* split the flat column index into (block, column in the block) */
+    int col = colIndex % colNum;
+    int block = colIndex / colNum;
+
+    /* out-of-range threads must not leave before the barrier below because
+       __syncthreads() has to be reached by every thread of the block */
+    bool inRange = (row < rowNum && block < blockNum);
+
+    /* the first column of threads loads the entries of b used by the block */
+    if(inRange && threadIdx.x == 0)
+        bv[threadIdx.y] = b[row];
+
+    __syncthreads();
+
+    if(!inRange)
+        return;
+
+    int offset = block * blockSize + row * colNum + col;
+
+    if(betaFired)
+        c[offset] = a[offset] + bv[threadIdx.y] * beta;
+    else
+        c[offset] = a[offset] + bv[threadIdx.y];
+}
+
+/*
+tensor summation (cuda version)
+
+c = a + b * \beta
+where the size of b is equal to the n-th dimension of a,
+i.e., a is summed with b by broadcasting
+
+the kernels are launched on the device that keeps a. the device that was
+current on entry is restored before the function returns
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> c - where we put a+b*\beta. it must not be NULL and must have the
+       same order and the same number of items as a
+>> n - the dimension index, 0 <= n < a->order
+>> beta - the scaling factor
+*/
+void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
+{
+    CheckNTErrors(a && b && c, "Empty tensor input!");
+    /* n is used to index a->dimSize below, so it has to be checked first */
+    CheckNTErrors(n >= 0 && n < a->order, "Illegal dimension index!");
+    /* the kernels dereference the data pointers on the GPU */
+    CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0,
+                  "All tensors must be kept on GPU devices!");
+    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                  "Unmatched data types in addition!");
+    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in addition!");
+    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
+    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
+
+    /* stride: product of the dimension sizes behind n
+       blockSize: size of dimension n (i.e., the size of b)
+       blockNum: product of the dimension sizes before n */
+    int stride = 1;
+    int blockSize = a->dimSize[n];
+    int blockNum = 1;
+
+    for(int i = a->order - 1; i >= 0; i--){
+        if(i > n)
+            stride *= a->dimSize[i];
+        else if(i < n)
+            blockNum *= a->dimSize[i];
+    }
+
+    int cudaGrids[3];
+    int cudaBlocks[3];
+
+    /* switch to the device of a and remember the current one */
+    int devIDBackup = 0;
+    if(cudaGetDevice(&devIDBackup) != cudaSuccess){
+        ShowNTErrors("cudaGetDevice error!");
+    }
+    if(cudaSetDevice(a->devID) != cudaSuccess){
+        ShowNTErrors("cudaSetDevice error!");
+    }
+
+    if (a->dataType == DEFAULT_DTYPE){
+        if(stride > 1){
+            /* b is a column vector of each (blockSize * stride) matrix */
+            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
+            dim3 grids(cudaGrids[0], cudaGrids[1]);
+            dim3 blocks(cudaBlocks[0], cudaBlocks[1]);
+            if(beta == (DTYPE)1.0F)
+                KernelAddWithCol<DTYPE, false> <<<grids, blocks>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                  blockSize, stride, blockSize * stride, blockNum, beta);
+            else
+                KernelAddWithCol<DTYPE, true>  <<<grids, blocks>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                  blockSize, stride, blockSize * stride, blockNum, beta);
+        }
+        else if(stride == 1){
+            /* n is the last dimension: b is a row vector of a (blockNum * blockSize) matrix */
+            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+            dim3 grids(cudaGrids[0], cudaGrids[1]);
+            dim3 blocks(cudaBlocks[0], cudaBlocks[1]);
+            if(beta == (DTYPE)1.0F)
+                KernelAddWithRow<DTYPE, false> <<<grids, blocks>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                  blockNum, blockSize, beta);
+            else
+                KernelAddWithRow<DTYPE, true>  <<<grids, blocks>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                  blockNum, blockSize, beta);
+        }
+        else{
+            /* stride < 1 means a non-positive dimension size */
+            ShowNTErrors("Something is wrong!");
+        }
+
+        /* a kernel launch does not return an error by itself */
+        if(cudaGetLastError() != cudaSuccess){
+            ShowNTErrors("Kernel launch error in _CudaSumDim!");
+        }
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+
+    /* go back to the device that was in use on entry */
+    if(cudaSetDevice(devIDBackup) != cudaSuccess){
+        ShowNTErrors("cudaSetDevice error!");
+    }
+}
+
+} // namespace nts(NiuTrans.Tensor)
+
--- a/source/tensor/core/arithmetic/SumFilled.cu
+++ b/source/tensor/core/arithmetic/SumFilled.cu
@@ -16,13 +16,20 @@
 */

 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
 */

-#include "SumFilled.cuh"
+#ifndef __SUMDIM_CUH__
+#define __SUMDIM_CUH__
+
+#include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

+/* tensor summation c = a + b * \beta (cuda version), where the size of b is equal to
+   the n-th dimension of a, i.e., a is summed with b by broadcasting along dimension n.
+   the result is written to c. beta is 1 by default */
+void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);

 } // namespace nts(NiuTrans.Tensor)

+#endif // __SUMDIM_CUH__
--- a/source/tensor/core/arithmetic/SumFilled.cuh
+++ b/source/tensor/core/arithmetic/SumFilled.cuh
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */

 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
-*/
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
+ * It reached to 39 centigrade around 3:00 pm in Shenyang
+ */

-#ifndef __SUMFILLED_CUH__
-#define __SUMFILLED_CUH__
+#ifndef __SUMDIM_H__
+#define __SUMDIM_H__

 #include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* tensor summation c = a + b * \beta where each dimension of b is equal to that of a or has
-   a value of 1, i.e., a is summed with b by broadcasting */
-void _CudaSumFilled(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
-
+    
+/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
+   i.e., a is summed with b by broadcasting along dimension n. the result is written to c.
+   beta is 1 by default */
+void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
+    
+/* tensor summation a = a + b * \beta where the size of b is equal to the n-th dimension of a,
+   i.e., a is summed with b by broadcasting. we keep the result in the input tensor a and return nothing.
+   NOTE(review): SumDim.cpp as added in this commit has no definition of this overload, so
+   calling it would fail at link time until one is provided */
+void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
+    
+/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
+   i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it.
+   NOTE(review): SumDim.cpp as added in this commit has no definition of this function, so
+   calling it would fail at link time until one is provided */
+XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
+    
 } // namespace nts(NiuTrans.Tensor)

-#endif // __SUMFILLED_CUH__
+#endif // __SUMDIM_H__
--- a/source/tensor/core/arithmetic/SumFilled.cpp
+++ b/source/tensor/core/arithmetic/SumFilled.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
- */
-
-#include "SumFilled.h"
-#include "SumFilled.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-    
-}
--- a/source/tensor/core/arithmetic/SumFilled.h
+++ b/source/tensor/core/arithmetic/SumFilled.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
- */
-
-#ifndef __SUMFILLED_H__
-#define __SUMFILLED_H__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-    
-    /* tensor summation c = a + b * \beta where each dimension of b is equal to that of a or has
-       a value of 1, i.e., a is summed with b by broadcasting */
-    void _SumFilled(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
-    
-    /* tensor summation c = a + b * \beta where each dimension of b is equal to that of a or has
-       a value of 1, i.e., a is summed with b by broadcasting
-       keep the result in the input tensor a and return nothing */
-    void _SumFilledMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
-    
-    /* tensor summation c = a + b * \beta where each dimension of b is equal to that of a or has
-       a value of 1, i.e., a is summed with b by broadcasting
-       make a new tensor c to keep the result and return it */
-    XTensor SumFilled(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
-    
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SUMFILLED_H__