Merge with Xuchen branch.

0405663f · liyinqiao · c22e2e31 · 0405663f · 0405663f · 0405663f
Commit 0405663f authored Nov 03, 2019 by liyinqiao
--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -20,7 +20,7 @@
 * This is a simple impelementation of the feed-forward network-baesd language
 * model (FNNLM). See more details about FNNLM in
 * "A Neural Probabilistic Language Model" by Bengio et al.
- * Journal of Machine Learning Research 3 (2003) 1137?155
+ * Journal of Machine Learning Research 3 (2003) 1137-1155
 *
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
 */

--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
@@ -135,6 +135,8 @@ const char * GetOPName(int type)
            return "S_SPLIT";
        else if (type == SHAPE_SPLIT_LIST)
            return "S_SPLIT_LIST";
+        else if (type == SHAPE_STACK)
+            return "S_SHAPE_STACK";
        else if (type == SHAPE_SQUEEZE)
            return "S_SQUEEZE";
        else if (type == SHAPE_TRANSPOSE)

--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
@@ -51,7 +51,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_MASK               MATH_DIVDIM + 1
 #define MATH_MATRIXMUL          MATH_MASK + 1
 #define MATH_MATRIXMULBATCHED   MATH_MATRIXMUL + 1
-#define MATH_MULTIPLY           MATH_MATRIXMULBATCHED + 1
+#define MATH_MAX                MATH_MATRIXMULBATCHED + 1
+#define MATH_MIN                MATH_MAX + 1
+#define MATH_MULTIPLY           MATH_MIN + 1
 #define MATH_MULTIPLYDIM        MATH_MULTIPLY + 1
 #define MATH_MULTIPLYBROADCAST  MATH_MULTIPLYDIM + 1
 #define MATH_NEGATE             MATH_MULTIPLYBROADCAST + 1
@@ -97,7 +99,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define SHAPE_RESHAPE           SHAPE_PERMUTE + 1
 #define SHAPE_SPLIT             SHAPE_RESHAPE + 1
 #define SHAPE_SPLIT_LIST        SHAPE_SPLIT + 1
-#define SHAPE_SQUEEZE           SHAPE_SPLIT_LIST + 1
+#define SHAPE_STACK             SHAPE_SPLIT_LIST + 1
+#define SHAPE_SQUEEZE           SHAPE_STACK + 1
 #define SHAPE_TRANSPOSE         SHAPE_SQUEEZE + 1
 #define SHAPE_UNSQUEEZE         SHAPE_TRANSPOSE + 1


--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -28,6 +28,7 @@
 #ifndef __XTENSOR_H__
 #define __XTENSOR_H__

+#include <math.h>
 #include "XGlobal.h"
 #include "XMem.h"
 #include "XPRunner.h"

--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -83,6 +83,7 @@
 #include "shape/Permute.h"
 #include "shape/Split.h"
 #include "shape/Squeeze.h"
+#include "shape/Stack.h"
 #include "shape/Transpose.h"
 #include "shape/Unsqueeze.h"
 #include "shape/IsSameShaped.h"

--- a/source/tensor/core/math/Compare.cpp
+++ b/source/tensor/core/math/Compare.cpp
@@ -20,6 +20,7 @@
 */

 #include "../../XTensor.h"
+#include "../../XDevice.h"
 #include "../../XName.h"
 #include "../shape/IsSameShaped.h"
 #include "Compare.h"
@@ -124,4 +125,95 @@ SIMPLE_COMPARE_FUNCTION_ME(NotEqualMe, _NotEqual)
 SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
 SIMPLE_COMPARE_FUNCTION_VOID(NotEqual, _NotEqual, MATH_NOTEQUAL)

+
+/* define the core element-wise macro: c[i] = origFunc(a[i], b[i]); the CUDA build takes one extra argument naming the GPU routine */
+#ifdef USE_CUDA
+#define _SIMPLE_MAX_MIN_FUNCTION(_funcName, _cudaFuncName, origFunc)                 \
+void _funcName(const XTensor * a, const XTensor * b,  XTensor * c)                   \
+{                                                                                    \
+    CheckNTErrors((_IsSameShaped(a, b, c)),                                          \
+                  "Input and output tensors should have the same type!");            \
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");                          \
+    CheckDev(a->devID, b->devID);                                                    \
+    CheckDev(a->devID, c->devID);                                                    \
+    /* run it on GPUs */                                                             \
+    if (a->devID >= 0) {                                                             \
+        _cudaFuncName(a, b, c);                                                      \
+        return;                                                                      \
+    }                                                                                \
+    DTYPE * da = (DTYPE*)a->data;                                                    \
+    DTYPE * db = (DTYPE*)b->data;                                                    \
+    DTYPE * dc = (DTYPE*)c->data;                                                    \
+    for (int i = 0; i < a->unitNum; i++)                                             \
+        dc[i] = (DTYPE)origFunc(da[i], db[i]);                                       \
+}
+#else
+#define _SIMPLE_MAX_MIN_FUNCTION(_funcName, origFunc)                                \
+void _funcName(const XTensor * a, const XTensor * b, XTensor *c)                     \
+{                                                                                    \
+    CheckNTErrors((_IsSameShaped(a, b, c)),                                          \
+                  "Input and output tensors should have the same type!");            \
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");                          \
+    CheckDev(a->devID, b->devID);                                                    \
+    CheckDev(a->devID, c->devID);                                                    \
+    /* GPU tensors cannot be handled without CUDA */                                 \
+    if (a->devID >= 0) {                                                             \
+        ShowNTErrors("No GPU devices support!");                                     \
+    }                                                                                \
+    DTYPE * da = (DTYPE*)a->data;                                                    \
+    DTYPE * db = (DTYPE*)b->data;                                                    \
+    DTYPE * dc = (DTYPE*)c->data;                                                    \
+    for (int i = 0; i < a->unitNum; i++)                                             \
+        dc[i] = (DTYPE)origFunc(da[i], db[i]);                                       \
+}
+#endif
+/* in-place wrapper on pointers: forwards to _funcName(a, b, a), so a is both an input and the output */
+#define _SIMPLE_MAX_MIN_FUNCTION_ME(_funcNameMe, _funcName)                          \
+void _funcNameMe(XTensor * a, const XTensor * b)                                     \
+{                                                                                    \
+    _funcName(a, b, a);                                                              \
+}                                                                                    
+/* in-place wrapper on references: forwards to _funcName(&a, &b, &a), so a is both an input and the output */
+#define SIMPLE_MAX_MIN_FUNCTION_ME(funcNameMe, _funcName)                            \
+void funcNameMe(XTensor & a, const XTensor & b)                                      \
+{                                                                                    \
+    _funcName(&a, &b, &a);                                                           \
+}                                                                                    
+/* value-returning wrapper: constructs c from &a, flags it via SetTMPFlag, fills it with _funcName and returns it; operationId is not used in this body */
+#define SIMPLE_MAX_MIN_FUNCTION(funcName, _funcName, operationId)                    \
+XTensor funcName(const XTensor & a, const XTensor & b)                               \
+{                                                                                    \
+    XTensor c(&a);                                                                   \
+    c.SetTMPFlag();                                                                  \
+    _funcName(&a, &b, &c);                                                           \
+    return c;                                                                        \
+}
+/* output-argument wrapper: c is a reference (matching Compare.h) so the caller sees the result; c is initialized from a when it is uninitialized or shaped differently; operationId is not used in this body */
+#define SIMPLE_MAX_MIN_FUNCTION_VOID(funcName, _funcName, operationId)               \
+void funcName(const XTensor &a, const XTensor &b, XTensor &c)                        \
+{                                                                                    \
+    if (!c.isInit || !_IsSameShaped(&a, &c)) {                                       \
+        InitTensor(&c, &a);                                                          \
+    }                                                                                \
+    _funcName(&a, &b, &c);                                                           \
+}
+/* instantiate the core routines _Max and _Min with max and min as the per-element operation */
+#ifdef USE_CUDA
+_SIMPLE_MAX_MIN_FUNCTION(_Max, _CudaMax, max)
+_SIMPLE_MAX_MIN_FUNCTION(_Min, _CudaMin, min)
+#else
+_SIMPLE_MAX_MIN_FUNCTION(_Max, max)
+_SIMPLE_MAX_MIN_FUNCTION(_Min, min)
+#endif
+/* Max wrappers: in-place (pointer and reference), value-returning and output-argument forms */
+_SIMPLE_MAX_MIN_FUNCTION_ME(_MaxMe, _Max)
+SIMPLE_MAX_MIN_FUNCTION_ME(MaxMe, _Max)
+SIMPLE_MAX_MIN_FUNCTION(Max, _Max, MATH_MAX)
+SIMPLE_MAX_MIN_FUNCTION_VOID(Max, _Max, MATH_MAX)
+/* Min wrappers: the same four forms built on _Min */
+_SIMPLE_MAX_MIN_FUNCTION_ME(_MinMe, _Min)
+SIMPLE_MAX_MIN_FUNCTION_ME(MinMe, _Min)
+SIMPLE_MAX_MIN_FUNCTION(Min, _Min, MATH_MIN)
+SIMPLE_MAX_MIN_FUNCTION_VOID(Min, _Min, MATH_MIN)
+
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/math/Compare.cu
+++ b/source/tensor/core/math/Compare.cu
@@ -89,6 +89,53 @@ void _Cuda##funcName(const XTensor * a, XTensor * b, DTYPE number)          \
 SIMPLE_COMPARE_FUNCTION_GPU(Equal, cudaIsEqual)
 SIMPLE_COMPARE_FUNCTION_GPU(NotEqual, cudaIsNotEqual)

+#define SIMPLE_MAX_MIN_FUNCTION_GPU(funcName, origFunc)                     \
+__global__                                                                  \
+void Kernel##funcName(DTYPE * a, DTYPE * b, DTYPE * c, int size)            \
+{                                                                           \
+    int i = blockDim.x * blockIdx.x + threadIdx.x;                          \
+    /* one thread per element; out-of-range threads do nothing */           \
+    if (i < size)                                                           \
+        c[i] = (DTYPE)origFunc(a[i], b[i]);                                 \
+}                                                                           \
+__global__                                                                  \
+void Kernel##funcName(__half * a, __half * b, __half * c, int size)         \
+{                                                                           \
+    return;                                                                 \
+}                                                                           \
+void _Cuda##funcName(const XTensor * a, const XTensor * b, XTensor * c)     \
+{                                                                           \
+    /* a supplies the device id and element count used below */             \
+    int gridSize[3];                                                        \
+    int blockSize[3];                                                       \
+                                                                            \
+    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);         \
+                                                                            \
+    dim3 blocks(gridSize[0]);                                               \
+    dim3 threads(blockSize[0]);                                             \
+                                                                            \
+    int devIDBackup;                                                        \
+    ProtectCudaDev(a->devID, devIDBackup);                                  \
+    /* NOTE(review): X_FLOAT16 hits the empty __half stub; c unset */       \
+    if (a->dataType == DEFAULT_DTYPE) {                                     \
+        Kernel##funcName<<<blocks, threads>>>                               \
+                         ((DTYPE*)a->data, (DTYPE*)b->data,                 \
+                          (DTYPE*)c->data, a->unitNum);                     \
+    }                                                                       \
+    else if (a->dataType == X_FLOAT16) {                                    \
+        Kernel##funcName<<<blocks, threads>>>                               \
+                         ((__half*)a->data, (__half*)b->data,               \
+                          (__half*)c->data, a->unitNum);                    \
+    }                                                                       \
+    else {                                                                  \
+        ShowNTErrors("TODO!");                                              \
+    }                                                                       \
+                                                                            \
+    BacktoCudaDev(a->devID, devIDBackup);                                   \
+}    
+/* generate KernelMax/_CudaMax and KernelMin/_CudaMin */
+SIMPLE_MAX_MIN_FUNCTION_GPU(Max, max)
+SIMPLE_MAX_MIN_FUNCTION_GPU(Min, min)

 #endif // USE_CUDA


--- a/source/tensor/core/math/Compare.cuh
+++ b/source/tensor/core/math/Compare.cuh
@@ -34,6 +34,12 @@ void _CudaEqual(const XTensor * a, XTensor * b, DTYPE value);
 /* check whether every entry is not equal to the given value (cuda version) */
 void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE value);

+/* element-wise maximum of two tensors: c[i] = max(a[i], b[i]) (cuda version) */
+void _CudaMax(const XTensor * a, const XTensor * b, XTensor *c);
+
+/* element-wise minimum of two tensors: c[i] = min(a[i], b[i]) (cuda version) */
+void _CudaMin(const XTensor * a, const XTensor * b, XTensor *c);
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/core/math/Compare.h
+++ b/source/tensor/core/math/Compare.h
@@ -56,6 +56,36 @@ XTensor NotEqual(const XTensor & a, DTYPE value);
 /* check whether every entry is not equal to the given value */
 void NotEqual(const XTensor & a, XTensor & b, DTYPE value);

+/* element-wise maximum of two tensors: c[i] = max(a[i], b[i]) */
+void _Max(const XTensor * a, const XTensor * b, XTensor * c);
+
+/* element-wise maximum of two tensors (do it on site, pointer version: a is also the output) */
+void _MaxMe(XTensor * a, const XTensor * b);
+
+/* element-wise maximum of two tensors (do it on site, reference version: a is also the output) */
+void MaxMe(XTensor & a, const XTensor & b);
+
+/* element-wise maximum of two tensors (return an XTensor structure) */
+XTensor Max(const XTensor & a, const XTensor & b);
+
+/* element-wise maximum of two tensors (output-argument version) */
+void Max(const XTensor & a, const XTensor & b, XTensor & c);
+
+/* element-wise minimum of two tensors: c[i] = min(a[i], b[i]) */
+void _Min(const XTensor * a, const XTensor * b, XTensor * c);
+
+/* element-wise minimum of two tensors (do it on site, pointer version: a is also the output) */
+void _MinMe(XTensor * a, const XTensor * b);
+
+/* element-wise minimum of two tensors (do it on site, reference version: a is also the output) */
+void MinMe(XTensor & a, const XTensor & b);
+
+/* element-wise minimum of two tensors (return an XTensor structure) */
+XTensor Min(const XTensor & a, const XTensor & b);
+
+/* element-wise minimum of two tensors (output-argument version) */
+void Min(const XTensor & a, const XTensor & b, XTensor & c);
+
 } // namespace nts(NiuTrans.Tensor)

 #endif // end __COMPARE_H__
\ No newline at end of file
--- a/source/tensor/core/shape/Stack.cpp
+++ b/source/tensor/core/shape/Stack.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2019-10-13
+ */
+
+#include "Stack.h"
+#include "IsSameShaped.h"
+#include "../../XUtility.h"
+#include "../../XName.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+ * stack small tensors into a big tensor along with a dimension
+ * smalls - the list of the small tensors; all items must have the same shape
+ * t      - the target tensor that receives the data of all the items
+ * dim    - the position of the stacking dimension in t;
+ *          a negative value selects the last dimension of t
+ */
+void _Stack(const TensorList * smalls, XTensor * t, int dim)
+{
+    /* validate the pointers before anything is read through them */
+    CheckNTErrors(smalls != NULL, "Invalid list!");
+    CheckNTErrors(t != NULL, "Invalid target tensor!");
+
+    int count = smalls->count;
+    dim = (dim < 0 ? t->order - 1 : dim);
+
+    CheckNTErrors(count > 0, "Empty list!");
+    CheckNTErrors(dim >= 0 && dim < t->order, "Wrong range of dim");
+    for (int i = 1; i < count; i++) {
+        XTensor * tmp1 = smalls->GetItem(i);
+        XTensor * tmp2 = smalls->GetItem(i-1);
+        CheckNTErrors(_IsSameShaped(tmp1, tmp2), "The input tensor must be same size!");
+    }
+
+    XTensor * smallsItem0 = smalls->GetItem(0);
+    int unitSize = smallsItem0->unitSize;
+
+    /* the copy below writes count items into t, so t must be able to hold them */
+    CheckNTErrors(t->unitNum >= count * smallsItem0->unitNum,
+                  "The target tensor is too small!");
+
+    /* split the item shape at dim: blockNum rows of blockSize contiguous units */
+    int blockSize = 1;
+    int blockNum = 1;
+    for (int i = 0; i < smallsItem0->order; i++) {
+        if (i >= dim)
+            blockSize *= smallsItem0->dimSize[i];
+        else
+            blockNum *= smallsItem0->dimSize[i];
+    }
+
+    /* merging with fewer data copy operations */
+    if (count <= MIN_TENSOR_MERGE_LIST_NUM) {
+        /* bytes in one row of an item, and in one row of the target */
+        int sPitch = blockSize * unitSize;
+        int tPitch = sPitch * count;
+        char * tData = (char*)t->data;
+        for (int k = 0; k < count; k++) {
+            XTensor * s = smalls->GetItem(k);
+            char * sData = (char*)s->data;
+            /* item k fills the k-th slot of every target row */
+            XMemCopy2D(tData + k * sPitch, tPitch, t->devID,
+                       sData, sPitch, s->devID,
+                       sPitch, blockNum);
+        }
+    }
+    else {
+        ShowNTErrors("TO DO!!!");
+    }
+}
+
+/*
+ * stack small tensors into a big tensor along with a dimension (return an XTensor structure)
+ * smalls - the list of the small tensors
+ * dim    - the position of the new dimension in the result, in [0, order of an item]
+ * return - a tensor with one more dimension than an item; its size along dim is smalls.count
+ */
+XTensor Stack(const TensorList &smalls, int dim)
+{
+    int count = smalls.count;
+    CheckNTErrors(count > 0, "Empty list!");
+    CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
+
+    XTensor * tensor = smalls.GetItem(0);
+    int order = tensor->order + 1;
+
+    /* dim must address one of the result dimensions, otherwise dimSize[dim] is never set */
+    CheckNTErrors(dim < order, "Illegal dimension to concatenate!");
+
+    int * dimSize = new int[order];
+
+    for (int i = 0; i < order; i++) {
+        if (i < dim)
+            dimSize[i] = tensor->GetDim(i);
+        else if (i > dim)
+            /* dimensions behind the new one are shifted by one position */
+            dimSize[i] = tensor->GetDim(i-1);
+        else
+            dimSize[i] = count;
+    }
+
+    float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
+    XTensor t(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
+    t.SetTMPFlag();
+
+    /* destroy variables */
+    delete[] dimSize;
+
+    /* call _Stack function */
+    _Stack(&smalls, &t, dim);
+
+    /* tensor connection: skipped as soon as one item has gradients disabled */
+    for (int i = 0; i < count; i++) {
+        XTensor * tmp = smalls.GetItem(i);
+        if (tmp->enableGrad == false)
+            return t;
+    }
+
+    XLink::MakeLink(&smalls, &t, SHAPE_STACK);
+    XLink::AddParamToHeadInt(&t, dim);
+
+    return t;
+}
+
+/*
+ * check whether t already has the shape produced by stacking smalls along dim
+ * smalls - the list of the small tensors (the first item is used as the reference)
+ * t      - the target tensor to be checked
+ * dim    - the position of the stacking dimension in t
+ * return - true if t has one more dimension than an item, smalls.count entries
+ *          at position dim and the item dimensions, in order, everywhere else
+ */
+bool CheckStackShape(const TensorList &smalls, XTensor &t, int dim)
+{
+    XTensor * tensor = (XTensor*)smalls.GetItem(0);
+    int order = tensor->order + 1;
+
+    /* a different number of dimensions or an out-of-range dim can never match */
+    if (t.order != order || dim < 0 || dim >= order)
+        return false;
+
+    for (int i = 0; i < order; i++) {
+        /* the braces keep every else attached to the range test, not to the inner comparison */
+        if (i < dim) {
+            if (t.GetDim(i) != tensor->GetDim(i))
+                return false;
+        }
+        else if (i > dim) {
+            if (t.GetDim(i) != tensor->GetDim(i-1))
+                return false;
+        }
+        else {
+            if (t.GetDim(i) != smalls.count)
+                return false;
+        }
+    }
+
+    return true;
+}
+
+/*
+ * stack small tensors into a big tensor along with a dimension
+ * smalls - the list of the small tensors
+ * t      - the target tensor; it is initialized here when it is not initialized yet
+ *          or when CheckStackShape rejects its current shape
+ * dim    - the position of the new dimension in t, in [0, order of an item]
+ */
+void Stack(const TensorList &smalls, XTensor &t, int dim)
+{
+    int count = smalls.count;
+    CheckNTErrors(count > 0, "Empty list!");
+    CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
+
+    XTensor * tensor = smalls.GetItem(0);
+    int order = tensor->order + 1;
+
+    /* dim must address one of the target dimensions, otherwise dimSize[dim] is never set */
+    CheckNTErrors(dim < order, "Illegal dimension to concatenate!");
+
+    if (!t.isInit || !CheckStackShape(smalls, t, dim)) {
+        int * dimSize = new int[order];
+
+        for (int i = 0; i < order; i++) {
+            if (i < dim)
+                dimSize[i] = tensor->GetDim(i);
+            else if (i > dim)
+                /* dimensions behind the new one are shifted by one position */
+                dimSize[i] = tensor->GetDim(i-1);
+            else
+                dimSize[i] = count;
+        }
+
+        float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
+        InitTensorV2(&t, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
+
+        /* destroy variables */
+        delete[] dimSize;
+    }
+
+    /* call _Stack function */
+    _Stack(&smalls, &t, dim);
+
+    /* tensor connection: skipped as soon as one item has gradients disabled */
+    for (int i = 0; i < count; i++) {
+        XTensor * tmp = smalls.GetItem(i);
+        if (tmp->enableGrad == false)
+            return;
+    }
+
+    XLink::MakeLink(&smalls, &t, SHAPE_STACK);
+    XLink::AddParamToHeadInt(&t, dim);
+}
+
+} // namespace nts(NiuTrans.Tensor)
+
--- a/source/tensor/core/shape/Stack.h
+++ b/source/tensor/core/shape/Stack.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2019-10-13
+ * It's so cold outside. It's too hard for me to get out.
+ */
+
+#ifndef __STACK_H__
+#define __STACK_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* stack small tensors into a big tensor along with a dimension
+   (low-level version: t is supplied by the caller, dim is a dimension of t) */
+void _Stack(const TensorList * smalls, XTensor * t, int dim);
+
+/* stack small tensors into a big tensor along with a dimension (return an XTensor structure) */
+XTensor Stack(const TensorList &smalls, int dim);
+
+/* stack small tensors into a big tensor along with a dimension (the result is kept in t) */
+void Stack(const TensorList &smalls, XTensor &t, int dim);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __STACK_H__
\ No newline at end of file