Commit a89ee126 by xuchen

optimize the float16 CPU implementation

parent 89ad96e6
......@@ -36,11 +36,65 @@ using namespace nts;
using namespace fnnlm;
using namespace transformer;
int MyTest()
{
float16 x;
printf("%f\n", x.Float());
x = 3.5;
printf("%f\n", x.Float());
x = 0.0F;
printf("%f\n", x.Float());
x.Dump();
x = -3.5;
printf("%f\n", x.Float());
printf("%d\n", sizeof(float16));
FILE* f = fopen("test_fp16", "w");
fwrite(&x, sizeof(float16), 1, f);
fclose(f);
FILE* f2 = fopen("test_fp16", "r");
fread(&x, sizeof(float16), 1, f2);
fclose(f2);
printf("%f\n", x.Float());
return 0;
}
int MyTest2()
{
GDevs.Init();
GDevs.Clear();
XTensor a;
InitTensor2D(&a, 2, 3, X_FLOAT, 0);
a.SetZeroAll();
ScaleAndShift(a, 1);
a.Dump();
printf("dump\n");
getchar();
return 0;
}
int main( int argc, const char ** argv )
{
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
//MyTest2();
//printf("release\n");
//getchar();
//GDevs.GPUs[0].Reset();
//printf("reset\n");
//getchar();
//printf("bye.\n");
MyTest();
/* early exit: skip the normal entry points below while testing */
exit(1);
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
......
......@@ -55,6 +55,10 @@ const char * GetOPName(int type)
return "M_ROUND";
else if (type == MATH_RECIPROCAL)
return "M_RECIPROCAL";
else if (type == MATH_EQUAL)
return "M_EQUAL";
else if (type == MATH_NOTEQUAL)
return "M_NOTEQUAL";
else if (type == MATH_CLIP)
return "M_CLIP";
else if (type == MATH_DIV)
......@@ -67,6 +71,10 @@ const char * GetOPName(int type)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if (type == MATH_MAX)
return "M_MAX";
else if (type == MATH_MIN)
return "M_MIN";
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if (type == MATH_MULTIPLYDIM)
......
......@@ -46,7 +46,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_ROUND MATH_TAN + 1
#define MATH_RECIPROCAL MATH_ROUND + 1
#define MATH_EQUAL MATH_RECIPROCAL + 1
#define MATH_NOTEQUAL MATH_EQUAL + 1
#define MATH_CLIP MATH_NOTEQUAL + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MASK MATH_DIVDIM + 1
......
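Note: these op IDs are chained, unparenthesized sums (MATH_NOTEQUAL expands to MATH_RECIPROCAL + 1 + 1), which is safe for the equality tests in GetOPName but would misbehave inside a multiplication or shift. A parenthesized form is the safer pattern; a sketch, not what the library uses:

#define MATH_EQUAL    (MATH_RECIPROCAL + 1)
#define MATH_NOTEQUAL (MATH_EQUAL + 1)
#define MATH_CLIP     (MATH_NOTEQUAL + 1)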
......@@ -1784,9 +1784,15 @@ void XTensor::BinaryDump(FILE* file)
switch (dataType) {
case X_INT: {
fwrite(tmp.data, sizeof(int), unitNum, file);
break;
}
case X_FLOAT16: {
fwrite(tmp.data, sizeof(float16), unitNum, file);
break;
}
default: {
fwrite(tmp.data, sizeof(float), unitNum, file);
break;
}
}
}
......@@ -1917,12 +1923,21 @@ void XTensor::BinaryRead(FILE* file, size_t offset)
fread(d, sizeof(int), unitNum, file);
SetData(d, unitNum);
delete[] d;
break;
}
case X_FLOAT16: {
/* float16 is 2 bytes per element, so use a buffer of matching type */
float16* d = new float16[unitNum];
fread(d, sizeof(float16), unitNum, file);
SetData(d, unitNum);
delete[] d;
break;
}
default: {
float* d = new float[unitNum];
fread(d, sizeof(float), unitNum, file);
SetData(d, unitNum);
delete[] d;
break;
}
}
}
......
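For reference, a minimal round-trip of an FP16 tensor through the two methods above; a sketch built from calls that appear elsewhere in this commit (InitTensor2D, BinaryDump, BinaryRead), with an illustrative file name:

XTensor t;
InitTensor2D(&t, 2, 3, X_FLOAT16);
t.SetZeroAll();
FILE* fo = fopen("tensor_fp16.bin", "wb");
t.BinaryDump(fo);
fclose(fo);
FILE* fi = fopen("tensor_fp16.bin", "rb");
t.BinaryRead(fi, 0);
fclose(fi);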
......@@ -51,6 +51,7 @@ void KernelSetDataFixed(T * d, T v, int size)
template __global__ void KernelSetDataFixed<int>(int *, int, int);
template __global__ void KernelSetDataFixed<float>(float *, float, int);
template __global__ void KernelSetDataFixed<double>(double *, double, int);
template __global__ void KernelSetDataFixed<__half>(__half*, __half, int);
/*
generate data items with a fixed value
......@@ -79,6 +80,8 @@ void _CudaSetDataFixed(XTensor * tensor, T value)
KernelSetDataFixed << <blocks, threads >> > ((float*)tensor->data, (float)value, tensor->unitNum);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataFixed << <blocks, threads >> > ((double*)tensor->data, (double)value, tensor->unitNum);
else if (tensor->dataType == X_FLOAT16)
KernelSetDataFixed << <blocks, threads >> > ((__half*)tensor->data, (__half)value, tensor->unitNum);
else
ShowNTErrors("TODO! Unsupported datatype!")
......
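With the __half instantiation above, a constant fill of an FP16 tensor on the GPU dispatches to the new kernel; a sketch (the device ID and fill value are illustrative):

XTensor t;
InitTensor2D(&t, 2, 3, X_FLOAT16, 0);   // tensor on GPU 0
_CudaSetDataFixed(&t, 1.0F);            // value is cast to __half in the X_FLOAT16 branch above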
......@@ -92,6 +92,10 @@ XTensor funcName(const XTensor &a, DTYPE number)
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
if (a.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, (DTYPE)number); \
} \
return b; \
}
......@@ -102,6 +106,10 @@ void funcName(const XTensor &a, XTensor &b, DTYPE number)
InitTensorV2(&b, &a); \
} \
_funcName(&a, &b, number); \
if (a.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, (DTYPE)number); \
} \
}
// I think we needn't make links here.
......@@ -186,6 +194,9 @@ XTensor funcName(const XTensor & a, const XTensor & b)
XTensor c(&a); \
c.SetTMPFlag(); \
_funcName(&a, &b, &c); \
if (a.enableGrad && b.enableGrad) { \
XLink::MakeLink(&a, &b, &c, operationId); \
} \
return c; \
}
......@@ -196,16 +207,33 @@ void funcName(const XTensor &a, const XTensor &b, XTensor c)
InitTensor(&c, &a); \
} \
_funcName(&a, &b, &c); \
if (a.enableGrad && b.enableGrad) { \
XLink::MakeLink(&a, &b, &c, operationId); \
} \
}
#ifdef USE_CUDA
_SIMPLE_MAX_MIN_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_MAX_MIN_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
_SIMPLE_MAX_MIN_FUNCTION(_Max, _CudaMax, MAX)
_SIMPLE_MAX_MIN_FUNCTION(_Min, _CudaMin, MIN)
#else
_SIMPLE_MAX_MIN_FUNCTION(_Equal, myIsEqual)
_SIMPLE_MAX_MIN_FUNCTION(_NotEqual, myIsNotEqual)
_SIMPLE_MAX_MIN_FUNCTION(_Max, MAX)
_SIMPLE_MAX_MIN_FUNCTION(_Min, MIN)
#endif
_SIMPLE_MAX_MIN_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_MAX_MIN_FUNCTION_ME(EqualMe, _Equal)
SIMPLE_MAX_MIN_FUNCTION(Equal, _Equal, MATH_EQUAL)
SIMPLE_MAX_MIN_FUNCTION_VOID(Equal, _Equal, MATH_EQUAL)
_SIMPLE_MAX_MIN_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_MAX_MIN_FUNCTION_ME(NotEqualMe, _NotEqual)
SIMPLE_MAX_MIN_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
SIMPLE_MAX_MIN_FUNCTION_VOID(NotEqual, _NotEqual, MATH_NOTEQUAL)
_SIMPLE_MAX_MIN_FUNCTION_ME(_MaxMe, _Max)
SIMPLE_MAX_MIN_FUNCTION_ME(MaxMe, _Max)
SIMPLE_MAX_MIN_FUNCTION(Max, _Max, MATH_MAX)
......
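Expanding the two-tensor wrapper macro for Max gives a concrete picture of what SIMPLE_MAX_MIN_FUNCTION(Max, _Max, MATH_MAX) generates (a sketch of the expansion, based on the macro body shown above):

XTensor Max(const XTensor & a, const XTensor & b)
{
    XTensor c(&a);
    c.SetTMPFlag();
    _Max(&a, &b, &c);
    if (a.enableGrad && b.enableGrad) {
        XLink::MakeLink(&a, &b, &c, MATH_MAX);
    }
    return c;
}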
......@@ -134,6 +134,9 @@ void _Cuda##funcName(const XTensor * a, const XTensor * b, XTensor * c) \
BacktoCudaDev(a->devID, devIDBackup); \
}
SIMPLE_MAX_MIN_FUNCTION_GPU(Equal, cudaIsEqual)
SIMPLE_MAX_MIN_FUNCTION_GPU(NotEqual, cudaIsNotEqual)
SIMPLE_MAX_MIN_FUNCTION_GPU(Max, max)
SIMPLE_MAX_MIN_FUNCTION_GPU(Min, min)
......
......@@ -31,9 +31,15 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* check whether every entry is equal to the given value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE value);
/* check whether each entry is equal to the corresponding entry of b (cuda version) */
void _CudaEqual(const XTensor * a, const XTensor * b, XTensor * c);
/* check whether every entry is not equal to the given value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE value);
/* check whether each entry is not equal to the corresponding entry of b (cuda version) */
void _CudaNotEqual(const XTensor * a, const XTensor * b, XTensor * c);
/* return the element-wise maximum of two tensors (cuda version) */
void _CudaMax(const XTensor * a, const XTensor * b, XTensor *c);
......
......@@ -39,7 +39,23 @@ void EqualMe(XTensor & a, DTYPE value);
XTensor Equal(const XTensor & a, DTYPE value);
/* check whether every entry is equal to the given value */
void Equal(const XTensor & a, XTensor & b, DTYPE value);
/* check whether each entry is equal to the corresponding entry of b */
void _Equal(const XTensor * a, const XTensor * b, XTensor * c);
/* check whether each entry is equal to the corresponding entry of b (do it on site) */
void _EqualMe(XTensor * a, XTensor * b);
/* check whether each entry is equal to the corresponding entry of b (do it on site) */
void EqualMe(XTensor & a, XTensor & b);
/* check whether each entry is equal to the corresponding entry of b (return an XTensor structure) */
XTensor Equal(const XTensor & a, const XTensor & b);
/* check whether each entry is equal to the corresponding entry of b */
void Equal(const XTensor & a, const XTensor & b, XTensor & c);
/* check whether every entry is not equal to the given value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
......@@ -56,6 +72,22 @@ XTensor NotEqual(const XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value */
void NotEqual(const XTensor & a, XTensor & b, DTYPE value);
/* check whether each entry is not equal to the corresponding entry of b */
void _NotEqual(const XTensor * a, const XTensor * b, XTensor * c);
/* check whether each entry is not equal to the corresponding entry of b (do it on site) */
void _NotEqualMe(XTensor * a, XTensor * b);
/* check whether each entry is not equal to the corresponding entry of b (do it on site) */
void NotEqualMe(XTensor & a, XTensor & b);
/* check whether each entry is not equal to the corresponding entry of b (return an XTensor structure) */
XTensor NotEqual(const XTensor & a, const XTensor & b);
/* check whether each entry is not equal to the corresponding entry of b */
void NotEqual(const XTensor & a, const XTensor & b, XTensor & c);
/* return the element-wise maximum of two tensors */
void _Max(const XTensor * a, const XTensor * b, XTensor * c);
......@@ -71,6 +103,7 @@ XTensor Max(const XTensor & a, const XTensor & b);
/* return the element-wise maximum of two tensors */
void Max(const XTensor & a, const XTensor & b, XTensor & c);
/* return the element-wise minimum of two tensors */
void _Min(const XTensor * a, const XTensor * b, XTensor * c);
......
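Taken together, the new declarations give element-wise comparison and max/min over two tensors. A usage sketch (shapes and values are illustrative; Min's XTensor-returning form is assumed analogous to Max's; per the functors above, Equal/NotEqual presumably write 1.0/0.0 masks):

XTensor a, b;
InitTensor2D(&a, 2, 2, X_FLOAT);
InitTensor2D(&b, 2, 2, X_FLOAT);
a.SetZeroAll();
b.SetZeroAll();
XTensor mask = Equal(a, b);   // 1.0 where entries match, else 0.0
XTensor hi = Max(a, b);       // element-wise maximum
XTensor lo = Min(a, b);       // element-wise minimum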
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Guan Huhao 2020-02-05
* $Updated by: Xu Chen (email: hello_master1954@163.com) 2020-05-01
*/
#ifndef FLOAT16_H
#define FLOAT16_H
namespace nts { // namespace nts(NiuTrans.Tensor)
struct float16
{
// private member variables
private:
/*
sign is the sign bit: 1 means negative, 0 means positive
exp is the exponent with an offset of 16
data is the mantissa; similar to ieee-754, the highest bit defaults to 1 and is ignored
*/
unsigned short data : 10;
unsigned short exp : 5;
unsigned short sign : 1;
// masks for locating the highest 1 bit
static unsigned int mask[32];
static unsigned int pow2[32];
// private functions
//int FindHighOne(const int &num, int &l, int &r);
int AbsCompare(const float16 & a, const float16 & b);
public:
float16 SetOverFlow();
// check whether the value overflows
int IsOverlFlow() const;
/* constructor by (sign, exp, data)
similar to ieee 32 floating point
sign: 1bit
exp: 5bit
data: 10bit */
float16(const int& s, const int& e, const int& d);
/* default constructor
This initializes the 16bit floating point to 0. */
float16();
// constructor by a 32-bit float num
float16(const float& data);
// constructor by other datatypes
template<class T> float16(const T& data);
void Dump();
// convert float16 to float and return
float Float();
/* assignment function and template function
Float assignment function is the basic function.
The template assignment function force-converts other datatypes to float,
then calls the float assignment function.
The template assignment function now supports int and double. */
float16 operator = (const float& data);
float16 operator = (const float16& data);
template<class T> float16 operator = (const T& data);
// overload operator (less than) eg. a<b
// overload operator (less than) a < b
int operator < (const float16& data);
template<class T> int operator <(const T& data);
template<class T> int operator < (const T& data);
// overload opertator <= (less or equal than) a<=b
// overload opertator <= (less or equal than) a <= b
int operator <= (const float16& data);
template<class T> int operator <=(const T& data);
template<class T> int operator <= (const T& data);
// overload operator (greater than) eg. a>b
// overload operator (greater than) a > b
int operator > (const float16& data);
template<class T> int operator >(const T& data);
template<class T> int operator > (const T& data);
//overload opertator <= (greater or equal than) a>=b
// overload opertator >= (greater or equal than) a >= b
int operator >= (const float16& data);
template<class T> int operator >=(const T& data);
template<class T> int operator >= (const T& data);
// overload operator + (add) eg. a+b
// overload operator + (add) a + b
float16 operator + (const float16& data);
template<class T> float16 operator +(const T& data);
template<class T> float16 operator + (const T& data);
// overload operator += (add) eg. a+=b
// overload operator += (add) a += b
float16 operator += (const float16& data);
template<class T> float16 operator +=(const T& data);
template<class T> float16 operator += (const T& data);
// overload operator -(negetive) eg. -a
// overload operator -(negetive) -a
float16 operator - ();
// overload operator - (substraction) eg. a-b
// overload operator - (substraction) a - b
float16 operator - (const float16& data);
template<class T> float16 operator -(const T& data);
template<class T> float16 operator - (const T& data);
// overload operator -= (substraction) eg. a-=b
// overload operator -= (substraction) a -= b
float16 operator -= (const float16& data);
template<class T> float16 operator -=(const T& data);
template<class T> float16 operator -= (const T& data);
// overload operator * (multiple) eg. a*b
// overload operator * (multiple) a * b
float16 operator * (const float16& data);
template<class T> float16 operator *(const T& data);
template<class T> float16 operator * (const T& data);
// overload operator *= (multiple) eg. a*=b
// overload operator *= (multiple) a *= b
float16 operator *= (const float16& data);
template<class T> float16 operator *=(const T& data);
template<class T> float16 operator *= (const T& data);
// overload operator / (division) eg. a/b
// overload operator / (division) a / b
float16 GetInverse() const;
float16 operator / (const float16& data);
template<class T> float16 operator /(const T& data);
template<class T> float16 operator / (const T& data);
// overload operator /= (division) eg. a/=b
// overload operator /= (division) a /= b
float16 operator /= (const float16& data);
template<class T> float16 operator /=(const T& data);
template<class T> float16 operator /= (const T& data);
};
} // namespace nts(NiuTrans.Tensor)
#endif /* FLOAT16_H */
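The 1-5-10 layout above matches IEEE 754 binary16, except that the comment states an exponent offset of 16 where the standard uses a bias of 15. A minimal decoding sketch with the bias left as a parameter, so either convention can be checked against Float(); this is illustrative, not the library's implementation:

#include <cstdint>
#include <cmath>

// decode a 1-5-10 half-precision bit pattern into a 32-bit float
float DecodeHalf(uint16_t bits, int bias)
{
    int sign = (bits >> 15) & 0x1;    // 1 sign bit
    int exp = (bits >> 10) & 0x1F;    // 5 exponent bits
    int frac = bits & 0x3FF;          // 10 fraction bits
    float s = sign ? -1.0F : 1.0F;
    if (exp == 0)                     // subnormal: no implicit leading 1
        return s * std::ldexp((float)frac, 1 - bias - 10);
    return s * std::ldexp(1.0F + frac / 1024.0F, exp - bias);
}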