NiuTrans.Tensor / Commits

Commit 27635638
authored Jul 10, 2018 by xiaotong
parent 43331674

new code for ScaleAndShift
Showing 17 changed files with 155 additions and 87 deletions (+155, -87):
source/Main.cpp                          +27  -17
source/XGlobal.h                          +3   -3
source/XName.cpp                          +3   -7
source/XName.h                            +1   -0
source/XTensor.cpp                        +7   -0
source/XTensor.h                          +3   -0
source/core/math/ScaleAndShift.cpp       +46   -6
source/core/math/ScaleAndShift.cu        +24  -22
source/core/math/ScaleAndShift.cuh        +7   -8
source/core/math/ScaleAndShift.h         +12   -3
source/core/reduce/ReduceMean.cpp         +2   -1
source/core/reduce/ReduceVariance.cpp     +1   -1
source/function/Softmax.cu                +2   -2
source/sample/fnnlm/FNNLM.cpp             +1   -1
source/test/TLoss.cpp                     +8   -8
source/test/TMultiply.cpp                 +6   -6
source/test/TScaleAndShift.cpp            +2   -2
source/Main.cpp

@@ -28,8 +28,7 @@
 #include <time.h>
 #include "XTensor.h"
 #include "XDevice.h"
-#include "./sample/fnnlm/FNNLM.h"
+#include "sample/fnnlm/FNNLM.h"
 #include "test/Test.h"
 //#define CRTDBG_MAP_ALLOC
@@ -39,26 +38,15 @@
 using namespace nts;
 using namespace samplefnnlm;
 
+void SmallTest();
+
 int main( int argc, const char ** argv )
 {
     //_CrtSetBreakAlloc(78);
 
-    /* a tiny test */
-    if(0){
-        XTensor a;
-        XTensor b;
-        InitTensor2D(&a, 2, 2);
-        a.SetZeroAll();
-        a.Set2D(1.0F, 0, 0);
-        a.Set2D(1.0F, 1, 1);
-        b = Sum(a, Sum(a, a));
-        XTensor c = b;
-        a.Dump(stderr, "a: ");
-        b.Dump(stderr, "b: ");
-    }
+    if(1)
+        SmallTest();
 
     if(argc > 1 && !strcmp(argv[1], "-test"))
         Test();
@@ -75,3 +63,25 @@ int main( int argc, const char ** argv )
     return 0;
 }
+
+void SmallTest()
+{
+    XTensor a;
+    XTensor b;
+
+    InitTensor2D(&a, 2, 2);
+    a.SetZeroAll();
+    a.Set2D(1.0F, 0, 0);
+    a.Set2D(2.0F, 1, 1);
+
+    b = Sum(a, Multiply(a, a));
+    XTensor c = b;
+    XTensor d = b + a + Linear(c, 0.5F);
+
+    a.Dump(stderr, "a: ");
+    b.Dump(stderr, "b: ");
+    c.Dump(stderr, "c: ");
+    d.Dump(stderr, "d: ");
+}
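Note: SmallTest is where the new API surface of this commit comes together: Sum and Multiply are used as value-returning functions, and b + a + Linear(c, 0.5F) chains the new XTensor::operator+ (added in source/XTensor.cpp below) with the Linear alias for ScaleAndShift (added in source/core/math/ScaleAndShift.h below). A minimal sketch of the same pattern, assuming the NiuTrans.Tensor headers are on the include path and build:

    #include "XTensor.h"
    using namespace nts;

    void Demo()
    {
        XTensor x;
        InitTensor2D(&x, 2, 2);              /* 2 x 2 dense tensor */
        x.SetZeroAll();
        x.Set2D(3.0F, 0, 0);                 /* x[0][0] = 3 */

        XTensor y = Sum(x, Multiply(x, x));  /* y = x .* x + x, elementwise */
        XTensor z = y + x;                   /* operator+ forwards to Sum */
        z.Dump(stderr, "z: ");
    }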
source/XGlobal.h

@@ -74,7 +74,7 @@ namespace nts {
     { \
         if(!(x)) \
         { \
-            fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
+            fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
             exit(1); \
         } \
     } \
@@ -83,7 +83,7 @@ namespace nts {
     { \
         if(!(x)) \
         { \
-            fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
+            fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
             exit(1); \
         } \
     } \
@@ -91,7 +91,7 @@ namespace nts {
 #define ShowNTErrors(msg) \
     { \
         { \
-            fprintf(stderr, "Error! (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
+            fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
             exit(1); \
         } \
     } \
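Note: these macros are the library's assertion and failure mechanism; the change only reformats the message prefix from "Error!" to "[ERROR]". A usage sketch (the condition and message are illustrative, not from this commit):

    /* prints "[ERROR] calling 'a != NULL' (...): null input tensor" and exits */
    CheckNTErrors(a != NULL, "null input tensor");

    /* unconditional failure, e.g. an unimplemented branch */
    ShowNTErrors("TODO!");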
source/XName.cpp

@@ -19,15 +19,10 @@
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-05
  */
 
-#ifndef __XNAME_H__
-#define __XNAME_H__
+#include "XName.h"
 
 namespace nts { // namespace nts(NiuTrans.Tensor)
 
-#define MATH_ARITHMETIC 0x00001000
-#define MATH_SUM MATH_ARITHMETIC + 1
-#define MATH_MULTIPLY MATH_SUM + 1
-
 /* get operator name */
 const char * GetOPName(int type)
 {
@@ -36,6 +31,8 @@ const char * GetOPName(int type)
             return "M_SUM";
         else if(type == MATH_MULTIPLY)
             return "M_MULTIPLY";
+        else if(type == MATH_SCALEANDSHIFT)
+            return "M_SCALEANDSHIFT";
     }
 
     return "NULL";
@@ -43,4 +40,3 @@ const char * GetOPName(int type)
 }
 
 } // namespace nts(NiuTrans.Tensor)
-#endif // __XNAME_H__
source/XName.h

@@ -31,6 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_ARITHMETIC 10000
 #define MATH_SUM MATH_ARITHMETIC + 1
 #define MATH_MULTIPLY MATH_SUM + 1
+#define MATH_SCALEANDSHIFT MATH_MULTIPLY + 1
 
 /* get operator name */
 const char * GetOPName(int type);
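Note: operator IDs are sequential integer #defines allocated from MATH_ARITHMETIC, and GetOPName (declared here, defined in XName.cpp above) maps an ID back to a printable name. A small sketch:

    #include <stdio.h>
    #include "XName.h"
    using namespace nts;

    int main()
    {
        /* prints "op = M_SCALEANDSHIFT" */
        printf("op = %s\n", GetOPName(MATH_SCALEANDSHIFT));
        return 0;
    }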
source/XTensor.cpp

@@ -40,6 +40,7 @@
 #include "XBLAS.h"
 #include "core/shape/MergeBlockLists.h"
 #include "core/movement/CopyValues.h"
+#include "core/arithmetic/Sum.h"
 
 #ifdef USE_CUDA
@@ -308,6 +309,12 @@ XTensor& XTensor::operator= (const XTensor& tensor)
     return *this;
 }
 
+/* overloading of the plus-sign */
+XTensor XTensor::operator+ (const XTensor& tensor)
+{
+    return Sum(*this, tensor);
+}
+
 /*
 judge whether the two matrices are in the same type and size
 >> a - input tensor
source/XTensor.h

@@ -184,6 +184,9 @@ public:
     /* overloading of the equal-sign */
     XTensor& operator= (const XTensor& tensor);
 
+    /* overloading of the plus-sign */
+    XTensor operator+ (const XTensor& tensor);
+
     /* judge whether the two matrices are in the same type and size */
     static bool IsIdentical(XTensor * a, XTensor * b);
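Note: with operator+ forwarding to Sum (see the XTensor.cpp hunk above), tensor sums can be written infix, which is what makes the d = b + a + Linear(c, 0.5F) line in SmallTest compile. A minimal sketch, assuming a and b are initialized tensors of the same shape:

    XTensor c = a + b;       /* equivalent to Sum(a, b) */
    XTensor d = a + b + c;   /* left-associative: Sum(Sum(a, b), c) */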
source/core/math/ScaleAndShift.cpp

@@ -19,24 +19,28 @@
  * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
  */
 
 #include "../../XTensor.h"
+#include "../../XName.h"
 #include "../../XUtility.h"
 #include "ScaleAndShift.h"
 #include "ScaleAndShift.cuh"
 
 namespace nts { // namespace nts(NiuTrans.Tensor)
 
 /*
 scale and shift all tensor entires
-p = p * scale + shift
->> a - the tensor
+b = a * scale + shift
+>> a - the input tensor
+>> b - the output tensor
 >> scale - the scaler factor
 >> shift - the shift factor
 */
-void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
+void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
 {
 #ifdef USE_CUDA
     /* run it on GPUs */
     if(a->devID >= 0){
-        CudaScaleAndShift(a, scale, shift);
+        _CudaScaleAndShift(a, b, scale, shift);
         return;
     }
 #endif
@@ -46,7 +50,7 @@ void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
 
     /* sparse tensor */
     if(a->isSparse){
-        int num = a->GetNonzeroSize();
+        int num = a->unitNumNonZero;
         char * d = (char*)a->data + sizeof(int);
         char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
         for(int i = 0; i < num; i++){
@@ -65,4 +69,40 @@ void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
     }
 }
 
+/*
+scale and shift all tensor entires on site
+a = a * scale + shift
+>> a - the input/output tensor
+>> scale - the scaler factor
+>> shift - the shift factor
+*/
+void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
+{
+    _ScaleAndShift(a, a, scale, shift);
+}
+
+/*
+scale and shift all tensor entires
+b = a * scale + shift
+>> a - the input tensor
+>> b - the output tensor
+>> scale - the scaler factor
+>> shift - the shift factor
+*/
+XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
+{
+    XTensor b(&a);
+    b.SetTMP();
+
+    /* computation */
+    _ScaleAndShift(&a, &b, scale, shift);
+
+    /* tensor connections */
+    XLink::MakeLink(&a, NULL, &b, MATH_SUM);
+    XLink::AddParamToHead(&b, scale);
+    XLink::AddParamToHead(&b, shift);
+
+    return b;
+}
+
 } // namespace nts(NiuTrans.Tensor)
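Note: this file establishes the three-variant convention for the operator: _ScaleAndShift writes into a caller-provided output, _ScaleAndShiftMe works in place, and the reference-based ScaleAndShift allocates a temporary result, runs the computation, and records the operation with XLink for the tensor-connection graph (as committed, the link is tagged MATH_SUM rather than the newly added MATH_SCALEANDSHIFT). A usage sketch, assuming a is an initialized dense tensor:

    XTensor b(&a);                        /* b gets the same shape as a */
    _ScaleAndShift(&a, &b, 2.0F, 1.0F);   /* b = a * 2 + 1, explicit output */
    _ScaleAndShiftMe(&b, 2.0F, 1.0F);     /* b = b * 2 + 1, in place */
    XTensor c = ScaleAndShift(a, 2.0F);   /* c = a * 2, shift defaults to 0 */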
source/core/math/ScaleAndShift.cu

@@ -19,7 +19,6 @@
  * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
  */
 
-#include "ScaleAndShift.h"
 #include "ScaleAndShift.cuh"
 #include "../../XDevice.h"
@@ -28,68 +27,71 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 
 /*
-scale and shift all tensor entires p = p * scale + shift (CUDA Kernel)
->> d - the data array
+scale and shift all tensor entires b = a * scale + shift (CUDA Kernel)
+>> a - the input data array
+>> b - the output data array
 >> size - the size of d
 >> scale - how much we want to scale it
 >> shift - how much we want to shift it
 */
 template<bool isUnitScale, bool isZeroShift>
 __global__
-void KernelScaleAndShift(DTYPE * d, int size, DTYPE scale, DTYPE shift)
+void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
     if (i < size){
         if (isUnitScale && !isZeroShift){
-            d[i] = d[i] + shift;
+            b[i] = a[i] + shift;
         }
         else if (isUnitScale && isZeroShift) {
-            d[i] = d[i];
+            b[i] = a[i];
         }
         else if (!isUnitScale && isZeroShift) {
-            d[i] = d[i] * scale;
+            b[i] = a[i] * scale;
        }
        else {
-            d[i] = d[i] * scale + shift;
+            b[i] = a[i] * scale + shift;
        }
    }
 }
 
 /*
-scale and shift all matrix entires p = p * scale + shift (CUDA Kernel)
+scale and shift all tensor entires p = p * scale + shift (CUDA Kernel)
 This is for float16 computation
->> d - the data array
+>> a - the input data array
+>> b - the output data array
 >> size - the size of d
 >> scale - how much we want to scale it
 >> shift - how much we want to shift it
 */
 __global__
-void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)
+void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
     if(i < size)
-        d[i] = __hadd(__hmul(d[i], scale), shift);
+        b[i] = __hadd(__hmul(a[i], scale), shift);
 #else
     if (i < size)
-        d[i] = __float2half(__half2float(d[i]) * __half2float(scale) + __half2float(shift));
+        b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
 #endif
 }
 
 /*
-scale and shift all matrix entires
-p = p * scale + shift
->> a - the tensor
+scale and shift all tensor entires
+b = a * scale + shift
+>> a - the input tensor
+>> b - the output tensor
 >> scale - the scaler factor
 >> shift - the shift factor
 */
-void CudaScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
+void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
 {
     /* sparse tensor */
     if(a->isSparse){
-        // TODO
+        ShowNTErrors("TODO!");
     }
     /* dense tensor */
     else{
@@ -106,20 +108,20 @@ void CudaScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift)
         if(a->dataType == DEFAULT_DTYPE){
             if(scale == 1.0F && shift == 0)
-                KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, a->unitNum, scale, shift);
+                KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
             else if (scale == 1.0F && shift != 0)
-                KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, a->unitNum, scale, shift);
+                KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
             else if(scale != 1.0F && shift == 0)
-                KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, a->unitNum, scale, shift);
+                KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
             else
-                KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, a->unitNum, scale, shift);
+                KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         }
         else if(a->dataType == X_FLOAT16){
             unsigned short scale2 = FloatToFloat16(scale);
             unsigned short shift2 = FloatToFloat16(shift);
             __half * scaleft16p = (__half*)&scale2;
             __half * shiftft16p = (__half*)&shift2;
-            KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, a->unitNum, *scaleft16p, *shiftft16p);
+            KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, *scaleft16p, *shiftft16p);
         }
         else{
             ShowNTErrors("TODO!");
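Note: the two bool template parameters make scale-by-1 and shift-by-0 compile-time cases, so each of the four instantiations has its inner branch resolved at compile time and _CudaScaleAndShift just selects the cheapest one from the runtime values of scale and shift. A generic launch-geometry sketch (the names and the thread count here are illustrative; the real code derives blocks and threads from the XDevice descriptor):

    int threads = 256;                            /* threads per block, illustrative */
    int blocks = (size + threads - 1) / threads;  /* ceil(size / threads) */
    KernelScaleAndShift<false, false><<<blocks, threads>>>(a, b, size, scale, shift);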
source/core/math/ScaleAndShift.cuh

@@ -22,23 +22,22 @@
 #ifndef __SCALEANDSHIFT_CUH__
 #define __SCALEANDSHIFT_CUH__
 
-#include "../../XTensor.h"
+#include "ScaleAndShift.h"
 
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 
 #ifdef USE_CUDA
 
-/* scale and shift all tensor entires p = p * scale + shift (CUDA Kernel) */
+/* scale and shift all tensor entires b = a * scale + shift (CUDA Kernel) */
 __global__
-void KernelScaleAndShift(DTYPE * d, int size, DTYPE scale, DTYPE shift);
+void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift);
 
-/* scale and shift all tensor entires p = p * scale + shift (CUDA Kernel) with float16 data type */
+/* scale and shift all tensor entires b = a * scale + shift (CUDA Kernel) with float16 data type */
 __global__
-void KernelScaleAndShift(__half * d, int size, __half scale, __half shift);
+void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift);
 
-/* scale and shift all tensor entires (cuda version) */
-extern "C"
-void CudaScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift);
+/* scale and shift all tensor entires b = a * scale + shift (cuda version) */
+void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift);
 
 #endif // USE_CUDA
source/core/math/ScaleAndShift.h

@@ -26,9 +26,18 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 
-/* scale and shift all tensor entires */
-extern "C"
-void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift);
+#define _Linear _ScaleAndShift
+#define _LinearMe _ScaleAndShiftMe
+#define Linear ScaleAndShift
+
+/* scale and shift all tensor entires b = a * scale + shift */
+void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift = 0);
+
+/* scale and shift all tensor entires on site a = a * scale + shift */
+void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);
+
+/* scale and shift all tensor entires b = a * scale + shift, and return the result tensor b */
+XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
 
 } // namespace nts(NiuTrans.Tensor)
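Note: the default arguments mean the shift can be omitted, and the Linear defines expose the three variants under the names Main.cpp uses. A small sketch, assuming c is an initialized tensor:

    XTensor half = Linear(c, 0.5F);   /* half = c * 0.5 + 0 (shift defaults to 0) */
    _LinearMe(&c, 0.25F);             /* c = c * 0.25, in place */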
source/core/reduce/ReduceMean.cpp

@@ -40,7 +40,7 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
     int num = input->dimSizeRDI[dimRDI];
 
     ReduceSum(input, output, dim);
-    ScaleAndShift(output, (DTYPE)1/num, 0);
+    _ScaleAndShiftMe(output, (DTYPE)1/num, 0);
 }
 
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
source/core/reduce/ReduceVariance.cpp

@@ -39,7 +39,7 @@ void ReduceVariance(XTensor * input, XTensor * output, int dim, XTensor * mean)
     int dimRDI = input->order - dim - 1;
     int num = input->dimSizeRDI[dimRDI];
     ReduceSum(input, output, dim, mean, 2.0F);
-    ScaleAndShift(output, (DTYPE)1/num, 0);
+    _ScaleAndShiftMe(output, (DTYPE)1/num, 0);
 }
 
 } // namespace nts(NiuTrans.Tensor)
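Note: both reductions follow the same two-step pattern, with the division by the number of reduced items now done through the in-place pointer API, since the bare name ScaleAndShift is taken by the new value-returning variant. Reading the call sites, ReduceSum accumulates along dim (with the mean and the trailing 2.0F apparently producing squared deviations in the variance case), and _ScaleAndShiftMe then scales by 1/num:

    mean_j     = (1/n) * sum_i input_ij
    variance_j = (1/n) * sum_i (input_ij - mean_j)^2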
source/function/Softmax.cu

@@ -288,7 +288,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             beta->data = mem->AllocBuf(mem->devID, beta->unitNum * beta->unitSize);
 
             /* \beta = \sum_i (dE/dy_i * y_i) */
-            Multiply(dedy, y, ytmp, 0);
+            _Multiply(dedy, y, ytmp, 0, 0);
             ReduceSum(ytmp, beta, leadDim);
 
             /* ytmp = dE/dy_j - \beta */
@@ -296,7 +296,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             _Sum(dedy, ytmp, ytmp, -1.0F);
 
             /* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */
-            Multiply(y, ytmp, dedx, 0);
+            _Multiply(y, ytmp, dedx, 0, 0);
 
             mem->ReleaseBuf(mem->devID, y->unitNum * y->unitSize);
             mem->ReleaseBuf(mem->devID, beta->unitNum * beta->unitSize);
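Note: across this commit the plain Multiply calls are renamed to the underscored _Multiply, with one extra argument inserted before leadingDim (the first 0 here). The header is not shown on this page, so the following is a hypothetical reconstruction inferred from the call sites alone, by analogy with the coefficient carried by _Sum(dedy, ytmp, ytmp, -1.0F):

    /* hypothetical reconstruction of the renamed function's signature */
    void _Multiply(const XTensor * a, const XTensor * b, XTensor * c,
                   DTYPE alpha = 0, int leadingDim = 0);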
source/sample/fnnlm/FNNLM.cpp

@@ -485,7 +485,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
     InitTensor(&probs, &output);
 
     /* probs[i,j] = output[i,j] * gold[i,j] */
-    Multiply(&output, &gold, &probs, 0);
+    _Multiply(&output, &gold, &probs);
 
     /* probability of each word */
     XTensor wprobs;
source/test/TLoss.cpp

@@ -54,8 +54,8 @@ bool TestLoss1()
     /* initialize variables */
     output->SetZeroAll();
     gold->SetZeroAll();
-    ScaleAndShift(output, 1, 1);
-    ScaleAndShift(gold, 1, 2);
+    _ScaleAndShiftMe(output, 1, 1);
+    _ScaleAndShiftMe(gold, 1, 2);
 
     DTYPE error;
     error = LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
@@ -74,8 +74,8 @@ bool TestLoss1()
     /* Initialize variables */
     outputGPU->SetZeroAll();
     goldGPU->SetZeroAll();
-    ScaleAndShift(outputGPU, 1, 1);
-    ScaleAndShift(goldGPU, 1, 2);
+    _ScaleAndShiftMe(outputGPU, 1, 1);
+    _ScaleAndShiftMe(goldGPU, 1, 2);
 
     /* call LossCompute function */
     error = LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
@@ -131,8 +131,8 @@ bool TestLoss2()
     /* initialize variables */
     output->SetZeroAll();
     gold->SetZeroAll();
-    ScaleAndShift(output, 1, 1);
-    ScaleAndShift(gold, 1, 2);
+    _ScaleAndShiftMe(output, 1, 1);
+    _ScaleAndShiftMe(gold, 1, 2);
 
     DTYPE error;
     error = LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
@@ -151,8 +151,8 @@ bool TestLoss2()
     /* Initialize variables */
     outputGPU->SetZeroAll();
     goldGPU->SetZeroAll();
-    ScaleAndShift(outputGPU, 1, 1);
-    ScaleAndShift(goldGPU, 1, 2);
+    _ScaleAndShiftMe(outputGPU, 1, 1);
+    _ScaleAndShiftMe(goldGPU, 1, 2);
 
     /* call LossCompute function */
     error = LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
source/test/TMultiply.cpp

@@ -81,7 +81,7 @@ bool TestMultiply1()
     t->SetZeroAll();
 
     /* call MultiplyElementWise function */
-    Multiply(s1, s2, t, 0);
+    _Multiply(s1, s2, t, 0, 0);
 
     /* check results */
     cpuTest = t->CheckData(answer, tUnitNum);
@@ -101,7 +101,7 @@ bool TestMultiply1()
     tGPU->SetZeroAll();
 
     /* call MultiplyElementWise function */
-    Multiply(sGPU1, sGPU2, tGPU, 0);
+    _Multiply(sGPU1, sGPU2, tGPU, 0, 0);
 
     /* check results */
     gpuTest = tGPU->CheckData(answer, tUnitNum);
@@ -189,7 +189,7 @@ bool TestMultiply2()
     t->SetZeroAll();
 
     /* call MultiplyElementWise function */
-    Multiply(s1, s2, t, 0);
+    _Multiply(s1, s2, t, 0, 0);
 
     /* check results */
     cpuTest = t->CheckData(answer, tUnitNum);
@@ -209,7 +209,7 @@ bool TestMultiply2()
     tGPU->SetZeroAll();
 
     /* call MultiplyElementWise function */
-    Multiply(sGPU1, sGPU2, tGPU, 0);
+    _Multiply(sGPU1, sGPU2, tGPU, 0, 0);
 
     /* check results */
     gpuTest = tGPU->CheckData(answer, tUnitNum);
@@ -296,7 +296,7 @@ bool TestMultiply3()
     t->SetZeroAll();
 
     /* call MultiplyElementWise function */
-    Multiply(s1, s2, t, 1);
+    _Multiply(s1, s2, t, 0, 1);
 
     /* check results */
     cpuTest = t->CheckData(answer, tUnitNum);
@@ -316,7 +316,7 @@ bool TestMultiply3()
     tGPU->SetZeroAll();
 
     /* call MultiplyElementWise function */
-    Multiply(sGPU1, sGPU2, tGPU, 1);
+    _Multiply(sGPU1, sGPU2, tGPU, 0, 1);
 
     /* check results */
     gpuTest = tGPU->CheckData(answer, tUnitNum);
source/test/TScaleAndShift.cpp

@@ -57,7 +57,7 @@ bool TestScaleAndShift1()
     s->SetData(sData, sUnitNum);
 
     /* call ScaleAndShift function */
-    ScaleAndShift(s, scaleFactor, shiftFactor);
+    _ScaleAndShift(s, s, scaleFactor, shiftFactor);
 
     /* check results */
     cpuTest = s->CheckData(answer, sUnitNum);
@@ -73,7 +73,7 @@ bool TestScaleAndShift1()
     sGPU->SetData(sData, sUnitNum);
 
     /* call ScaleAndShift function */
-    ScaleAndShift(sGPU, scaleFactor, shiftFactor);
+    _ScaleAndShift(sGPU, sGPU, scaleFactor, shiftFactor);
 
     /* check results */
     gpuTest = sGPU->CheckData(answer, sUnitNum);
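Note: passing the same tensor as both input and output, as these tests do, is the in-place case; _ScaleAndShift(s, s, ...) is exactly what _ScaleAndShiftMe(s, ...) expands to. A sketch of what such a test checks, with illustrative data (the real sData, answer, and factor values are not shown on this page):

    DTYPE sData[4]  = {0.0F, 1.0F, 2.0F, 3.0F};
    DTYPE answer[4] = {0.5F, 2.5F, 4.5F, 6.5F};   /* each = value * 2.0 + 0.5 */

    s->SetData(sData, 4);
    _ScaleAndShift(s, s, 2.0F, 0.5F);
    bool ok = s->CheckData(answer, 4);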