Commit 38bff350 by xuchen

merge with liyinqiao branch

parent 509c0233
...@@ -32,10 +32,6 @@ ...@@ -32,10 +32,6 @@
//#include <stdlib.h> //#include <stdlib.h>
//#include <crtdbg.h> //#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
using namespace nts; using namespace nts;
using namespace fnnlm; using namespace fnnlm;
using namespace transformer; using namespace transformer;
......
...@@ -200,7 +200,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -200,7 +200,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP); labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec); lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAll(lossTensor); float lossBatch = ReduceSumAllValue(lossTensor);
DTYPE lossLocal = lossBatch / wc; DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F); bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
...@@ -345,7 +345,7 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model) ...@@ -345,7 +345,7 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
XTensor lossTensor; XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, 0); labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec); lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAll(lossTensor); float lossBatch = ReduceSumAllValue(lossTensor);
/* dump the test result */ /* dump the test result */
for(int s = 0; s < bSize; s++){ for(int s = 0; s < bSize; s++){
......
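Note: a minimal sketch of the renamed call above, assuming ReduceSumAllValue keeps the old behaviour of summing every element and returning the result as a plain float (include paths are assumptions and may differ per build layout):

    #include "XTensor.h"                         // assumed include paths
    #include "core/reduce/ReduceSumAll.h"
    using namespace nts;

    void LossDemo()
    {
        XTensor loss;
        InitTensor2D(&loss, 8, 8, X_FLOAT);      // stand-in for the per-token loss tensor
        loss.SetZeroAll();
        float total = ReduceSumAllValue(loss);   // scalar sum of all elements (0.0F here)
        (void)total;
    }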
...@@ -130,6 +130,39 @@ void InitTensor(XTensor * tensor, ...@@ -130,6 +130,39 @@ void InitTensor(XTensor * tensor,
} }
/* /*
initialize a scalar V2
>> tensor - the tensor we intend to initialize
>> myDataType - the data type of the tensor (e.g., int, float, or double)
>> myDevID - when myMem is NULL, myDevID specifies the device
on which we allocate the data
>> myMem - the memory pool used for allocating the data array;
myMem = NULL means that the tensor is allocated on
the device dynamically, rather than from the memory pool
*/
void InitTensor0DV2(XTensor * tensor, const TENSOR_DATA_TYPE myDataType, const int myDevID, XMem * myMem)
{
int dims[MAX_TENSOR_DIM_NUM];
InitTensorV2(tensor, 0, dims, myDataType, 1.0F, myDevID, myMem);
}
/*
initialize a scalar
>> tensor - the tensor we intend to initialize
>> myDataType - the data type of the tensor (e.g., int, float, or double)
>> myDevID - the device on which we allocate the data
>> isEnableGrad - whether gradient computation is enabled for this tensor
*/
void InitTensor0D(XTensor * tensor, const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[MAX_TENSOR_DIM_NUM];
InitTensor(tensor, 0, dims, myDataType, myDevID, isEnableGrad);
}
/*
initialize a dense tensor V2 initialize a dense tensor V2
>> tensor - the tensor we intend to initialize >> tensor - the tensor we intend to initialize
>> num - number of elements >> num - number of elements
...@@ -551,6 +584,37 @@ XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnable ...@@ -551,6 +584,37 @@ XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnable
} }
/* /*
generate a scalar V2
>> myDataType - the data type of the tensor (e.g., int, float, or double)
>> myDevID - when myMem is NULL, myDevID specifies the device
on which we allocate the data
>> myMem - the memory pool used for allocating the data array;
myMem = NULL means that the tensor is allocated on
the device dynamically, rather than from the memory pool.
*/
XTensor * NewTensor0DV2(const TENSOR_DATA_TYPE myDataType, const int myDevID, XMem * myMem)
{
int dims[MAX_TENSOR_DIM_NUM];
return NewTensorV2(0, dims, myDataType, 1.0F, myDevID, myMem);
}
/*
generate a scalar
>> myDataType - the data type of the tensor (e.g., int, float, or double)
>> myDevID - the device on which we allocate the data
>> isEnableGrad - whether gradient computation is enabled for this tensor
*/
XTensor * NewTensor0D(const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[MAX_TENSOR_DIM_NUM];
return NewTensor(0, dims, myDataType, myDevID, isEnableGrad);
}
/*
generate a dense vector V2 generate a dense vector V2
>> num - number of entries >> num - number of entries
>> myDataType - unit size (e.g., int, float, and double) >> myDataType - unit size (e.g., int, float, and double)
...@@ -799,7 +863,7 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData) ...@@ -799,7 +863,7 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM); memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
if(a->order > 0) if(a->order >= 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order); memcpy(dims, a->dimSize, sizeof(int) * a->order);
if(!isFilledData) if(!isFilledData)
...@@ -810,7 +874,6 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData) ...@@ -810,7 +874,6 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
a->devID, a->mem); a->devID, a->mem);
return newTensor; return newTensor;
} }
/* /*
......
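Note: a small usage sketch of the scalar (0d) constructors added above, using the Set0D/Get0D accessors declared later in this commit; the include path and the NewTensor/DelTensor ownership pairing are assumptions based on the rest of the library:

    #include "XTensor.h"                 // include path is an assumption
    using namespace nts;

    void ScalarDemo()
    {
        /* heap-allocated scalar on the CPU (devID = -1) */
        XTensor * s = NewTensor0D(X_FLOAT, -1);
        s->Set0D(2.0F);                  // write the single cell
        DTYPE v = s->Get0D();            // read it back: 2.0F
        (void)v;
        DelTensor(s);                    // assumed to pair with NewTensor* as elsewhere in the library

        /* in-place initialization of a stack object, integer variant */
        XTensor t;
        InitTensor0D(&t, X_INT);
        t.Set0DInt(3);
        int iv = t.Get0DInt();           // 3
        (void)iv;
    }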
...@@ -26,6 +26,9 @@ ...@@ -26,6 +26,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* default settings */
#define X_ENABLE_GRAD true
/* /*
* we define the "new and delete" functions below * we define the "new and delete" functions below
*/ */
...@@ -38,7 +41,13 @@ void InitTensorV2(XTensor * tensor, ...@@ -38,7 +41,13 @@ void InitTensorV2(XTensor * tensor,
/* initialize a dense XTensor */ /* initialize a dense XTensor */
void InitTensor(XTensor * tensor, void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a scalar V2 */
void InitTensor0DV2(XTensor * tensor, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a scalar */
void InitTensor0D(XTensor * tensor, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense vector V2 */ /* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num, void InitTensor1DV2(XTensor * tensor, const int num,
...@@ -46,7 +55,7 @@ void InitTensor1DV2(XTensor * tensor, const int num, ...@@ -46,7 +55,7 @@ void InitTensor1DV2(XTensor * tensor, const int num,
/* initialize a dense vector */ /* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num, void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense matrix V2 */ /* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum, void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
...@@ -54,7 +63,7 @@ void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum, ...@@ -54,7 +63,7 @@ void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
/* initialize a dense matrix */ /* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum, void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 3d tensor V2 */ /* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2, void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
...@@ -62,7 +71,7 @@ void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2, ...@@ -62,7 +71,7 @@ void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 3d tensor */ /* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2, void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 4d tensor V2 */ /* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
...@@ -70,7 +79,7 @@ void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, ...@@ -70,7 +79,7 @@ void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 4d tensor */ /* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 5d tensor V2 */ /* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4, void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
...@@ -78,7 +87,7 @@ void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, ...@@ -78,7 +87,7 @@ void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 5d tensor */ /* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4, void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a tensor with a reference tensor V2 */ /* initialize a tensor with a reference tensor V2 */
void InitTensorV2(XTensor * tensor, const XTensor * reference); void InitTensorV2(XTensor * tensor, const XTensor * reference);
...@@ -98,7 +107,7 @@ XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DAT ...@@ -98,7 +107,7 @@ XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DAT
/* generate a dense XTensor */ /* generate a dense XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a XTensor which allocates data on the buffer V2 */ /* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize, XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
...@@ -107,20 +116,26 @@ XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize, ...@@ -107,20 +116,26 @@ XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
/* generate a dense XTensor which allocates data on the buffer */ /* generate a dense XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize, XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a XTensor which allocates data on the buffer V2 */ /* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID, XMem * myMem); XTensor * NewTensorBufV2(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer */ /* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnableGrad = true); XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a scalar V2 */
XTensor * NewTensor0DV2(const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* generate a scalar */
XTensor * NewTensor0D(const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense vector V2 */ /* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL); XMem * myMem = NULL);
/* generate a dense vector */ /* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true); XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense matrix V2 */ /* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum, XTensor * NewTensor2DV2(const int rowNum, const int colNum,
...@@ -130,7 +145,7 @@ XTensor * NewTensor2DV2(const int rowNum, const int colNum, ...@@ -130,7 +145,7 @@ XTensor * NewTensor2DV2(const int rowNum, const int colNum,
/* generate a dense matrix */ /* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum, XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 3d tensor V2 */ /* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2, XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
...@@ -140,7 +155,7 @@ XTensor * NewTensor3DV2(const int d0, const int d1, const int d2, ...@@ -140,7 +155,7 @@ XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
/* generate a dense 3d tensor */ /* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2, XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 4d tensor V2 */ /* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3, XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
...@@ -150,7 +165,7 @@ XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3, ...@@ -150,7 +165,7 @@ XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
/* generate a dense 4d tensor */ /* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3, XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 5d tensor V2 */ /* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4, XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
...@@ -160,10 +175,10 @@ XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, ...@@ -160,10 +175,10 @@ XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3,
/* generate a dense 5d tensor */ /* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4, XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true); const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense vector by range */ /* generate a dense vector by range */
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = true); XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a copy of XTensor (with a reference to a given tensor) */ /* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true); XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
......
...@@ -78,7 +78,7 @@ namespace nts { ...@@ -78,7 +78,7 @@ namespace nts {
if(!(x)) \ if(!(x)) \
{ \ { \
fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \ fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
exit(1); \ throw; \
} \ } \
} \ } \
...@@ -87,7 +87,7 @@ namespace nts { ...@@ -87,7 +87,7 @@ namespace nts {
if(!(x)) \ if(!(x)) \
{ \ { \
fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \ fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
exit(1); \ throw; \
} \ } \
} \ } \
...@@ -95,7 +95,7 @@ namespace nts { ...@@ -95,7 +95,7 @@ namespace nts {
{ \ { \
{ \ { \
fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \ fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
exit(1); \ throw; \
} \ } \
} \ } \
......
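Note: for reference, a typical call site of the macros touched above; the only behavioural change in this hunk is that the failure branch now executes a bare throw instead of exit(1):

    #include "XTensor.h"   // pulls in the header that defines CheckNTErrors
    using namespace nts;

    /* sketch only: CheckNTErrors(condition, message) as used throughout this diff */
    void CheckShapes(const XTensor &a, const XTensor &b)
    {
        CheckNTErrors(a.unitNum == b.unitNum, "Unmatched tensors!");
        CheckNTErrors(a.dataType == b.dataType, "Unmatched data types!");
        /* with no exception currently in flight, a bare `throw;` terminates
           the process via std::terminate, much as exit(1) did before */
    }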
...@@ -167,7 +167,7 @@ void XLink::SetType(int id) ...@@ -167,7 +167,7 @@ void XLink::SetType(int id)
type[0] = 0; type[0] = 0;
strcpy(type, GetOPName(id)); strcpy(type, GetOPName(id));
typeID = id; typeID = id;
if(id != 0){ if (id != 0) {
CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!"); CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
} }
} }
......
...@@ -249,26 +249,6 @@ inline int TensorListBase<T>::FindFirst(const T& item) ...@@ -249,26 +249,6 @@ inline int TensorListBase<T>::FindFirst(const T& item)
return -1; return -1;
} }
template <>
inline int TensorListBase<Example>::FindFirst(const Example& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
template <>
inline int TensorListBase<Result>::FindFirst(const Result& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
/* clear the data array */ /* clear the data array */
template <typename T> template <typename T>
void TensorListBase<T>::Clear() void TensorListBase<T>::Clear()
...@@ -383,8 +363,7 @@ template struct TensorListBase<long>; ...@@ -383,8 +363,7 @@ template struct TensorListBase<long>;
template struct TensorListBase<float>; template struct TensorListBase<float>;
template struct TensorListBase<short>; template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>; template struct TensorListBase<XTensor*>;
template struct TensorListBase<Result>; template struct TensorListBase<uint64_t>;
template struct TensorListBase<Example>;
template struct TensorListBase<void*>; template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
#include "XMem.h" #include "XMem.h"
#include "XGlobal.h" #include "XGlobal.h"
#include <cstdint>
#ifndef __TensorList_H__ #ifndef __TensorList_H__
#define __TensorList_H__ #define __TensorList_H__
...@@ -118,7 +120,14 @@ public: ...@@ -118,7 +120,14 @@ public:
void Shuffle(int nround = 10, int beg = -1, int len = 0); void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */ /* short */
T& operator[] (int i) { return GetItem(i); }; T& operator[] (int i) {
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return items[count + i];
else
return items[i];
};
T& Get(int i) { return GetItem(i); }; T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); }; void Set(int i, T item) { SetItem(i, item); };
}; };
...@@ -132,19 +141,7 @@ typedef TensorListBase<char*> StrList; ...@@ -132,19 +141,7 @@ typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList; typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList; typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList; typedef TensorListBase<short> ShortList;
typedef TensorListBase<uint64_t> UInt64List;
struct Example {
int id;
IntList data;
};
struct Result {
int id;
IntList data;
};
typedef TensorListBase<Result> ResultList;
typedef TensorListBase<Example> ExampleList;
typedef TensorListBase<XTensor*> TensorList; typedef TensorListBase<XTensor*> TensorList;
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
......
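Note: a short sketch of the bounds-checked operator[] added above, including the new negative-index form; IntList is the typedef kept in this hunk, Add() is assumed to have its usual append semantics, and the header path is an assumption:

    #include "XList.h"        // header guarded by __TensorList_H__ above; path is an assumption
    using namespace nts;

    void ListDemo()
    {
        IntList list;
        list.Add(10);
        list.Add(20);
        list.Add(30);
        int first = list[0];      // 10
        int last  = list[-1];     // 30: a negative i now maps to items[count + i]
        (void)first; (void)last;
    }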
...@@ -53,6 +53,8 @@ const char * GetOPName(int type) ...@@ -53,6 +53,8 @@ const char * GetOPName(int type)
return "M_TAN"; return "M_TAN";
else if (type == MATH_ROUND) else if (type == MATH_ROUND)
return "M_ROUND"; return "M_ROUND";
else if (type == MATH_RECIPROCAL)
return "M_RECIPROCAL";
else if (type == MATH_CLIP) else if (type == MATH_CLIP)
return "M_CLIP"; return "M_CLIP";
else if (type == MATH_DIV) else if (type == MATH_DIV)
...@@ -105,6 +107,8 @@ const char * GetOPName(int type) ...@@ -105,6 +107,8 @@ const char * GetOPName(int type)
return "R_REDUCEMEAN"; return "R_REDUCEMEAN";
else if (type == REDUCE_REDUCESUM) else if (type == REDUCE_REDUCESUM)
return "R_REDUCESUM"; return "R_REDUCESUM";
else if (type == REDUCE_REDUCESUMALL)
return "R_REDUCESUMALL";
else if (type == REDUCE_REDUCESUMSQUARED) else if (type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED"; return "R_REDUCESUMSQUARED";
else if (type == REDUCE_REDUCEVARIANCE) else if (type == REDUCE_REDUCEVARIANCE)
...@@ -113,6 +117,8 @@ const char * GetOPName(int type) ...@@ -113,6 +117,8 @@ const char * GetOPName(int type)
else if ((type & DATA_BASE) != 0){ else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT) if (type == GETANDSET_SELECT)
return "G_SELECT"; return "G_SELECT";
else if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE";
else if (type == MOVEMENT_COPYINDEXED) else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED"; return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES) else if (type == MOVEMENT_COPYVALUES)
......
...@@ -44,8 +44,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -44,8 +44,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_COS MATH_SIN + 1 #define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1 #define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1 #define MATH_ROUND MATH_TAN + 1
#define MATH_RECIPROCAL MATH_ROUND + 1
#define MATH_CLIP MATH_ROUND + 1 #define MATH_CLIP MATH_RECIPROCAL + 1
#define MATH_DIV MATH_CLIP + 1 #define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1 #define MATH_DIVDIM MATH_DIV + 1
#define MATH_MASK MATH_DIVDIM + 1 #define MATH_MASK MATH_DIVDIM + 1
...@@ -76,7 +77,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -76,7 +77,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define REDUCE_REDUCEMAX REDUCE + 1 #define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1 #define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1 #define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1 #define REDUCE_REDUCESUMALL REDUCE_REDUCESUM + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUMALL + 1
#define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1 #define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
/* data and shape related operations */ /* data and shape related operations */
......
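Note: a quick check that the two op ids added above resolve through GetOPName as expected:

    #include <cstdio>
    #include "XName.h"        // defines the MATH_* / REDUCE_* ids and GetOPName
    using namespace nts;

    int main()
    {
        printf("%s\n", GetOPName(MATH_RECIPROCAL));      // M_RECIPROCAL
        printf("%s\n", GetOPName(REDUCE_REDUCESUMALL));  // R_REDUCESUMALL
        return 0;
    }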
...@@ -147,7 +147,11 @@ void XStream::StreamSynchronize() ...@@ -147,7 +147,11 @@ void XStream::StreamSynchronize()
void XStream::ThreadSynchronize() void XStream::ThreadSynchronize()
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize(); cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif #endif
} }
......
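Note: not part of this commit, but one plausible way the TODO branch above could later be filled, assuming cudaDeviceSynchronize remains the documented replacement for the legacy call:

    #include <cuda_runtime.h>

    /* sketch of a ThreadSynchronize body without the TODO */
    void ThreadSynchronizeSketch()
    {
    #if CUDART_VERSION < 10000
        cudaThreadSynchronize();
    #else
        cudaDeviceSynchronize();   // assumption: drop-in replacement for the removed call
    #endif
    }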
...@@ -29,7 +29,6 @@ ...@@ -29,7 +29,6 @@
#define __XTENSOR_H__ #define __XTENSOR_H__
#include "XGlobal.h" #include "XGlobal.h"
#include "XMem.h"
#include "XPRunner.h" #include "XPRunner.h"
#include "XStream.h" #include "XStream.h"
#include "XHeap.h" #include "XHeap.h"
...@@ -276,6 +275,18 @@ public: ...@@ -276,6 +275,18 @@ public:
/* return a tensor that datatype is same as the special tensor */ /* return a tensor that datatype is same as the special tensor */
XTensor TypeAs(const XTensor input); XTensor TypeAs(const XTensor input);
/* return a tensor that datatype is integer */
XTensor Int();
/* return a tensor that datatype is float */
XTensor Float();
/* return a tensor that datatype is float16 */
XTensor Float16();
/* return a tensor that datatype is double */
XTensor Double();
/* get the number of items in the data array */ /* get the number of items in the data array */
int GetSize() const; int GetSize() const;
...@@ -331,6 +342,9 @@ public: ...@@ -331,6 +342,9 @@ public:
/* get the pointer to a cell */ /* get the pointer to a cell */
void * GetCell(int index[], int size = -1) const; void * GetCell(int index[], int size = -1) const;
/* get the default type value of a cell in a 0d tensor */
DTYPE Get0D() const;
/* get the default type value of a cell in a 1d tensor */ /* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i) const; DTYPE Get1D(int i) const;
...@@ -343,6 +357,9 @@ public: ...@@ -343,6 +357,9 @@ public:
/* get the int value of a cell by its offset */ /* get the int value of a cell by its offset */
int GetInt(int offset) const; int GetInt(int offset) const;
/* get the int value of a cell in a 0d tensor */
int Get0DInt() const;
/* get the int value of a cell in a 1d tensor */ /* get the int value of a cell in a 1d tensor */
int Get1DInt(int i) const; int Get1DInt(int i) const;
...@@ -364,6 +381,9 @@ public: ...@@ -364,6 +381,9 @@ public:
/* set the value of a cell with its offset in the array */ /* set the value of a cell with its offset in the array */
bool Set(DTYPE value, int offset); bool Set(DTYPE value, int offset);
/* set the value of a cell in a 0d tensor */
bool Set0D(DTYPE value);
/* set the value of a cell in a 1d tensor */ /* set the value of a cell in a 1d tensor */
bool Set1D(DTYPE value, int i); bool Set1D(DTYPE value, int i);
...@@ -379,6 +399,9 @@ public: ...@@ -379,6 +399,9 @@ public:
/* set the integer value of a cell */ /* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1); bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 0d tensor */
bool Set0DInt(int value);
/* set the integer value of a cell in a 1d tensor */ /* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i); bool Set1DInt(int value, int i);
......
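Note: a brief sketch exercising the new dtype helpers and 0d accessors declared above; per their comments Int()/Float() return converted copies, though the exact conversion path is an assumption:

    #include "XTensor.h"
    using namespace nts;

    void TypeDemo()
    {
        XTensor a;
        InitTensor2D(&a, 2, 3, X_FLOAT);
        a.SetZeroAll();

        XTensor ai = a.Int();        // X_INT copy of a
        XTensor af = ai.Float();     // back to X_FLOAT
        (void)af;

        XTensor s;
        InitTensor0D(&s, X_FLOAT);
        s.Set0D(1.5F);
        DTYPE v = s.Get0D();         // 1.5F
        (void)v;
    }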
...@@ -37,7 +37,6 @@ ...@@ -37,7 +37,6 @@
#include "arithmetic/Multiply.h" #include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h" #include "arithmetic/MultiplyDim.h"
#include "arithmetic/Sub.h" #include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h" #include "arithmetic/Sum.h"
#include "arithmetic/SumDim.h" #include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h" #include "arithmetic/XTensorBLAS.h"
......
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
#include "../../XName.h" #include "../../XName.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../shape/IsSameShaped.h" #include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "../math/ScaleAndShift.h"
#include "Div.h" #include "Div.h"
#include "Div.cuh" #include "Div.cuh"
#include "DivDim.h" #include "DivDim.h"
...@@ -127,7 +129,7 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le ...@@ -127,7 +129,7 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
element-wise division of two tensors (do it on site) element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i) a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item where i is the index of the item
>> a - tensor a (where keep the result) >> a - tensor a (where keep the result)
...@@ -144,7 +146,7 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim) ...@@ -144,7 +146,7 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
element-wise division of two tensors (do it on site) element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i) a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item where i is the index of the item
>> a - tensor a (where keep the result) >> a - tensor a (where keep the result)
...@@ -152,45 +154,35 @@ where i is the index of the item ...@@ -152,45 +154,35 @@ where i is the index of the item
>> alpha - the coefficient >> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
*/ */
void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim) void DivMe(XTensor & a, const XTensor & b, DTYPE alpha, int leadingDim)
{ {
_Div(&a, &b, &a, alpha, leadingDim); if (b.order == 0){
} DTYPE scale = 1.0F / b.Get0D() + alpha;
/* _ScaleAndShift(&a, &a, scale, 0.0F);
return a dimension if the division is performed as DivDim (in more details in DivDim.h)
>> a - a tensor
>> b - another tensor for division
*/
int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
} }
else {
int n = GetBroadcastDimIndex(a, b);
if(hitCount == 1) if (n == -1) {
return hitDim; CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &a, alpha, leadingDim);
}
else if (n >= 0 && n < a.order)
/* call _DivDim function */
_DivDim(&a, &b, &a, n, alpha);
else else
return -1; ShowNTErrors("Something is wrong!");
}
} }
/* /*
element-wise division of two tensors (return an XTensor structure) element-wise division of two tensors (return an XTensor structure)
make a new tensor c to keep the result and return it make a new tensor c to keep the result and return it
c(i) = a(i)*b(i) c(i) = a(i)/b(i)
where i is the index of the item where i is the index of the item
>> a - tensor a >> a - tensor a
...@@ -199,12 +191,18 @@ where i is the index of the item ...@@ -199,12 +191,18 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors << return - the product of the tensors
*/ */
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) XTensor Div(const XTensor & a, const XTensor & b, int leadingDim)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetDivDimIndex(a, b); if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
ScaleAndShift(a, c, scale, 0.0F);
}
else {
DTYPE alpha = 0.0F;
int n = GetBroadcastDimIndex(a, b);
if(n == -1){ if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
...@@ -215,8 +213,6 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) ...@@ -215,8 +213,6 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
/* tensor connections */ /* tensor connections */
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIV); XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
} }
} }
else if(n >= 0 && n < a.order){ else if(n >= 0 && n < a.order){
...@@ -227,12 +223,12 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) ...@@ -227,12 +223,12 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM); XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
} }
} }
else{ else{
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
return c; return c;
} }
...@@ -249,25 +245,36 @@ where i is the index of the item ...@@ -249,25 +245,36 @@ where i is the index of the item
>> alpha - the coefficient >> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
*/ */
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim) void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int leadingDim)
{ {
if (!c.isInit || !IsSameShaped(a, c)) { if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a); InitTensorV2(&c, &a);
} }
int n = GetDivDimIndex(a, b); if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) { if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */ /* call _Div function */
_Div(&a, &b, &c, 0, leadingDim); _Div(&a, &b, &c, alpha, leadingDim);
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV); XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
} }
} }
else if (n >= 0 && n < a.order) { else if (n >= 0 && n < a.order) {
...@@ -278,13 +285,12 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin ...@@ -278,13 +285,12 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM); XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
} }
} }
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it ...@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)/b(i) c(i) = a(i)/b(i)
where i is the index of the element where i is the index of the element
*/ */
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Div(const XTensor &a, const XTensor &b, int leadingDim = 0);
/* /*
element-wise division of two tensors: element-wise division of two tensors:
......
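Note: a minimal sketch of the new scalar branch of Div shown above: when b is a 0-order tensor, the call reduces to a ScaleAndShift by 1/b (include paths are assumptions):

    #include "XTensor.h"
    #include "core/arithmetic/Div.h"     // path is an assumption
    using namespace nts;

    void DivByScalarDemo()
    {
        XTensor a, s;
        InitTensor1D(&a, 4, X_FLOAT);
        InitTensor0D(&s, X_FLOAT);

        for (int i = 0; i < 4; i++)
            a.Set1D(2.0F * (i + 1), i);  // a = {2, 4, 6, 8}
        s.Set0D(2.0F);

        XTensor c = Div(a, s);           // c = {1, 2, 3, 4}
        (void)c;
    }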
...@@ -22,8 +22,8 @@ ...@@ -22,8 +22,8 @@
*/ */
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XTensor.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -39,7 +39,7 @@ c = a - b * \beta ...@@ -39,7 +39,7 @@ c = a - b * \beta
>> alpha - value >> alpha - value
*/ */
__global__ __global__
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha) void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
......
...@@ -159,6 +159,10 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -159,6 +159,10 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"The code must be run on the same GPU!"); "The code must be run on the same GPU!");
int devIDBackup; int devIDBackup;
if (beta == 0)
c->SetZeroAll();
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID); cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
......
...@@ -156,6 +156,9 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -156,6 +156,9 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL) if (stream != NULL)
cublasSetStream(*handle, stream->stream); cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) { if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
......
...@@ -54,6 +54,9 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -54,6 +54,9 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int aColNum = am; int aColNum = am;
int bColNum = bm; int bColNum = bm;
if (beta == 0)
c->SetZeroAll();
/* a * b */ /* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) { if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm, RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm,
......
...@@ -118,6 +118,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -118,6 +118,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSize[i]; blockNum *= a->dimSize[i];
} }
if (beta == 0)
c->SetZeroAll();
int devIDBackup = 0; int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
......
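Note: presumably the motivation for the beta == 0 guards added in the matrix-multiplication hunks above: with IEEE floats, 0 * NaN is still NaN, so accumulating c = a*b + beta*c into an uninitialized c can poison the result unless c is cleared first. A tiny standalone illustration:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        float c    = std::nanf("");    // stands in for an uninitialized cell of c
        float ab   = 6.0f;             // the a*b contribution for that cell
        float beta = 0.0f;
        printf("%f\n", ab + beta * c); // prints nan, hence SetZeroAll() when beta == 0
        return 0;
    }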
...@@ -27,36 +27,6 @@ ...@@ -27,36 +27,6 @@
#include "Sum.h" #include "Sum.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
*/
int GetSumIndex(const XTensor &a, const XTensor &b)
{
if (a.order < b.order)
return -1;
if (IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
for (int i = 0; i < b.order; i++) {
if (b.dimSize[b.order - 1 - i] == 1)
continue;
else if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]) {
hitCount++;
hitDim = a.order - b.order + i;
}
}
if (hitCount == 1)
return hitDim;
else
return -1;
}
/* /*
operation c = x * w + b MulAndShift operation c = x * w + b MulAndShift
>> x - tensor x >> x - tensor x
...@@ -99,7 +69,10 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b, ...@@ -99,7 +69,10 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
XTensor c(tmp); XTensor c(tmp);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetSumIndex(tmp, b); if (b.order == 0)
ScaleAndShift(*tmp, c, 1.0F, b.Get0D());
else {
int n = GetBroadcastDimIndex(tmp, b);
if (n == -1) { if (n == -1) {
/* call _Sum function */ /* call _Sum function */
...@@ -107,23 +80,22 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b, ...@@ -107,23 +80,22 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
else if (n >= 0 && n < tmp->order) { else if (n >= 0 && n < tmp->order) {
/* call _SumDim function */ /* call _SumDim function */
_SumDim(tmp, &b, &c, n); _SumDim(tmp, &b, &c, n);
} }
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
/* tensor connections */ /* tensor connections */
if (w.enableGrad && b.enableGrad) { if (w.enableGrad && b.enableGrad) {
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT); XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, X_NOTRANS); XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS); XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
}
} }
/* destroy variables */ /* destroy variables */
...@@ -174,7 +146,7 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA, ...@@ -174,7 +146,7 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
XTensor c(tmp); XTensor c(tmp);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetSumIndex(tmp, b); int n = GetBroadcastDimIndex(tmp, b);
if (n == -1) { if (n == -1) {
/* call _Sum function */ /* call _Sum function */
......
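Note: a sketch of the new 0-order bias path in MulAndShift, where the bias is folded in via ScaleAndShift(*tmp, c, 1.0F, b.Get0D()); the trailing parameters of MulAndShift are assumed to keep their defaults, and the include path is an assumption:

    #include "XTensor.h"
    #include "core/arithmetic/MulAndShift.h"   // path is an assumption
    using namespace nts;

    void MulAndShiftScalarBias()
    {
        XTensor x, w, b;
        InitTensor2D(&x, 2, 3, X_FLOAT);
        InitTensor2D(&w, 3, 4, X_FLOAT);
        InitTensor0D(&b, X_FLOAT);

        x.SetZeroAll();
        w.SetZeroAll();
        b.Set0D(1.0F);

        XTensor y = MulAndShift(x, w, b);      // x * w shifted by the scalar bias -> all ones
        (void)y;
    }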
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
#include "../../XName.h" #include "../../XName.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../shape/IsSameShaped.h" #include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "../math/ScaleAndShift.h"
#include "Multiply.h" #include "Multiply.h"
#include "Multiply.cuh" #include "Multiply.cuh"
#include "MultiplyDim.h" #include "MultiplyDim.h"
...@@ -155,36 +157,28 @@ where i is the index of the item ...@@ -155,36 +157,28 @@ where i is the index of the item
*/ */
void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim) void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{ {
_Multiply(&a, &b, &a, alpha, leadingDim); if (b.order == 0){
} DTYPE scale = b.Get0D() + alpha;
/* _ScaleAndShift(&a, &a, scale, 0.0F);
return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h) }
>> a - a tensor else {
>> b - another tensor for multiplication int n = GetBroadcastDimIndex(a, b);
*/
int GetMultiplyDimIndex(const XTensor &a, const XTensor &b) if (n == -1) {
{ CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
if(a.order < b.order)
return -1; /* call _Multiply function */
if(IsSameShaped(a, b)) _Multiply(&a, &b, &a, alpha, leadingDim);
return -1; }
else if (n >= 0 && n < a.order) {
int hitCount = 0; /* call _MultiplyDim function */
int hitDim = -1; _MultiplyDim(&a, &b, &a, n, alpha);
for(int i = 0; i < b.order; i++){ }
if(b.dimSize[b.order - 1 - i] == 1) else {
continue; ShowNTErrors("Something is wrong!");
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){ }
hitCount++; }
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
} }
/* /*
...@@ -199,25 +193,28 @@ where i is the index of the item ...@@ -199,25 +193,28 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors << return - the product of the tensors
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetMultiplyDimIndex(a, b); if (b.order == 0){
DTYPE scale = b.Get0D();
ScaleAndShift(a, c, scale, 0.0F);
}
else {
DTYPE alpha = 0.0F;
int n = GetBroadcastDimIndex(a, b);
if(n == -1){ if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */ /* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim); _Multiply(&a, &b, &c, alpha, leadingDim);
/* tensor connections */ /* tensor connections */
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
} }
} }
else if(n >= 0 && n < a.order){ else if(n >= 0 && n < a.order){
...@@ -228,12 +225,12 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim ...@@ -228,12 +225,12 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
} }
} }
else{ else{
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
return c; return c;
} }
...@@ -256,19 +253,30 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l ...@@ -256,19 +253,30 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
InitTensorV2(&c, &a); InitTensorV2(&c, &a);
} }
int n = GetMultiplyDimIndex(a, b); if (b.order == 0){
DTYPE scale = b.Get0D();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) { if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */ /* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim); _Multiply(&a, &b, &c, alpha, leadingDim);
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
} }
} }
else if (n >= 0 && n < a.order) { else if (n >= 0 && n < a.order) {
...@@ -279,13 +287,12 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l ...@@ -279,13 +287,12 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
} }
} }
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it ...@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)*b(i) c(i) = a(i)*b(i)
where i is the index of the element where i is the index of the element
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
/* /*
element-wise product of two tensors: element-wise product of two tensors:
......
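Note: the matching sketch for Multiply with a 0-order right-hand side, which now routes through ScaleAndShift(a, c, b.Get0D()) as shown above (include path is an assumption):

    #include "XTensor.h"
    #include "core/arithmetic/Multiply.h"   // path is an assumption
    using namespace nts;

    void MultiplyByScalarDemo()
    {
        XTensor a, s;
        InitTensor1D(&a, 3, X_FLOAT);
        InitTensor0D(&s, X_FLOAT);

        for (int i = 0; i < 3; i++)
            a.Set1D((DTYPE)(i + 1), i);   // a = {1, 2, 3}
        s.Set0D(10.0F);

        XTensor c = Multiply(a, s);       // c = {10, 20, 30}
        (void)c;
    }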
...@@ -233,7 +233,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE ...@@ -233,7 +233,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
{ {
CheckNTErrors(a->order == b->order, "Wrong tensor orders!"); CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!"); CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!"); CheckNTErrors(a->order >= 0, "TODO!");
int order = a->order; int order = a->order;
int count = 0; int count = 0;
......
...@@ -16,16 +16,16 @@ ...@@ -16,16 +16,16 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01 * $Created by: Li Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-02-11
* Paper review rebuttal of ACL2020 will start at this Thursday. So nervous :(
*/ */
#include "../../XTensor.h"
#include "../../XName.h" #include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h" #include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "SumDim.h"
#include "../math/ScaleAndShift.h"
#include "Sub.h" #include "Sub.h"
#include "Sub.cuh"
#include "SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -39,80 +39,7 @@ tensor subtraction c = a - b * \beta ...@@ -39,80 +39,7 @@ tensor subtraction c = a - b * \beta
*/ */
void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
CheckNTErrors(a && b && c, "Empty tensor input!"); _Sum(a, b, c, -beta);
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
CheckDev(a->devID, b->devID);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, b->devID);
#endif
if ((a->devID < 0 && b->devID >= 0) ||
(a->devID >= 0 && b->devID < 0) ||
(a->devID >= 0 && b->devID >= 0 && a->devID != b->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaSub(a, b, c, beta);
}
else
_CudaSub(a, b, c, beta);
#endif
}
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
DTYPE * ap = (DTYPE*)a->data;
DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] - bp[i] * beta;
cp[i + 1] = ap[i + 1] - bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] - bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] - bp[i + 3] * beta;
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] - bp[i] * beta;
cp[i + 1] = ap[i + 1] - bp[i + 1] * beta;
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] - bp[i] * beta;
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
} }
/* /*
...@@ -136,38 +63,24 @@ keep the result in the tensor a and return nothing ...@@ -136,38 +63,24 @@ keep the result in the tensor a and return nothing
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void SubMe(XTensor& a, const XTensor& b, DTYPE beta) void SubMe(XTensor & a, const XTensor & b, DTYPE beta)
{
_Sub(&a, &b, &a, beta);
}
/*
return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
>> a - a tensor
>> b - another tensor for subtraction
*/
int GetSubDimIndex(const XTensor &a, const XTensor &b)
{ {
if(a.order < b.order) if (b.order == 0){
return -1; DTYPE shift = -(b.Get0D() * beta);
if(IsSameShaped(a, b)) _ScaleAndShift(&a, &a, 1.0F, shift);
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
} }
else {
int n = GetBroadcastDimIndex(a, b);
if(hitCount == 1) if (n == -1)
return hitDim; /* call _Sub function */
_Sub(&a, &b, &a, beta);
else if (n >= 0 && n < a.order)
/* call _SumDim function to do the SubDim operation */
_SumDim(&a, &b, &a, n, -beta);
else else
return -1; ShowNTErrors("Something is wrong!");
}
} }
/* /*
...@@ -179,12 +92,17 @@ make a new tensor c to keep the result and return it ...@@ -179,12 +92,17 @@ make a new tensor c to keep the result and return it
>> beta - the scaling factor >> beta - the scaling factor
<< return - the result of tensor subtraction << return - the result of tensor subtraction
*/ */
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) XTensor Sub(const XTensor & a, const XTensor & b, DTYPE beta)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetSubDimIndex(a, b); if (b.order == 0){
DTYPE shift = -(b.Get0D() * beta);
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if(n == -1){ if(n == -1){
/* call _Sub function */ /* call _Sub function */
...@@ -197,8 +115,8 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -197,8 +115,8 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
} }
} }
else if(n >= 0 && n < a.order){ else if(n >= 0 && n < a.order){
/* call _SubDim function */ /* call _SumDim function to do the SubDim operation */
_SubDim(&a, &b, &c, n, beta); _SumDim(&a, &b, &c, n, -beta);
/* tensor connections */ /* tensor connections */
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
...@@ -210,7 +128,7 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -210,7 +128,7 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
else{ else{
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
return c; return c;
} }
...@@ -222,13 +140,18 @@ tensor subtraction c = a - b * \beta ...@@ -222,13 +140,18 @@ tensor subtraction c = a - b * \beta
>> c - where we put a-b*\beta. we save it in a if c is NULL >> c - where we put a-b*\beta. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) void Sub(const XTensor & a, const XTensor & b, XTensor & c, DTYPE beta)
{ {
if (!c.isInit || !IsSameShaped(a, c)) { if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a); InitTensorV2(&c, &a);
} }
int n = GetSubDimIndex(a, b); if (b.order == 0){
DTYPE shift = -(b.Get0D() * beta);
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) { if (n == -1) {
/* call _Sub function */ /* call _Sub function */
...@@ -241,8 +164,8 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) ...@@ -241,8 +164,8 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
} }
} }
else if (n >= 0 && n < a.order) { else if (n >= 0 && n < a.order) {
/* call _SubDim function */ /* call _SumDim function to do the SubDim operation */
_SubDim(&a, &b, &c, n, beta); _SumDim(&a, &b, &c, n, -beta);
if (a.enableGrad && b.enableGrad) { if (a.enableGrad && b.enableGrad) {
/* tensor connections */ /* tensor connections */
...@@ -254,6 +177,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) ...@@ -254,6 +177,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
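To make the new dispatch in Sub concrete, here is a minimal usage sketch. It is illustrative only and not part of this commit; the shapes, the values, and the Set0D call are assumptions, and the initialization helpers are used with their default device arguments.
#include "Sub.h"
using namespace nts;
void SubUsageSketch()
{
    XTensor a, b, s;
    InitTensor2D(&a, 2, 3, X_FLOAT);   /* a: 2 x 3 matrix                          */
    InitTensor1D(&b, 3, X_FLOAT);      /* b: length-3 vector, broadcast over dim 1 */
    InitTensor0D(&s, X_FLOAT);         /* s: order-0 scalar                        */
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);
    s.Set0D(0.5F);                     /* assumed setter, counterpart of Get0D     */
    XTensor c1 = Sub(a, b);            /* broadcast path: routes to _SumDim(&a, &b, &c1, 1, -1.0F) */
    XTensor c2 = Sub(a, s, 2.0F);      /* scalar path: every entry shifted by -(0.5 * 2)           */
    XTensor c3 = Sub(a, a);            /* same shape: plain _Sub, c3 is all zeros                  */
}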
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
subtraction of data arrays (CUDA Kernel)
c = a - b * \beta
>> a - A matrix
>> b - another matrix
>> c - where we put a-b
>> size - the size of a/b/c
>> beta - the coefficient
*/
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] - b[i] * beta;
}
/*
tensor subtraction c = a - b * \beta (cuda version)
>> a - a tensor
>> b - another tensor
>> c - where we put a-b*\beta.
>> beta - the scaling factor
*/
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in subtraction!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Unmatched tensors in subtraction!");
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The tensors must be on the same device!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in subtraction!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
/* subtraction over arrays
tensor subtraction c = a - b * \beta (cuda version) with an input handle
>> devID - device ID (MUST >= 0)
>> handle - cuda handle
>> a - an array
>> b - another array
>> c - where we put a-b
>> size - size of the array
>> beta - the coefficient
*/
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
if (size == 0)
return;
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty arrays in subtraction!");
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if (c == a) {
/* axpy computes a += alpha * b, so the subtraction needs a negated coefficient */
DTYPE alpha = -beta;
#ifdef DOUBELPRICSION
cublasDaxpy(*handle, size, &alpha, b, 1, a, 1);
#else
cublasSaxpy(*handle, size, &alpha, b, 1, a, 1);
#endif
}
else {
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(devID, size, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB<<<blocks, threads>>>((DTYPE*)a, (DTYPE*)b, (DTYPE*)c, size, beta);
}
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#ifndef __SUB_CUH__
#define __SUB_CUH__
#include "Sub.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* subtraction of data arrays (CUDA Kernel) */
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) */
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) with an input handle */
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SUB_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include <math.h>
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if (beta == 0) {
_CopyValues(a, c);
return;
}
if (_IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSubDim(a, b, c, n, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE) {
int num = a->unitNum;
if (stride > 1) {
for (int i = 0, j = 0; i < num; i += stride, j++) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
DTYPE * cp = (DTYPE*)c->data + i;
for (int k = 0; k < stride; k++)
cp[k] = ap[k] - bv;
}
}
else if (stride == 1) {
DTYPE * bp = (DTYPE*)b->data;
for (int i = 0; i < num; i += blockSize) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if (beta == 1.0F) {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j];
}
else {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j] * beta;
}
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
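A worked example of the stride/blockSize/blockNum decomposition used in the CPU branch above, with an assumed shape that is not taken from the commit:
/* Assume a->dimSize = {2, 3, 4} and n = 1, so b holds 3 values.
   stride    = dimSize[2] = 4   (elements to the right of dimension n)
   blockSize = dimSize[1] = 3   (the broadcast dimension itself)
   blockNum  = dimSize[0] = 2   (elements to the left of dimension n)
   The stride > 1 branch then walks num = 24 elements in steps of 4:
   at step j, the 4 contiguous values a[j / 3][j % 3][0..3] all get
   b[j % 3] * beta subtracted, which is exactly c = a - b * beta
   broadcast along dimension 1. */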
/*
tensor subtraction (do it on site)
keep the result in the input tensor and return nothing
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
{
_SubDim(a, b, a, n, beta);
}
/*
tensor subtraction (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
<< return - the result tensor by tensor subtraction
*/
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
return c;
}
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "SubDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor subtraction of a tensor and a row vector
c = a - b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
/*
tensor subtraction (cuda version)
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_CUH__
#define __SUBDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting (cuda version) */
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_H__
#define __SUBDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
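A short usage sketch for the SubDim interface declared above, subtracting a per-column bias from a matrix. It is illustrative only; the shapes and values are assumptions.
#include "SubDim.h"
using namespace nts;
void SubDimUsageSketch()
{
    XTensor x, bias;
    InitTensor2D(&x, 4, 5, X_FLOAT);   /* 4 x 5 matrix         */
    InitTensor1D(&bias, 5, X_FLOAT);   /* one value per column */
    x.SetDataRand(0.0F, 1.0F);
    bias.SetDataRand(0.0F, 1.0F);
    /* y[i][j] = x[i][j] - bias[j]; n = 1 selects the column dimension */
    XTensor y = SubDim(x, bias, 1);
    /* in-place variant: x[i][j] -= 0.5 * bias[j] */
    _SubDim(&x, &bias, 1, 0.5F);
}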
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "../../XBLAS.h" #include "../../XBLAS.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h" #include "../shape/IsSameShaped.h"
#include "../math/ScaleAndShift.h"
#include "Sum.h" #include "Sum.h"
#include "Sum.cuh" #include "Sum.cuh"
#include "SumDim.h" #include "SumDim.h"
...@@ -93,7 +94,38 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -93,7 +94,38 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
AXPY(a->unitNum, beta, bp, 1, cp, 1); AXPY(a->unitNum, beta, bp, 1, cp, 1);
return; return;
} }
#else
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] + bp[i] * beta;
}
}
#endif #endif
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int * ap = (int*)a->data;
int * bp = (int*)b->data;
int * cp = (int*)c->data;
/* unrolling */ /* unrolling */
int num = a->unitNum; int num = a->unitNum;
if (num % 4 == 0) { if (num % 4 == 0) {
...@@ -149,38 +181,58 @@ keep the result in the tensor a and return nothing ...@@ -149,38 +181,58 @@ keep the result in the tensor a and return nothing
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void SumMe(XTensor& a, const XTensor& b, DTYPE beta) void SumMe(XTensor & a, const XTensor & b, DTYPE beta)
{ {
if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
_ScaleAndShift(&a, &a, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1)
/* call _Sum function */
_Sum(&a, &b, &a, beta); _Sum(&a, &b, &a, beta);
else if (n >= 0 && n < a.order)
/* call _SumDim function */
_SumDim(&a, &b, &a, n, beta);
else
ShowNTErrors("Something is wrong!");
}
} }
/* /*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h) return a dimension if the operation is performed as broadcast(e.g. SumDim function)
>> a - a tensor >> a - a tensor
>> b - another tensor for sum >> b - another tensor for operation
*/ */
int GetSumDimIndex(const XTensor &a, const XTensor &b) int GetBroadcastDimIndex(const XTensor & a, const XTensor & b)
{ {
if(a.order < b.order) if(a.order < b.order)
return -1; return -1;
if(IsSameShaped(a, b)) if(IsSameShaped(a, b))
return -1; return -1;
int hitCount = 0;
int hitDim = -1; int hitDim = -1;
bool isHit = false;
for(int i = 0; i < b.order; i++){ for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1) if(b.dimSize[b.order - 1 - i] == 1)
continue; continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){ else {
hitCount++; if (isHit == true)
hitDim = a.order - b.order + i; return -1;
else
isHit = true;
for (int j = 0; j < a.order; j++){
if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - j]){
hitDim = a.order - 1 - j;
break;
}
}
} }
} }
if(hitCount == 1)
return hitDim; return hitDim;
else
return -1;
} }
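A few assumed shape pairs and the index the new GetBroadcastDimIndex reports (illustration only, not from the commit):
/* a-shape        b-shape        returned n
   (8, 16, 32)    (1, 16, 1)      1   exactly one non-1 dimension in b, matched to a's dim 1
   (8, 16, 32)    (32)            2   b is a vector matching a's last dimension
   (8, 16, 32)    (8, 16, 32)    -1   same shape, handled by plain _Sum / _Sub
   (8, 16, 32)    (16, 32)       -1   two non-1 dimensions, not a single-dim broadcast */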
/* /*
...@@ -192,12 +244,17 @@ make a new tensor c to keep the result and return it ...@@ -192,12 +244,17 @@ make a new tensor c to keep the result and return it
>> beta - the scaling factor >> beta - the scaling factor
<< return - the result of tensor summation << return - the result of tensor summation
*/ */
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) XTensor Sum(const XTensor & a, const XTensor & b, DTYPE beta)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
int n = GetSumDimIndex(a, b); if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if(n == -1){ if(n == -1){
/* call _Sum function */ /* call _Sum function */
...@@ -223,7 +280,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -223,7 +280,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
else{ else{
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
return c; return c;
} }
...@@ -234,13 +291,18 @@ tensor summation c = a + b * \beta ...@@ -234,13 +291,18 @@ tensor summation c = a + b * \beta
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) void Sum(const XTensor & a, const XTensor & b, XTensor & c, DTYPE beta)
{ {
if (!c.isInit || !IsSameShaped(a, c)) { if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a); InitTensorV2(&c, &a);
} }
int n = GetSumDimIndex(a, b); if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) { if (n == -1) {
/* call _Sum function */ /* call _Sum function */
...@@ -266,6 +328,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta) ...@@ -266,6 +328,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
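The summation side mirrors the rework of Sub: a 0-d operand is folded into a shift, while a broadcastable operand routes through _SumDim. A minimal equivalence sketch follows; the caller-supplied tensors are assumptions.
#include "Sum.h"
#include "../math/ScaleAndShift.h"
using namespace nts;
/* s is assumed to be an order-0 (scalar) tensor */
void SumScalarSketch(const XTensor & a, const XTensor & s, DTYPE beta)
{
    XTensor c1 = Sum(a, s, beta);                            /* new scalar path          */
    XTensor c2 = ScaleAndShift(a, 1.0F, s.Get0D() * beta);   /* computes the same values */
}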
...@@ -45,6 +45,15 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta) ...@@ -45,6 +45,15 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
c[i] = a[i] + b[i] * beta; c[i] = a[i] + b[i] * beta;
} }
__global__
void KernelADD(int * a, int * b, int * c, int size, int beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * beta;
}
/* /*
tensor summation c = a + b * \beta (cuda version) tensor summation c = a + b * \beta (cuda version)
>> a - a tensor >> a - a tensor
...@@ -100,6 +109,17 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -100,6 +109,17 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta); KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
} }
} }
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelADD << <blocks, threads >> >((int*)a->data, (int*)b->data, (int*)c->data, a->unitNum, (int)beta);
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -26,6 +26,9 @@ ...@@ -26,6 +26,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* return a dimension if the operation is performed as broadcast(e.g. SumDim function) */
int GetBroadcastDimIndex(const XTensor & a, const XTensor & b);
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......
...@@ -220,7 +220,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta ...@@ -220,7 +220,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
{ {
CheckNTErrors(a->order == b->order, "Wrong tensor orders!"); CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!"); CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!"); CheckNTErrors(a->order >= 0, "TODO!");
int order = a->order; int order = a->order;
int count = 0; int count = 0;
......
...@@ -45,10 +45,10 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper) ...@@ -45,10 +45,10 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
#endif #endif
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!"); CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data; if (a->dataType == DEFAULT_DTYPE) {
DTYPE * db = (DTYPE*)b->data; DTYPE* d = (DTYPE*)a->data;
DTYPE* db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) { for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper) if (d[i] > upper)
db[i] = upper; db[i] = upper;
...@@ -57,6 +57,21 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper) ...@@ -57,6 +57,21 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
else else
db[i] = d[i]; db[i] = d[i];
} }
}
else if (a->dataType == X_INT) {
int* d = (int*)a->data;
int* db = (int*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
}
}
else
ShowNTErrors("TODO!");
} }
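An assumed example of the new integer path in _Clip (the values are not from the commit):
/* a = {-3, 0, 7, 12} stored as X_INT, lower = 0, upper = 10:
   _Clip(&a, &b, 0, 10) now fills b with {0, 0, 7, 10};
   before this change the call was rejected by the DEFAULT_DTYPE check. */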
/* /*
......
...@@ -36,8 +36,9 @@ set each entry to its clip value (CUDA Kernel) ...@@ -36,8 +36,9 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border >> upper - the upper border
>> size - size of the data array >> size - size of the data array
*/ */
template <class T>
__global__ __global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size) void KernelClip(T * a, T * b, T lower, T upper, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -90,10 +91,16 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper) ...@@ -90,10 +91,16 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum); KernelClip<DTYPE> << <blocks, threads >> >((DTYPE *)a->data, (DTYPE *)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip<int> << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum); ShowNTErrors("TODO!");
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */ /* set each entry to its clip value (CUDA Kernel) */
__global__ template <class T> __global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size); void KernelClip(T * a, T * b, T lower, T upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/ /* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__ __global__
......
...@@ -47,11 +47,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -47,11 +47,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
return; return;
} }
#endif #endif
if (a->dataType == DEFAULT_DTYPE) {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
/* sparse tensor */ /* sparse tensor */
if(a->isSparse){ if(a->isSparse) {
int num = a->unitNumNonZero; int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int); char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int); char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
...@@ -66,7 +64,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -66,7 +64,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
} }
} }
/* dense tensor */ /* dense tensor */
else{ else {
DTYPE * va = (DTYPE*)a->data; DTYPE * va = (DTYPE*)a->data;
DTYPE * vb = (DTYPE*)b->data; DTYPE * vb = (DTYPE*)b->data;
for(int i = 0; i < b->unitNum; i++){ for(int i = 0; i < b->unitNum; i++){
...@@ -75,6 +73,36 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -75,6 +73,36 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
vb++; vb++;
} }
} }
}
else if (a->dataType == X_INT) {
/* sparse tensor */
if(a->isSparse) {
int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
char * db = (char*)b->data + sizeof(int);
char * fb = db + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
for(int i = 0; i < num; i++){
int * v = (int*)f;
int * vb = (int*)fb;
*vb = *v * scale + shift;
f += sizeof(int) + sizeof(int);
fb += sizeof(int) + sizeof(int);
}
}
/* dense tensor */
else {
int * va = (int*)a->data;
int * vb = (int*)b->data;
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
va++;
vb++;
}
}
}
else
ShowNTErrors("TODO!");
} }
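An assumed example of the new integer path in _ScaleAndShift (the values are not from the commit):
/* a = {1, 2, 3} stored as X_INT, scale = 2.0F, shift = 5.0F:
   _ScaleAndShift(&a, &b, 2.0F, 5.0F) fills b with {7, 9, 11}.
   The product *va * scale is computed in floating point and truncated
   back to int on assignment, so fractional scales round toward zero. */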
/* /*
......
...@@ -34,9 +34,9 @@ scale and shift all tensor entires b = a * scale + shift (CUDA Kernel) ...@@ -34,9 +34,9 @@ scale and shift all tensor entires b = a * scale + shift (CUDA Kernel)
>> scale - how much we want to scale it >> scale - how much we want to scale it
>> shift - how much we want to shift it >> shift - how much we want to shift it
*/ */
template<bool isUnitScale, bool isZeroShift> template<class T, bool isUnitScale, bool isZeroShift>
__global__ __global__
void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift) void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -108,13 +108,26 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift ...@@ -108,13 +108,26 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
if(a->dataType == DEFAULT_DTYPE){ if(a->dataType == DEFAULT_DTYPE){
if(scale == 1.0F && shift == 0) if(scale == 1.0F && shift == 0)
KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift); KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if (scale == 1.0F && shift != 0) else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift); KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if(scale != 1.0F && shift == 0) else if(scale != 1.0F && shift == 0)
KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift); KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else else
KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift); KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
}
else if (a->dataType == X_INT) {
int scale2 = int(scale);
int shift2 = int(shift);
if (scale == 1.0F && shift == 0)
KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale != 1.0F && shift == 0)
KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else
KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
} }
else if(a->dataType == X_FLOAT16){ else if(a->dataType == X_FLOAT16){
unsigned short scale2 = FloatToFloat16(scale); unsigned short scale2 = FloatToFloat16(scale);
......
...@@ -146,7 +146,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, ...@@ -146,7 +146,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions"); CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions");
} }
else { else {
CheckNTErrors(t->GetDim(i) == indexSize * copyNum, "Unmatched dimensions"); CheckNTErrors(t->GetDim(i) >= indexSize * copyNum, "Unmatched dimensions");
} }
} }
......
...@@ -43,12 +43,43 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -43,12 +43,43 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!"); CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!"); CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->dataType == X_INT), "The index tensor should be INT type!");
CheckNTErrors((srcIndex->order == s->order), "index's order should be the same with source's");
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) { if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim); _CudaGather(s, t, srcIndex, dim);
return; return;
} }
#endif #endif
int stride = 1;
int blockNum = 1;
for (int i = dim + 1; i < s->order; ++i)
{
stride *= s->GetDim(i);
}
for (int i = 0; i < dim; ++i)
{
blockNum *= s->GetDim(i);
}
int indexStrideNum = srcIndex->GetDim(dim);
int srcStrideNum = stride * s->GetDim(dim);
int tgtBlockSize = stride * indexStrideNum;
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndexData = (int*)srcIndex->data;
for (int blockIndex = 0; blockIndex < blockNum; ++blockIndex)
{
for (int i = 0; i < indexStrideNum; i++) {
for (int j = 0; j < stride; j++)
{
int sIndex = sIndexData[i * stride + blockIndex * indexStrideNum + j] * stride + blockIndex * srcStrideNum + j;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
int tIndex = i * stride + blockIndex * tgtBlockSize + j;
tData[tIndex] = sData[sIndex];
}
}
}
} }
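A worked example of the dimension-wise gather added above (the shapes and index values are assumptions):
/* s is a 2 x 4 matrix, dim = 1, srcIndex is a 2 x 2 X_INT tensor:
       s        = [[10, 11, 12, 13],      srcIndex = [[3, 0],
                   [20, 21, 22, 23]]                  [1, 2]]
   Then stride = 1, blockNum = 2, indexStrideNum = 2, srcStrideNum = 4, and
       t        = [[13, 10],
                   [21, 22]]
   i.e. row r of t picks, from row r of s, the columns listed in row r of srcIndex. */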
/* /*
...@@ -64,13 +95,14 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -64,13 +95,14 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!"); CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
if (s->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex); _CudaGather(s, t, srcIndex);
return; #else
} ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif #endif
}
else {
int stride = 1; int stride = 1;
int indexSize = 1; int indexSize = 1;
...@@ -83,9 +115,11 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -83,9 +115,11 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) { for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride; int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
for (int j = 0; j < stride; j++) for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j]; tData[i * stride + j] = sData[sIndex + j];
} }
}
} }
/* /*
......
...@@ -77,7 +77,7 @@ gather indexed sub-tensors(cuda version) ...@@ -77,7 +77,7 @@ gather indexed sub-tensors(cuda version)
>> blockNum - block size of data >> blockNum - block size of data
*/ */
__global__ __global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum) void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum, int srcStrideNum)
{ {
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y; int idy = blockDim.y * blockIdx.y + threadIdx.y;
...@@ -90,7 +90,7 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int st ...@@ -90,7 +90,7 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int st
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock; for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size; i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) { i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]]; tData[i] = sData[sIndex[i] * stride + stride * srcStrideNum * blockIndex + offsetInBlock];
} }
} }
...@@ -126,14 +126,30 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -126,14 +126,30 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
int * sIndex = NULL; int * sIndex = NULL;
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
int * sIndexData = (int*)srcIndex->data;
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ? sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize); (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize); XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
} }
else else {
int * sIndexData = new int[sizeof(int) * indexSize];
XMemCopy(sIndexData, -1, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = (int *)srcIndex->data; sIndex = (int *)srcIndex->data;
delete[] sIndexData;
}
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride); KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
...@@ -163,6 +179,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -163,6 +179,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int blockNum = 1; int blockNum = 1;
int indexSize = srcIndex->unitNum; int indexSize = srcIndex->unitNum;
int strideNum = srcIndex->dimSize[dim]; int strideNum = srcIndex->dimSize[dim];
int srcStrideNum = s->dimSize[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
blockNum *= srcIndex->dimSize[i]; blockNum *= srcIndex->dimSize[i];
for (int i = dim + 1; i < srcIndex->order; i++) for (int i = dim + 1; i < srcIndex->order; i++)
...@@ -170,19 +187,33 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim) ...@@ -170,19 +187,33 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int * sIndex = NULL; int * sIndex = NULL;
if (srcIndex->devID < 0) { if (srcIndex->devID < 0) {
int * sIndexData = (int*)srcIndex->data;
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ? sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize); (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize); XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
} }
else else {
int * sIndexData = new int[sizeof(int) * indexSize];
XMemCopy(sIndexData, -1, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = (int *)srcIndex->data; sIndex = (int *)srcIndex->data;
delete[] sIndexData;
}
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum, srcStrideNum);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim) ...@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength); \ vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength); \
} \ } \
for (int j = 1; j < strideNum / 32; j++) { \ for (int j = 1; j < strideNum / 32; j++) { \
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); \ const DTYPE* ptr = (DTYPE*)(ip + j * 4 * vecBufLength); \
vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \ vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \ vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \ vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
...@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim) ...@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
else { \ else { \
/* data is separated */ \ /* data is separated */ \
for(int i = 0; i < blockNum; i++){ \ for(int i = 0; i < blockNum; i++){ \
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ \ for(int j = 0; j < stride / 32; j++){ \
DTYPE * ip = (DTYPE*)input->data + blockSize * i; \ DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
DTYPE * op = (DTYPE*)output->data + stride * i; \ DTYPE * op = (DTYPE*)output->data + stride * i; \
VectorBuffer vecBuf[4]; \ VectorBuffer vecBuf[4]; \
......
...@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim) ...@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
int num = input->dimSize[dim]; int num = input->dimSize[dim];
_ReduceSum(input, output, dim); _ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0); _ScaleAndShiftMe(output, 1.0F/(DTYPE)(num), 0);
} }
/* /*
......
...@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias); vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias);
} }
for(int j = 1; j < strideNum / 32; j++){ for(int j = 1; j < strideNum / 32; j++){
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); const DTYPE* ptr = (DTYPE*)(ip + (j * 4) * vecBufLength);
vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias); vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias); vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias);
vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias); vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias);
...@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{ } else{
//data is separated //data is separated
for(int i = 0; i < blockNum; i++){ for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ for(int j = 0; j < stride / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i; DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i; DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL; DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
...@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} }
VectorBuffer vecBuf[4]; VectorBuffer vecBuf[4];
for(int k = 0; k < 4; k++){ for(int k = 0; k < 4; k++){
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + j * 32 / sizeof(DTYPE)); vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + k * 32 / sizeof(DTYPE));
} }
for(int k = 1; k < strideNum; k++){ for(int k = 1; k < strideNum; k++){
DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength; DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
......
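The offset fixes above in the vectorized reduction loops share the same arithmetic; a small sketch with the sizes the code already uses:
/* Each VectorBuffer holds 32 / sizeof(DTYPE) elements (8 when DTYPE is float),
   and one iteration of the unrolled loop consumes 4 buffers, i.e. 32 floats.
   Iteration j must therefore start at
       ip + j * 4 * vecBufLength          (= ip + 32 * j floats)
   rather than ip + j * vecBufLength, which made successive iterations read
   overlapping blocks: some elements were reduced more than once while the
   tail of the row was never touched. */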
...@@ -16,11 +16,12 @@ ...@@ -16,11 +16,12 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-01-09
*/ */
#include "ReduceSumAll.h" #include "ReduceSumAll.h"
#include "ReduceSum.h" #include "ReduceSum.h"
#include "../../XName.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -42,55 +43,70 @@ int * getDimSize(const XTensor * tensor, int n) ...@@ -42,55 +43,70 @@ int * getDimSize(const XTensor * tensor, int n)
/* /*
sum all the items of the tensor (It should be optimized!) sum all the items of the tensor (It should be optimized!)
>> source - the input tensor >> source - the input tensor
<< return - the total summation << target - the total summation
*/ */
DTYPE _ReduceSumAll(const XTensor * source) void _ReduceSumAll(const XTensor * source, XTensor * target)
{ {
int dims[2] = {1, source->unitNum}; CheckNTErrors((source->devID == target->devID || (source->devID < 0 && target->devID < 0)),
int one = 1; "This code must be run on the same device!");
CheckNTErrors((source && target), "Empty input or output tensors!");
CheckNTErrors((target->order == 0), "Incorrect target tensor sizes!");
CheckNTErrors((target->unitNum == 1), "Illegal dimension to reduce!");
CheckNTErrors((source->dataType == target->dataType), "Unmatched data types!");
XTensor * all = NewTensorBufV2(2, dims, source->dataType, source->denseRatio, source->devID, source->mem); int dims[1] = {source->unitNum};
XTensor * result = NewTensorBufV2(1, &one, source->dataType, 1.0F, source->devID, source->mem);
_CopyValues(source, all); XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_ReduceSum(all, result, 1);
DTYPE r = result->Get1D(0); _CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(result);
DelTensorBuf(all); DelTensorBuf(all);
}
return r; /*
sum all the items of the tensor (It should be optimized!)
/*int order = source->order; >> source - the input tensor
DTYPE summation; << value - the total summation
*/
XTensor * big = NewTensor(source); void _ReduceSumAll(const XTensor * source, DTYPE * value)
_CopyValues(source, big); {
for(int i = order - 1; i >= 0; i--) { int * dimSize = new int[MAX_TENSOR_DIM_NUM];
if(i == 0) float dr = (!source->isSparse) ? 1.0F : source->denseRatio;
big->Reshape(1, big->unitNum); XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
int leadingDim = big->order - 1; /* call _ReduceSum function */
int * dimSize; _ReduceSumAll(source, target);
dimSize = getDimSize(big, leadingDim); *value = target->Get0D();
XTensor * little = NewTensorV2(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
_ReduceSum(big, little, leadingDim); DelTensorBuf(target);
}
delete big; /*
delete dimSize; sum all the items of the tensor
>> source - the input tensor
<< return - the total summation
*/
XTensor ReduceSumAll(const XTensor & source)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source.isSparse) ? 1.0F : source.denseRatio;
XTensor target(0, dimSize, source.dataType, dr, source.devID, source.mem);
target.SetTMPFlag();
big = NewTensor(little); /* call _ReduceSum function */
_CopyValues(little, big); _ReduceSumAll(&source, &target);
delete little; /* tensor connection */
if (source.enableGrad) {
XLink::MakeLink(&source, NULL, &target, REDUCE_REDUCESUMALL);
} }
summation = big->Get1D(0);
delete big;
return summation;*/ /* destroy variables */
delete[] dimSize;
return target;
} }
/* /*
...@@ -98,9 +114,11 @@ sum all the items of the tensor ...@@ -98,9 +114,11 @@ sum all the items of the tensor
>> source - the input tensor >> source - the input tensor
<< return - the total summation << return - the total summation
*/ */
DTYPE ReduceSumAll(const XTensor & source) DTYPE ReduceSumAllValue(const XTensor & source)
{ {
return _ReduceSumAll(&source); XTensor target;
target = ReduceSumAll(source);
return target.Get0D();
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-01-09
*/ */
...@@ -28,10 +28,16 @@ ...@@ -28,10 +28,16 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */ /* sum all the items of the tensor */
DTYPE _ReduceSumAll(const XTensor * source); void _ReduceSumAll(const XTensor * source, XTensor * target);
/* sum all the items of the tensor */ /* sum all the items of the tensor */
DTYPE ReduceSumAll(const XTensor & source); void _ReduceSumAll(const XTensor * source, DTYPE * target);
/* sum all the items of the tensor */
XTensor ReduceSumAll(const XTensor & source);
/* sum all the items of the tensor */
DTYPE ReduceSumAllValue(const XTensor & source);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
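A short sketch of the reworked ReduceSumAll interface declared above; the loss tensor passed in is an assumption.
#include "ReduceSumAll.h"
using namespace nts;
void ReduceSumAllSketch(const XTensor & loss)
{
    /* tensor-valued form: an order-0 result that can stay in the
       computation graph when loss.enableGrad is set */
    XTensor total = ReduceSumAll(loss);
    /* scalar convenience form for call sites that only need the number */
    DTYPE value = ReduceSumAllValue(loss);
}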
...@@ -95,7 +95,7 @@ XTensor Stack(const TensorList &smalls, int dim) ...@@ -95,7 +95,7 @@ XTensor Stack(const TensorList &smalls, int dim)
if (i < dim) if (i < dim)
dimSize[i] = tensor->GetDim(i); dimSize[i] = tensor->GetDim(i);
else if (i > dim) else if (i > dim)
dimSize[i] = tensor->GetDim(i-1); dimSize[i] = tensor->GetDim(i - 1);
else if (i == dim) else if (i == dim)
dimSize[i] = count; dimSize[i] = count;
} }
...@@ -160,7 +160,7 @@ void Stack(const TensorList &smalls, XTensor &t, int dim) ...@@ -160,7 +160,7 @@ void Stack(const TensorList &smalls, XTensor &t, int dim)
if (i < dim) if (i < dim)
dimSize[i] = tensor->GetDim(i); dimSize[i] = tensor->GetDim(i);
else if (i > dim) else if (i > dim)
dimSize[i] = tensor->GetDim(i-1); dimSize[i] = tensor->GetDim(i - 1);
else if (i == dim) else if (i == dim)
dimSize[i] = count; dimSize[i] = count;
} }
......
...@@ -31,7 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -31,7 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _Stack(const TensorList * smalls, XTensor * t, int dim); void _Stack(const TensorList * smalls, XTensor * t, int dim);
/* stack small tensors into a big tensor along with a dimension (return an XTensor structure) */ /* stack small tensors into a big tensor along with a dimension (return an XTensor structure) */
XTensor Stack(const TensorList &list, int leadingDim); XTensor Stack(const TensorList &list, int dim);
/* stack small tensors into a big tensor along with a dimension */ /* stack small tensors into a big tensor along with a dimension */
void Stack(const TensorList &smalls, XTensor &t, int dim); void Stack(const TensorList &smalls, XTensor &t, int dim);
......
...@@ -35,8 +35,9 @@ get the top-k items along a given dimension ...@@ -35,8 +35,9 @@ get the top-k items along a given dimension
>> index - index of the top-k items >> index - index of the top-k items
>> dim - the dimension along which the sorting is performed >> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting >> k - how many items returned after sorting
>> isSorted - indicates whether the k items are sorted
*/ */
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted)
{ {
dim = MODX(dim, a->order); dim = MODX(dim, a->order);
...@@ -58,7 +59,7 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -58,7 +59,7 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
if (a->devID >= 0 || b->devID >= 0) { if (a->devID >= 0 || b->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
_CudaTopK(a, b, index, dim, k); _CudaTopK(a, b, index, dim, k, isSorted);
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
...@@ -116,15 +117,16 @@ get the top-k items along a given dimension ...@@ -116,15 +117,16 @@ get the top-k items along a given dimension
>> index - index of the top-k items >> index - index of the top-k items
>> dim - the dimension along which the sorting is performed >> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting >> k - how many items returned after sorting
>> isSorted - indicates whether the k items are returned in sorted order
*/ */
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k) void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k, bool isSorted)
{ {
dim = MODX(dim, a.order); dim = MODX(dim, a.order);
if(a.dimSize[dim] <= k) if(a.dimSize[dim] <= k)
_Sort(&a, &b, &index, dim); _Sort(&a, &b, &index, dim);
else else
_TopK(&a, &b, &index, dim, k); _TopK(&a, &b, &index, dim, k, isSorted);
/* tensor connection */ /* tensor connection */
//TensorList list(2); //TensorList list(2);
......
...@@ -374,9 +374,10 @@ get the top-k items ...@@ -374,9 +374,10 @@ get the top-k items
>> minValue - min value of an item >> minValue - min value of an item
>> output - the output data array >> output - the output data array
>> index - the output index array >> index - the output index array
>> isSorted - indicates whether the k items are returned in sorted order
*/ */
template<class T> __global__ template<class T> __global__
void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int * index) void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int * index, bool isSorted)
{ {
__shared__ CudaHeapNode<T> heapData[(SHARED_MEMORY_SIZE - 512 * sizeof(T)) / sizeof(CudaHeapNode<T>)]; __shared__ CudaHeapNode<T> heapData[(SHARED_MEMORY_SIZE - 512 * sizeof(T)) / sizeof(CudaHeapNode<T>)];
__shared__ T eachHeapMaxValue[512]; __shared__ T eachHeapMaxValue[512];
...@@ -479,11 +480,24 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi ...@@ -479,11 +480,24 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
int offset = stride * k * blockIndex + offsetInBlock; int offset = stride * k * blockIndex + offsetInBlock;
T * dOutput = output + offset; T * dOutput = output + offset;
int * indexOutput = index + offset; int * indexOutput = index + offset;
for (int q = 0; q < k; ++q){ if (isSorted)
{
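/* note added for clarity (assumption): ansHeapData is a min-heap over the k
   candidates, so its root holds the smallest of them; popping the root
   repeatedly and filling positions k-1 down to 0 yields the k items in
   descending order */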
for (int q = k - 1; q >= 0; q--) {
dOutput[stride * q] = ansHeapData.items[0].value;
indexOutput[stride * q] = ansHeapData.items[0].index;
ansHeapData.items[0] = ansHeapData.items[ansHeapData.count - 1];
ansHeapData.count--;
ansHeapData.Down(0);
}
}
else
{
for (int q = 0; q < k; ++q) {
dOutput[stride * q] = ansHeapData.items[q].value; dOutput[stride * q] = ansHeapData.items[q].value;
indexOutput[stride * q] = ansHeapData.items[q].index; indexOutput[stride * q] = ansHeapData.items[q].index;
} }
} }
}
} }
...@@ -803,8 +817,9 @@ get the top-k items along a given dimension ...@@ -803,8 +817,9 @@ get the top-k items along a given dimension
>> index - index of the top-k items >> index - index of the top-k items
>> dim - the dimension along which the sorting is performed >> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting >> k - how many items returned after sorting
>> isSorted - indicates whether the k items are returned in sorted order
*/ */
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted)
{ {
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched input tensors!"); CheckNTErrors((a->unitSize == b->unitSize), "Unmatched input tensors!");
CheckNTErrors((a->order == b->order), "Unmatched input tensors!"); CheckNTErrors((a->order == b->order), "Unmatched input tensors!");
...@@ -846,7 +861,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -846,7 +861,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
KernelTopK3<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> KernelTopK3<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)a->data, stride, strideNumA, blockNum, k, DTYPE_MIN, ((DTYPE*)a->data, stride, strideNumA, blockNum, k, DTYPE_MIN,
(DTYPE*)b->data, (int*)index->data); (DTYPE*)b->data, (int*)index->data, isSorted);
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
...@@ -882,6 +897,10 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -882,6 +897,10 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
KernelTopKRadixSelect<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> (goutput, stride, strideNumA, blockNum, k, DTYPE_MIN, (DTYPE *)b->data, (int *)index->data, stride * strideNumA * blockNum); KernelTopKRadixSelect<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> (goutput, stride, strideNumA, blockNum, k, DTYPE_MIN, (DTYPE *)b->data, (int *)index->data, stride * strideNumA * blockNum);
deconvert2floatV2 <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> ((unsigned int *)a->data, (float *)goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride); deconvert2floatV2 <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> ((unsigned int *)a->data, (float *)goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride);
if (isSorted)
{
ShowNTErrors("TODO!");
}
} }
} }
......
...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k); void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,10 +27,10 @@ ...@@ -27,10 +27,10 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k); void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted = false);
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k); void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k, bool isSorted = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
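A brief usage sketch (not part of the patch) for the extended interface; shapes, values, and names are illustrative, and isSorted keeps its default of false when omitted:

/* top-2 items of a (2, 4) tensor along dimension 1, returned in sorted order */
int sDim[2] = {2, 4};
int tDim[2] = {2, 2};
DTYPE sData[2][4] = { {0.3F, 0.9F, 0.1F, 0.5F},
                      {0.7F, 0.2F, 0.8F, 0.4F} };

XTensor * a = NewTensorV2(2, sDim);
XTensor * b = NewTensorV2(2, tDim);
XTensor * index = NewTensorV2(2, tDim, X_INT);
a->SetData(sData, 8);

TopK(*a, *b, *index, 1, 2, true);

delete a;
delete b;
delete index;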
......
...@@ -78,6 +78,8 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem) ...@@ -78,6 +78,8 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
if(m->mem == NULL) if(m->mem == NULL)
delete[] (char*)m->data; delete[] (char*)m->data;
else
m->mem->Release(m->data, m->GetDataSizeInChar(), m->signature);
m->dataHost = NULL; m->dataHost = NULL;
m->data = GPUData + p; m->data = GPUData + p;
...@@ -94,7 +96,36 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem) ...@@ -94,7 +96,36 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
#endif #endif
} }
/* copy the data from GPU memory to CPU memory */ /* copy the data from GPU memory to CPU memory (memory pool) */
void CudaGPUToCPUFlush(XTensor * tensor, int devID, XMem * CPUMem)
{
#ifdef USE_CUDA
CheckNTErrors((tensor->devID >= 0), "Cannot do cpu-flush on matrices that are already on CPU.");
/* compute the required memory size */
int size = 0;
if (tensor->isSparse)
size = sizeof(int) + (sizeof(int) + tensor->unitSize) * tensor->unitNumNonZero;
else
size = tensor->unitSize * tensor->unitNum;
char * CPUData = CPUMem != NULL ? (char*)CPUMem->Alloc(CPUMem->devID, size):
(char*)XMemAlloc(devID, size);
/* copy from GPU memory to CPU memory */
cudaMemcpy(CPUData, tensor->data, size, cudaMemcpyDeviceToHost);
if (tensor->dataHost != NULL)
delete[](char*)tensor->dataHost;
tensor->dataHost = NULL;
tensor->mem->Release(tensor->data, tensor->GetDataSizeInChar(), tensor->signature);
tensor->data = CPUData;
tensor->devID = CPUMem != NULL ? CPUMem->devID : devID;
tensor->mem = CPUMem;
#endif
}
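A hedged sketch (not part of the patch) of how the new overload might be called; gpuMem is an assumed, pre-built memory pool on GPU 0, and -1 stands for the CPU device:

void FlushExample(XMem * gpuMem)
{
    int dims[2] = {2, 4};
    XTensor t;

    /* a tensor allocated from the GPU memory pool */
    InitTensorV2(&t, 2, dims, X_FLOAT, 1.0F, 0, gpuMem);

    /* ... fill and use t on the GPU ... */

    /* flush the data back to the host; passing NULL for the CPU pool
       makes the function allocate ordinary CPU memory instead */
    CudaGPUToCPUFlush(&t, -1, NULL);
}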
/* copy the data from GPU memory to CPU memory (dataHost) and do not delete the data */
void CudaGPUToCPUFlush(XTensor * tensor) void CudaGPUToCPUFlush(XTensor * tensor)
{ {
CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type."); CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
......
...@@ -31,7 +31,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -31,7 +31,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* flush a list of XTensor to GPU memory */ /* flush a list of XTensor to GPU memory */
void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem); void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem);
/* copy the data from GPU memory to CPU memory */ /* copy the data from GPU memory to CPU memory (memory pool) */
void CudaGPUToCPUFlush(XTensor * tensor, int devID, XMem * CPUMem);
/* copy the data from GPU memory to CPU memory (dataHost) and do not delete the data */
void CudaGPUToCPUFlush(XTensor * tensor); void CudaGPUToCPUFlush(XTensor * tensor);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -293,7 +293,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -293,7 +293,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!"); CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
CheckNTErrors((gold != NULL), "The gold standard cannot be empty!"); CheckNTErrors((gold != NULL || lossName == NOLOSS), "The gold standard cannot be empty!");
if(leadDim < 0) if(leadDim < 0)
leadDim = y->order - 1; leadDim = y->order - 1;
......
...@@ -33,6 +33,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim); ...@@ -33,6 +33,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */ /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */
XTensor LogSoftmax(const XTensor &x, int leadDim); XTensor LogSoftmax(const XTensor &x, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y given) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim); void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y given) */ /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y given) */
......
...@@ -358,21 +358,21 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold, ...@@ -358,21 +358,21 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim); _CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossBuf); _ReduceSumAll(lossBuf, &loss);
if(reduceWay == REDUCE_MEAN) { if(reduceWay == REDUCE_MEAN) {
int nonZeroNum; DTYPE nonZeroNum;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = lossBuf->unitNum; nonZeroNum = (DTYPE)lossBuf->unitNum;
} }
else { else {
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem); XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp); _ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp); DelTensorBuf(tmp);
} }
loss = loss / (DTYPE)nonZeroNum; loss = loss / nonZeroNum;
} }
else if(reduceWay == REDUCE_SUM) { else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */ /* don't need to do anything */
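For reference, the mean reduction above (unchanged by this patch except that the counter is now accumulated as DTYPE rather than int) computes

loss = (\sum_i loss_i) / nonZeroNum, where nonZeroNum = \sum_i [padding_i != 0] when a padding tensor is given and nonZeroNum = lossBuf->unitNum otherwise.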
...@@ -675,8 +675,9 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, ...@@ -675,8 +675,9 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
if(padding != NULL) { if(padding != NULL) {
XTensor * tmp = NewTensor(padding); XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp); DTYPE nonZeroNum;
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); _ReduceSumAll(tmp, &nonZeroNum);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/nonZeroNum);
delete tmp; delete tmp;
} }
else { else {
......
...@@ -123,21 +123,21 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold, ...@@ -123,21 +123,21 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
_CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim); _CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossBuf); _ReduceSumAll(lossBuf, &loss);
if(reduceWay == REDUCE_MEAN) { if(reduceWay == REDUCE_MEAN) {
int nonZeroNum; DTYPE nonZeroNum;
if(padding == NULL) { if(padding == NULL) {
nonZeroNum = lossBuf->unitNum; nonZeroNum = (DTYPE)lossBuf->unitNum;
} }
else { else {
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem); XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp); _ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp); DelTensorBuf(tmp);
} }
loss = loss / (DTYPE)nonZeroNum; loss = loss / nonZeroNum;
} }
else if(reduceWay == REDUCE_SUM) { else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */ /* don't need to do anything */
...@@ -199,8 +199,9 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, ...@@ -199,8 +199,9 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
if(padding != NULL) { if(padding != NULL) {
XTensor * tmp = NewTensor(padding); XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp); DTYPE nonZeroNum;
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); _ReduceSumAll(tmp, &nonZeroNum);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/nonZeroNum);
delete tmp; delete tmp;
} }
else { else {
......
...@@ -606,7 +606,7 @@ bool TestCopyIndexed4() ...@@ -606,7 +606,7 @@ bool TestCopyIndexed4()
/* /*
case 5: copy indexed sub-tensors case 5: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 1, In this case, (3, 2, 3) -> (3, 2, 4), dim = 2, indexSize = 2,
srcIndex = [0, 1], tgtIndex = [0, 2], copyNum = 2. srcIndex = [0, 1], tgtIndex = [0, 2], copyNum = 2.
*/ */
bool TestCopyIndexed5() bool TestCopyIndexed5()
...@@ -622,7 +622,7 @@ bool TestCopyIndexed5() ...@@ -622,7 +622,7 @@ bool TestCopyIndexed5()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
/* a output tensor of size (3, 2, 2) */ /* a output tensor of size (3, 2, 4) */
int tOrder = 3; int tOrder = 3;
int * tDimSize = new int[tOrder]; int * tDimSize = new int[tOrder];
tDimSize[0] = 3; tDimSize[0] = 3;
...@@ -749,6 +749,152 @@ bool TestCopyIndexed5() ...@@ -749,6 +749,152 @@ bool TestCopyIndexed5()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 6: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 4), dim = 2, indexSize = 2,
srcIndex = [0, 2], tgtIndex = [3, 0], copyNum = 1.
*/
bool TestCopyIndexed6()
{
/* a input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output tensor of size (3, 2, 4) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 4;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE tData[3][2][4] = { { {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F} },
{ {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F}},
{ {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F} } };
DTYPE answer[3][2][4] = { { {2.0F, 5.0F, 5.0F, 0.0F},
{3.0F, 5.0F, 5.0F, 2.0F} },
{ {4.0F, 5.0F, 5.0F, 1.0F},
{2.0F, 5.0F, 5.0F, 3.0F}},
{ {2.0F, 5.0F, 5.0F, -1.0F},
{0.0F, 5.0F, 5.0F, 1.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {3, 0};
int copyNum = 1;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t1 = NewTensorV2(tOrder, tDimSize);
XTensor * t2 = NewTensorV2(tOrder, tDimSize);
XTensor * sIndex = NewTensorV2(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensorV2(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s->SetData(sData, sUnitNum);
t1->SetData(tData, tUnitNum);
t2->SetData(tData, tUnitNum);
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
/* check results */
cpuTest = _CheckData(t1, answer, tUnitNum) &&
_CheckData(t2, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensorV2(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensorV2(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensorV2(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensorV2(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetData(tData, tUnitNum);
tGPU2->SetData(tData, tUnitNum);
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
/* check results */
gpuTest = _CheckData(tGPU1, answer, tUnitNum) &&
_CheckData(tGPU2, answer, tUnitNum);
/* destroy variables */
delete s;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -805,6 +951,15 @@ bool TestCopyIndexed() ...@@ -805,6 +951,15 @@ bool TestCopyIndexed()
else else
XPRINT(0, stdout, ">> case 5 passed!\n"); XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestCopyIndexed6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
case 1: matrix multiplication of the two tensors. case 1: matrix multiplication of the two tensors.
In this case, a=(2, 3), b=(2, 3) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS. In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMulBatched1() bool TestMatrixMulBatched1()
{ {
......
...@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
case 1: element-wise product of two tensors case 1: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0. In this case, (2, 2) * (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/ */
bool TestMultiply1() bool TestMultiply1()
{ {
...@@ -149,6 +149,131 @@ bool TestMultiply1() ...@@ -149,6 +149,131 @@ bool TestMultiply1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 3, 4) * (2, 1, 1) -> (2, 3, 4), alpha=0.
*/
bool TestMultiply2()
{
/* a source tensor of size (2, 3, 4) */
int sOrder1 = 3;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
sDimSize1[2] = 4;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 1, 1) */
int sOrder2 = 3;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
sDimSize2[1] = 1;
sDimSize2[2] = 1;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 3, 4) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 3;
tDimSize[2] = 4;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3][4] = { { {0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F} },
{ {3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F} } };
DTYPE sData2[2][1][1] = { { {1.0F} },
{ {-1.0F} } };
DTYPE answer[2][3][4] = { { {0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F} },
{ {-3.0F, -2.0F, -1.0F, 0.0F},
{0.0F, -1.0F, -2.0F, -3.0F},
{-3.0F, -2.0F, -1.0F, 0.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensorV2(sOrder1, sDimSize1);
XTensor * s2 = NewTensorV2(sOrder2, sDimSize2);
XTensor * tMe = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
tMe->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
/* call Multiply function */
MultiplyMe(*tMe, *s2, 0);
tUser = Multiply(*s1, *s2);
/* check results */
cpuTest = _CheckData(tMe, answer, tUnitNum, 1e-4F) &&
_CheckData(&tUser, answer, tUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensorV2(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensorV2(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
/* call Multiply function */
MultiplyMe(*tMeGPU, *sGPU2, 0);
tUserGPU = Multiply(*sGPU1, *sGPU2);
/* check results */
gpuTest = _CheckData(tMeGPU, answer, tUnitNum, 1e-4F) &&
_CheckData(&tUserGPU, answer, tUnitNum, 1e-4F);
/* destroy variables */
delete s1;
delete s2;
delete tMe;
delete sGPU1;
delete sGPU2;
delete tMeGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete tMe;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -170,6 +295,16 @@ bool TestMultiply() ...@@ -170,6 +295,16 @@ bool TestMultiply()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMultiply2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -141,6 +141,90 @@ bool TestReduceMax1() ...@@ -141,6 +141,90 @@ bool TestReduceMax1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: get the max value of the items along a dimension of the tensor; the result is a scalar tensor.
In this case,
(4) -> scalar, dim = 0
*/
bool TestReduceMax2()
{
/* a input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 5.0F, 2.0F, 3.0F};
DTYPE answer[1] = {5.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceMax function */
_ReduceMax(s, t, 0);
tUser = ReduceMax(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceMax function */
_ReduceMax(sGPU, tGPU, 0);
tUserGPU = ReduceMax(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -161,6 +245,15 @@ bool TestReduceMax() ...@@ -161,6 +245,15 @@ bool TestReduceMax()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMax2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -136,6 +136,85 @@ bool TestReduceMean1() ...@@ -136,6 +136,85 @@ bool TestReduceMean1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: get the mean value along a dimension of the tensor; the result is a scalar tensor */
bool TestReduceMean2()
{
/* a tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE answer[1] = {1.5F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceMean function */
_ReduceMean(s, t, 0);
tUser = ReduceMean(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceMean function */
_ReduceMean(sGPU, tGPU, 0);
tUserGPU = ReduceMean(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -156,6 +235,15 @@ bool TestReduceMean() ...@@ -156,6 +235,15 @@ bool TestReduceMean()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMean2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* other cases test */ ///* other cases test */
///* ///*
//TODO!! //TODO!!
......
...@@ -607,6 +607,89 @@ bool TestReduceSum6() ...@@ -607,6 +607,89 @@ bool TestReduceSum6()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 7: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
(4) -> scalar, dim = 0
*/
bool TestReduceSum7()
{
/* a tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a scalar */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE answer[1] = {6.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(s, t, 0);
tUser = ReduceSum(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 0);
tUserGPU = ReduceSum(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
...@@ -673,6 +756,15 @@ bool TestReduceSum() ...@@ -673,6 +756,15 @@ bool TestReduceSum()
else else
XPRINT(0, stdout, ">> case 6 passed!\n"); XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestReduceSum7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -55,7 +55,7 @@ bool TestReduceSumAll1() ...@@ -55,7 +55,7 @@ bool TestReduceSumAll1()
s->SetData(sData, sUnitNum); s->SetData(sData, sUnitNum);
/* call ReduceSumAll function */ /* call ReduceSumAll function */
summation = _ReduceSumAll(s); summation = ReduceSumAllValue(*s);
/* check results */ /* check results */
cpuTest = (fabs(answer - summation) < 1e-4F); cpuTest = (fabs(answer - summation) < 1e-4F);
...@@ -71,7 +71,7 @@ bool TestReduceSumAll1() ...@@ -71,7 +71,7 @@ bool TestReduceSumAll1()
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
/* call ReduceSumAll function */ /* call ReduceSumAll function */
summation = _ReduceSumAll(sGPU); summation = ReduceSumAllValue(*sGPU);
/* check results */ /* check results */
gpuTest = (fabs(answer - summation) < 1e-4F); gpuTest = (fabs(answer - summation) < 1e-4F);
......
...@@ -240,6 +240,104 @@ bool TestReduceSumSquared2() ...@@ -240,6 +240,104 @@ bool TestReduceSumSquared2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 3: squared sum of the items along a dimension of the tensor; the result is a scalar tensor.
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
In this case, (4) -> scalar, dim = 0.
*/
bool TestReduceSumSquared3()
{
/* a input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
/* a scalar shift tensor */
int shiftOrder = 0;
int * shiftDimSize = new int[MAX_TENSOR_DIM_NUM];
int shiftUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE shiftData[1] = {-1.0F};
DTYPE answer[1] = {30.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor * shift = NewTensorV2(shiftOrder, shiftDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
shift->SetData(shiftData, shiftUnitNum);
t->SetZeroAll();
/* call ReduceSumSquared function */
_ReduceSumSquared(s, t, 0, shift);
tUser = ReduceSumSquared(*s, 0, *shift);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensorV2(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU->SetData(shiftData, shiftUnitNum);
tGPU->SetZeroAll();
/* call ReduceSumSquared function */
_ReduceSumSquared(sGPU, tGPU, 0, shiftGPU);
tUserGPU = ReduceSumSquared(*sGPU, 0, *shiftGPU);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete shift;
delete sGPU;
delete tGPU;
delete shiftGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete shift;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -264,10 +362,19 @@ bool TestReduceSumSquared() ...@@ -264,10 +362,19 @@ bool TestReduceSumSquared()
caseFlag = TestReduceSumSquared2(); caseFlag = TestReduceSumSquared2();
if (!caseFlag) { if (!caseFlag) {
returnFlag = false; returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n"); XPRINT(0, stdout, ">> case 2 failed!\n");
} }
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestReduceSumSquared3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */ /* other cases test */
/* /*
......
...@@ -132,6 +132,104 @@ bool TestReduceVariance1() ...@@ -132,6 +132,104 @@ bool TestReduceVariance1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: variance of the items along a dimension of the tensor; the result is a scalar tensor.
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
In this case, (4) -> scalar, dim = 0.
*/
bool TestReduceVariance2()
{
/* a input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
/* a mean scalar tensor */
int meanOrder = 0;
int * meanDimSize = new int[MAX_TENSOR_DIM_NUM];
int meanUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE meanData[1] = {1.5F};
DTYPE answer[1] = {1.25F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor * mean = NewTensorV2(meanOrder, meanDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
mean->SetData(meanData, meanUnitNum);
t->SetZeroAll();
/* call ReduceVariance function */
_ReduceVariance(s, t, 0, mean);
tUser = ReduceVariance(*s, 0, *mean);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * meanGPU = NewTensorV2(meanOrder, meanDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
meanGPU->SetData(meanData, meanUnitNum);
tGPU->SetZeroAll();
/* call ReduceVariance function */
_ReduceVariance(sGPU, tGPU, 0, meanGPU);
tUserGPU = ReduceVariance(*sGPU, 0, *meanGPU);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete mean;
delete sGPU;
delete tGPU;
delete meanGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete mean;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -152,6 +250,15 @@ bool TestReduceVariance() ...@@ -152,6 +250,15 @@ bool TestReduceVariance()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceVariance2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -32,7 +32,7 @@ spread a collection tensor to source tensor. ...@@ -32,7 +32,7 @@ spread a collection tensor to source tensor.
*/ */
bool TestSpread1() bool TestSpread1()
{ {
/* a input tensor of size (2, 4, 3) */ /* a input tensor of size (4, 4, 3) */
int sOrder = 3; int sOrder = 3;
int * sDimSize = new int[sOrder]; int * sDimSize = new int[sOrder];
sDimSize[0] = 4; sDimSize[0] = 4;
......
...@@ -215,6 +215,305 @@ bool TestSub2() ...@@ -215,6 +215,305 @@ bool TestSub2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: tensor subtraction c = a - b * \beta, where b is a scalar tensor */
bool TestSub3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a scalar */
int bOrder = 0;
int * bDimSize = new int[MAX_TENSOR_DIM_NUM];
int bUnitNum = 1;
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[1] = {-1.0F};
DTYPE beta = 2.0F;
DTYPE answer[2][4] = { {2.0F, 3.0F, 4.0F, 5.0F},
{6.0F, 7.0F, 8.0F, 9.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: tensor subtraction c = a - b * \beta, where b is a 1d tensor */
bool TestSub4()
{
/* a tensor of size (3, 4, 2) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 4;
aDimSize[2] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (3, 4, 2) */
int cOrder = 3;
int * cDimSize = new int[cOrder];
cDimSize[0] = 3;
cDimSize[1] = 4;
cDimSize[2] = 2;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[3][4][2] = { { {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} },
{ {0.0F, -1.0F}, {-2.0F, -3.0F}, {-4.0F, -5.0F}, {-6.0F, -7.0F} },
{ {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[3][4][2] = { { {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F} },
{ {2.0F, 1.0F}, {-2.0F, -3.0F}, {-6.0F, -7.0F}, {-10.0F, -11.0F} },
{ {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 5: tensor subtraction c = a - b * \beta, where b is a 1d tensor */
bool TestSub5()
{
/* a tensor of size (4, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 4;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (4, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 4;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F },
{4.0F, 5.0F, 6.0F, 7.0F },
{0.0F, -1.0F, -2.0F, -3.0F },
{-4.0F, -5.0F, -6.0F, -7.0F } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[4][4] = { {2.0F, 1.0F, 0.0F, -1.0F },
{6.0F, 5.0F, 4.0F, 3.0F },
{2.0F, -1.0F, -4.0F, -7.0F },
{-2.0F, -5.0F, -8.0F, -11.0F } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
...@@ -244,6 +543,33 @@ bool TestSub() ...@@ -244,6 +543,33 @@ bool TestSub()
else else
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSub3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSub4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSub5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "../core/utilities/CheckData.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
#include "TSubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 0);
_SubDim(cMe, b, 0);
cUser = SubDim(*a, *b, 0);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 0);
_SubDim(cMeGPU, bGPU, 0);
cUserGPU = SubDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 1);
_SubDim(cMe, b, 1);
cUser = SubDim(*a, *b, 1);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 1);
_SubDim(cMeGPU, bGPU, 1);
cUserGPU = SubDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SubDim Function */
bool TestSubDim()
{
XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSubDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSubDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __TEST_SUBDIM_H__
#define __TEST_SUBDIM_H__
#include "../core/arithmetic/SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SubDim Function */
bool TestSubDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUBDIM_H__
...@@ -215,6 +215,305 @@ bool TestSum2() ...@@ -215,6 +215,305 @@ bool TestSum2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: tensor summation c = a + b * \beta, where b is a scalar tensor */
bool TestSum3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a scalar */
int bOrder = 0;
int * bDimSize = new int[MAX_TENSOR_DIM_NUM];
int bUnitNum = 1;
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[1] = {-1.0F};
DTYPE beta = 2.0F;
DTYPE answer[2][4] = { {-2.0F, -1.0F, 0.0F, 1.0F},
{2.0F, 3.0F, 4.0F, 5.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: tensor summation c = a + b * \beta, where b is a 1d tensor */
bool TestSum4()
{
/* a tensor of size (3, 4, 2) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 4;
aDimSize[2] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (3, 4, 2) */
int cOrder = 3;
int * cDimSize = new int[cOrder];
cDimSize[0] = 3;
cDimSize[1] = 4;
cDimSize[2] = 2;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[3][4][2] = { { {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} },
{ {0.0F, -1.0F}, {-2.0F, -3.0F}, {-4.0F, -5.0F}, {-6.0F, -7.0F} },
{ {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[3][4][2] = { { {-2.0F, -1.0F}, {2.0F, 3.0F}, {6.0F, 7.0F}, {10.0F, 11.0F} },
{ {-2.0F, -3.0F}, {-2.0F, -3.0F}, {-2.0F, -3.0F}, {-2.0F, -3.0F} },
{ {-2.0F, -1.0F}, {2.0F, 3.0F}, {6.0F, 7.0F}, {10.0F, 11.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 5: tensor summation c = a + b * \beta, where b is a 1d tensor broadcast over a 2d tensor a */
bool TestSum5()
{
/* a tensor of size (4, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 4;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (4, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 4;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F },
{4.0F, 5.0F, 6.0F, 7.0F },
{0.0F, -1.0F, -2.0F, -3.0F },
{-4.0F, -5.0F, -6.0F, -7.0F } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[4][4] = { {-2.0F, 1.0F, 4.0F, 7.0F },
{2.0F, 5.0F, 8.0F, 11.0F },
{-2.0F, -1.0F, 0.0F, 1.0F },
{-6.0F, -5.0F, -4.0F, -3.0F } };
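/* expected result: b is broadcast over the rows of a, i.e., answer[i][j] = aData[i][j] + bData[j] * beta */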
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
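For readers of the new cases, a minimal caller-side sketch of the broadcasting summation they exercise (c = a + b * beta, with a scalar or lower-order b added to every matching slice of a); the shapes, the random initialization, and the function name below are illustrative only:
/* a sketch only: caller-side use of the broadcasting Sum tested by cases 3-5 */
void SumBroadcastSketch()
{
    int aDims[2] = {4, 4};
    int bDims[1] = {4};
    XTensor * a = NewTensorV2(2, aDims);
    XTensor * b = NewTensorV2(1, bDims);
    a->SetDataRand(-1.0F, 1.0F);
    b->SetDataRand(-1.0F, 1.0F);
    /* b is added to every row of a: c[i][j] = a[i][j] + b[j] * 2.0 */
    XTensor c = Sum(*a, *b, 2.0F);
    delete a;
    delete b;
}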
/* other cases */
/* /*
TODO!! TODO!!
...@@ -244,6 +543,33 @@ bool TestSum()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSum3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSum4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSum5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/* /*
TODO!! TODO!!
......