Commit 38bff350 by xuchen

merge with liyinqiao branch

parent 509c0233
......@@ -32,10 +32,6 @@
//#include <stdlib.h>
//#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
......
......@@ -200,7 +200,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAll(lossTensor);
float lossBatch = ReduceSumAllValue(lossTensor);
DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
......@@ -345,7 +345,7 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAll(lossTensor);
float lossBatch = ReduceSumAllValue(lossTensor);
/* dump the test result */
for(int s = 0; s < bSize; s++){
......
......@@ -130,6 +130,39 @@ void InitTensor(XTensor * tensor,
}
/*
initialize a scalar V2
>> tensor - the tensor we intend to initialize
>> myDataType - data type of each element (e.g., int, float, and double)
>> myDevID - when myMem is NULL, myDevID specifies the device
on which we allocate the data on site
>> myMem - memory pool used to allocate the data array;
myMem = NULL means that the tensor is allocated on
the device dynamically, rather than from the memory pool
*/
void InitTensor0DV2(XTensor * tensor, const TENSOR_DATA_TYPE myDataType, const int myDevID, XMem * myMem)
{
int dims[MAX_TENSOR_DIM_NUM];
InitTensorV2(tensor, 0, dims, myDataType, 1.0F, myDevID, myMem);
}
/*
initialize a scalar
>> tensor - the tensor we intend to initialize
>> myDataType - data type of each element (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
>> isEnableGrad - whether to enable gradient computation for this tensor
*/
void InitTensor0D(XTensor * tensor, const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[MAX_TENSOR_DIM_NUM];
InitTensor(tensor, 0, dims, myDataType, myDevID, isEnableGrad);
}
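A minimal usage sketch of the new scalar initializer; this is illustrative only and not part of the commit (the V2 variant above additionally takes a memory pool, and device 0 assumes a GPU build):
XTensor s;
XTensor g;
InitTensor0D(&s, X_FLOAT);     /* order-0 (scalar) float tensor on the CPU (devID defaults to -1)  */
InitTensor0D(&g, X_INT, 0);    /* scalar int tensor on device 0; grad follows the X_ENABLE_GRAD default */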
/*
initialize a dense tensor V2
>> tensor - the tensor we intend to initialize
>> num - number of elements
......@@ -551,6 +584,37 @@ XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnable
}
/*
generate a scalar V2
>> myDataType - data type of each element (e.g., int, float, and double)
>> myDevID - when myMem is NULL, myDevID specifies the device
on which we allocate the data on site
>> myMem - memory pool used to allocate the data array;
myMem = NULL means that the tensor is allocated on
the device dynamically, rather than from the memory pool.
*/
XTensor * NewTensor0DV2(const TENSOR_DATA_TYPE myDataType, const int myDevID, XMem * myMem)
{
int dims[MAX_TENSOR_DIM_NUM];
return NewTensorV2(0, dims, myDataType, 1.0F, myDevID, myMem);
}
/*
generate a scalar
>> myDataType - data type of each element (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
>> isEnableGrad - whether to enable gradient computation for this tensor
*/
XTensor * NewTensor0D(const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[MAX_TENSOR_DIM_NUM];
return NewTensor(0, dims, myDataType, myDevID, isEnableGrad);
}
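Similarly for the heap-allocating variants; a hedged sketch (releasing the tensor with `delete` assumes the pointer comes from `new`, which matches the other NewTensor* functions in this file):
XTensor * s = NewTensor0D(X_FLOAT);    /* scalar on the CPU with the default settings */
/* ... use *s ... */
delete s;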
/*
generate a dense vector V2
>> num - number of entries
>> myDataType - data type of each element (e.g., int, float, and double)
......@@ -799,7 +863,7 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
if(a->order > 0)
if(a->order >= 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order);
if(!isFilledData)
......@@ -810,7 +874,6 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
a->devID, a->mem);
return newTensor;
}
/*
......
......@@ -26,6 +26,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* default settings */
#define X_ENABLE_GRAD true
/*
* we define the "new and delete" functions below
*/
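The new X_ENABLE_GRAD macro centralizes the default of every isEnableGrad parameter declared below; a small illustrative sketch (the explicit `false` call simply overrides the default):
XTensor * a = NewTensor2D(3, 4);                       /* isEnableGrad defaults to X_ENABLE_GRAD (true) */
XTensor * b = NewTensor2D(3, 4, X_FLOAT, -1, false);   /* gradient tracking disabled for this tensor    */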
......@@ -38,7 +41,13 @@ void InitTensorV2(XTensor * tensor,
/* initialize a dense XTensor */
void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a scalar V2 */
void InitTensor0DV2(XTensor * tensor, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a scalar */
void InitTensor0D(XTensor * tensor, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
......@@ -46,7 +55,7 @@ void InitTensor1DV2(XTensor * tensor, const int num,
/* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
......@@ -54,7 +63,7 @@ void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
/* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
......@@ -62,7 +71,7 @@ void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
......@@ -70,7 +79,7 @@ void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
......@@ -78,7 +87,7 @@ void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a tensor with a reference tensor V2 */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
......@@ -98,7 +107,7 @@ XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DAT
/* generate a dense XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
......@@ -107,20 +116,26 @@ XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
/* generate a dense XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnableGrad = true);
XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a scalar V2 */
XTensor * NewTensor0DV2(const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* generate a scalar */
XTensor * NewTensor0D(const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
......@@ -130,7 +145,7 @@ XTensor * NewTensor2DV2(const int rowNum, const int colNum,
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
......@@ -140,7 +155,7 @@ XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
......@@ -150,7 +165,7 @@ XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
......@@ -160,10 +175,10 @@ XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3,
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense vector by range */
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = true);
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
......
......@@ -78,7 +78,7 @@ namespace nts {
if(!(x)) \
{ \
fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
exit(1); \
throw; \
} \
} \
......@@ -87,7 +87,7 @@ namespace nts {
if(!(x)) \
{ \
fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
exit(1); \
throw; \
} \
} \
......@@ -95,7 +95,7 @@ namespace nts {
{ \
{ \
fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
exit(1); \
throw; \
} \
} \
......
......@@ -167,7 +167,7 @@ void XLink::SetType(int id)
type[0] = 0;
strcpy(type, GetOPName(id));
typeID = id;
if(id != 0){
if (id != 0) {
CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
}
}
......
......@@ -249,26 +249,6 @@ inline int TensorListBase<T>::FindFirst(const T& item)
return -1;
}
template <>
inline int TensorListBase<Example>::FindFirst(const Example& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
template <>
inline int TensorListBase<Result>::FindFirst(const Result& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
/* clear the data array */
template <typename T>
void TensorListBase<T>::Clear()
......@@ -383,8 +363,7 @@ template struct TensorListBase<long>;
template struct TensorListBase<float>;
template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>;
template struct TensorListBase<Result>;
template struct TensorListBase<Example>;
template struct TensorListBase<uint64_t>;
template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
......@@ -26,6 +26,8 @@
#include "XMem.h"
#include "XGlobal.h"
#include <cstdint>
#ifndef __TensorList_H__
#define __TensorList_H__
......@@ -118,7 +120,14 @@ public:
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
T& operator[] (int i) { return GetItem(i); };
T& operator[] (int i) {
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return items[count + i];
else
return items[i];
};
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
};
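A minimal sketch of the new negative-index behavior of operator[]; only the operator itself comes from this change, and populating the list is assumed:
IntList idx;            /* assume idx already holds {10, 20, 30}, i.e. count == 3 */
int first = idx[0];     /* 10 */
int last  = idx[-1];    /* 30, i.e. items[count - 1] */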
......@@ -132,19 +141,7 @@ typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
struct Example {
int id;
IntList data;
};
struct Result {
int id;
IntList data;
};
typedef TensorListBase<Result> ResultList;
typedef TensorListBase<Example> ExampleList;
typedef TensorListBase<uint64_t> UInt64List;
typedef TensorListBase<XTensor*> TensorList;
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -53,6 +53,8 @@ const char * GetOPName(int type)
return "M_TAN";
else if (type == MATH_ROUND)
return "M_ROUND";
else if (type == MATH_RECIPROCAL)
return "M_RECIPROCAL";
else if (type == MATH_CLIP)
return "M_CLIP";
else if (type == MATH_DIV)
......@@ -105,6 +107,8 @@ const char * GetOPName(int type)
return "R_REDUCEMEAN";
else if (type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if (type == REDUCE_REDUCESUMALL)
return "R_REDUCESUMALL";
else if (type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED";
else if (type == REDUCE_REDUCEVARIANCE)
......@@ -113,6 +117,8 @@ const char * GetOPName(int type)
else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT)
return "G_SELECT";
else if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE";
else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES)
......
......@@ -44,8 +44,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1
#define MATH_RECIPROCAL MATH_ROUND + 1
#define MATH_CLIP MATH_ROUND + 1
#define MATH_CLIP MATH_RECIPROCAL + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MASK MATH_DIVDIM + 1
......@@ -76,7 +77,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1
#define REDUCE_REDUCESUMALL REDUCE_REDUCESUM + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUMALL + 1
#define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
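The new identifiers pair with the GetOPName() branches added above; a quick illustrative check:
printf("%s\n", GetOPName(MATH_RECIPROCAL));       /* prints "M_RECIPROCAL"   */
printf("%s\n", GetOPName(REDUCE_REDUCESUMALL));   /* prints "R_REDUCESUMALL" */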
/* data and shape related operations */
......
......@@ -147,7 +147,11 @@ void XStream::StreamSynchronize()
void XStream::ThreadSynchronize()
{
#ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif
}
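One possible (untested) way to fill the CUDA >= 10 branch left as TODO, using the documented successor of the deprecated call; this is a suggestion, not part of the commit:
#if CUDART_VERSION < 10000
    cudaThreadSynchronize();
#else
    cudaDeviceSynchronize();    /* replacement recommended by the CUDA runtime documentation */
#endif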
......
......@@ -97,7 +97,7 @@ XTensor::XTensor()
}
/* constructor */
XTensor::XTensor(const XTensor * reference)
XTensor::XTensor(const XTensor* reference)
{
Init();
SetDataPointer();
......@@ -112,9 +112,9 @@ constructor
>> myDevID - device id
>> myMem - memory pool used to allocate the data array
*/
XTensor::XTensor(const int myOrder, int myDevID, XMem * myMem)
XTensor::XTensor(const int myOrder, int myDevID, XMem* myMem)
{
CheckNTErrors((myOrder > 0), "Illegal tensor order1");
CheckNTErrors((myOrder >= 0), "Illegal tensor order!");
Init();
SetDataPointer();
......@@ -150,7 +150,7 @@ XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYP
}
/* copy constructor */
XTensor::XTensor(const XTensor &reference)
XTensor::XTensor(const XTensor& reference)
{
Init();
SetDataPointer();
......@@ -191,7 +191,7 @@ XTensor::XTensor(const XTensor &reference)
}
/* copy constructor (with right value reference) */
XTensor::XTensor(const XTensor &&reference)
XTensor::XTensor(const XTensor&& reference)
{
Init();
SetDataPointer();
......@@ -230,7 +230,7 @@ XTensor::~XTensor()
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
data = NULL;
......@@ -248,7 +248,7 @@ XTensor::~XTensor()
}
/* set the name of the tensor */
void XTensor::SetName(const char * myName)
void XTensor::SetName(const char* myName)
{
strcpy(name, myName);
}
......@@ -280,7 +280,7 @@ void XTensor::Init()
isTmp = false;
isGrad = false;
isVar = false;
enableGrad = true;
enableGrad = X_ENABLE_GRAD;
visitMark = 0;
grad = NULL;
}
......@@ -307,7 +307,7 @@ shallow copy of the tensor
Note that we do not copy data array here
>> tensor - the source tensor
*/
void XTensor::ShallowCopy(const XTensor &tensor)
void XTensor::ShallowCopy(const XTensor& tensor)
{
strcpy(name, tensor.name);
order = tensor.order;
......@@ -335,7 +335,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
......@@ -367,7 +367,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
else{
/* hard copy of the data array */
int size = unitNum * unitSize;
if( isInit && !isSparse && !tensor.isSparse &&
if(isInit && !isSparse && !tensor.isSparse &&
size == tensor.unitNum * tensor.unitSize &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
data != NULL)
......@@ -412,7 +412,7 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
......@@ -500,7 +500,7 @@ XTensor XTensor::operator/ (const XTensor& tensor) const
/* overloading of the division-sign */
XTensor XTensor::operator/ (const DTYPE scale) const
{
return ScaleAndShift(*this, (DTYPE)1/scale, 0);
return ScaleAndShift(*this, (DTYPE)1.0F / scale, 0);
}
/*
......@@ -518,20 +518,15 @@ relocate the data on the target device
>> myDevId - target device id
>> myMem - memory pool on the target device
*/
void XTensor::SetDevice(int myDevId, XMem * myMem)
void XTensor::SetDevice(int myDevId, XMem* myMem)
{
if (myMem != NULL) {
FlushToMem(myMem);
isInGlobalMem = false;
}
else {
if(myMem == NULL)
myMem = GMems.GetMem(myDevId);
FlushToMem(myMem);
isInGlobalMem = false;
}
}
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim)
{
if(a == NULL || b == NULL)
return false;
......@@ -566,7 +561,7 @@ bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
set the size of each dimension
>> myDimSize - size of each dimension
*/
void XTensor::SetDim(int * myDimSize)
void XTensor::SetDim(int* myDimSize)
{
for (int i = 0; i < order; i++) {
dimSize[i] = myDimSize[i];
......@@ -594,7 +589,7 @@ reshape the tensor
>> myOrder - order of the tensor
>> myDimSize - size of each dimension
*/
void XTensor::Reshape(const int myOrder, const int * myDimSize)
void XTensor::Reshape(const int myOrder, const int* myDimSize)
{
int dims[MAX_TENSOR_DIM_NUM];
int num = 1;
......@@ -664,6 +659,30 @@ XTensor XTensor::TypeAs(const XTensor input)
return ConvertDataType(*this, input.dataType);
}
/* return a tensor that datatype is integer */
XTensor XTensor::Int()
{
return ConvertDataType(*this, X_INT);
}
/* return a tensor that datatype is float */
XTensor XTensor::Float()
{
return ConvertDataType(*this, X_FLOAT);
}
/* return a tensor that datatype is float16 */
XTensor XTensor::Float16()
{
return ConvertDataType(*this, X_FLOAT16);
}
/* return a tensor that datatype is double */
XTensor XTensor::Double()
{
return ConvertDataType(*this, X_DOUBLE);
}
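A brief usage sketch of the new conversion helpers; tensor construction and cleanup are illustrative:
XTensor * a = NewTensor2D(2, 3, X_FLOAT);
a->SetZeroAll();
XTensor ai = a->Int();        /* X_INT copy     */
XTensor ah = a->Float16();    /* X_FLOAT16 copy */
XTensor ad = a->Double();     /* X_DOUBLE copy  */
delete a;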
/* get the number of items in the data array */
int XTensor::GetSize() const
{
......@@ -729,7 +748,7 @@ get offset (3D)
*/
MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
{
CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 2!");
CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
......@@ -741,7 +760,7 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
a vector with all entries of 0
>> stream - stream for the job pipeline
*/
void XTensor::SetZeroAll(XStream * stream)
void XTensor::SetZeroAll(XStream* stream)
{
if(data == NULL)
return;
......@@ -793,9 +812,9 @@ void XTensor::SetZeroAll(XStream * stream)
>> num - number of data items
>> beg - where we start the data copy in the data array of the tensor
*/
void XTensor::SetData(const void * d, int num, int beg)
void XTensor::SetData(const void* d, int num, int beg)
{
if (data == NULL || d ==NULL)
if(data == NULL || d == NULL)
return;
CheckNTErrors(!isSparse, "TODO");
......@@ -856,7 +875,7 @@ set tensor items with an array of offsets
>> value - value for the data items
>> num - number of the data items
*/
void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)
void XTensor::SetDataBatched(MTYPE* offsets, DTYPE value, int num)
{
_SetDataWithOffset(this, offsets, value, num);
}
......@@ -867,7 +886,7 @@ set tensor items with an array of values
>> values - value for each data item
>> num - number of the data items
*/
void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
void XTensor::SetDataBatchedWithValues(MTYPE* offsets, void* values, int num)
{
_SetDataWithOffsetAndValue(this, offsets, values, num);
}
......@@ -903,7 +922,7 @@ DTYPE XTensor::Get(int offset) const
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset).");
DTYPE * address = (DTYPE*)data + offset;
DTYPE* address = (DTYPE*)data + offset;
return ToCPU(devID, address);
}
......@@ -914,7 +933,7 @@ get the pointer to a cell
>> size - size of index
<< return - pointer to the cell
*/
void * XTensor::GetCell(int index[], int size) const
void* XTensor::GetCell(int index[], int size) const
{
CheckNTErrors((size == order), "Illegal index!");
......@@ -926,7 +945,7 @@ void * XTensor::GetCell(int index[], int size) const
if(isSparse){
DTYPE value;
void * p;
void* p;
if(BinarySearch(offset, value, p))
return (char*)p + sizeof(int);
else
......@@ -938,18 +957,33 @@ void * XTensor::GetCell(int index[], int size) const
}
/*
get the value of a cell in a 0d tensor in default type
<< return - the value of the cell in float
*/
DTYPE XTensor::Get0D() const
{
CheckNTErrors((order == 0), "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[1] = {0};
void* value = GetCell(dims, 0);
return ToCPU(devID, value);
}
/*
get the value of a cell in a 1d tensor in default type
>> i - index
<< return - value of cell(i) in float
*/
DTYPE XTensor::Get1D(int i) const
{
CheckNTErrors((order == 1), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dimSize[1] = {i};
void * value = GetCell(dimSize, 1);
int dims[1] = {i};
void* value = GetCell(dims, 1);
return ToCPU(devID, value);
}
......@@ -968,7 +1002,7 @@ DTYPE XTensor::Get2D(int ni, int mi) const
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[2] = {ni, mi};
void * value = GetCell(dims, 2);
void* value = GetCell(dims, 2);
return ToCPU(devID, value);
}
......@@ -981,14 +1015,14 @@ get the value of a cell in a 3d tensor
*/
DTYPE XTensor::Get3D(int d0, int d1, int d2) const
{
CheckNTErrors((order == 3), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((order == 3), "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors((d0 >= 0 && d0 < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((d1 >= 0 && d1 < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 2 is out of range!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[3] = {d0, d1, d2};
void * value = GetCell(dims, 3);
void* value = GetCell(dims, 3);
return ToCPU(devID, value);
}
......@@ -1004,24 +1038,39 @@ int XTensor::GetInt(int offset) const
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset).");
int * address = (int*)data + offset;
int* address = (int*)data + offset;
return ToCPUInt(devID, address);
}
/*
get the value of a cell in a 0d tensor in int type
<< return - the value of the cell in int
*/
int XTensor::Get0DInt() const
{
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == X_INT, "The tensor is not in int type.");
int dims[1] = {0};
void* value = GetCell(dims, 0);
return ToCPUInt(devID, value);
}
/*
get the value of a cell in a 1d tensor in int type
>> i - index
<< return - value of cell(i) in int
*/
int XTensor::Get1DInt(int i) const
{
CheckNTErrors(order == 1, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(order == 1, "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in int type.");
int dimSize[1] = {i};
void * value = GetCell(dimSize, 1);
int dims[1] = {i};
void* value = GetCell(dims, 1);
return ToCPUInt(devID, value);
}
......@@ -1032,7 +1081,7 @@ get the value of a cell in a 2d tensor in int type
>> mi - column index
<< return - value of cell(ni, mi) in int
*/
int XTensor::Get2DInt(int ni, int mi) const
int XTensor::Get2DInt(int ni, int mi) const
{
CheckNTErrors(order == 2, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(ni >= 0 && ni < dimSize[0], "dimension 0 is out of range!");
......@@ -1040,7 +1089,7 @@ get the value of a cell in a 2d tensor in int type
CheckNTErrors(dataType == X_INT, "The tensor is not in default type.");
int dims[2] = {ni, mi};
void * value = GetCell(dims, 2);
void* value = GetCell(dims, 2);
return ToCPUInt(devID, value);
}
......@@ -1054,14 +1103,14 @@ get the value of a cell in a 3d tensor in int type
*/
int XTensor::Get3DInt(int d0, int d1, int d2) const
{
CheckNTErrors(order == 3, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in default type.");
int dims[3] = {d0, d1, d2};
void * value = GetCell(dims, 3);
void* value = GetCell(dims, 3);
return ToCPUInt(devID, value);
}
......@@ -1076,8 +1125,8 @@ DTYPE XTensor::GetInSparse(int i) const
CheckNTErrors(i >= 0 && i < unitNum, "Index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
char * d = (char*)data + sizeof(int);
DTYPE * value = (DTYPE*)(d + (sizeof(int) + sizeof(DTYPE)) * i + sizeof(int));
char* d = (char*)data + sizeof(int);
DTYPE* value = (DTYPE*)(d + (sizeof(int) + sizeof(DTYPE)) * i + sizeof(int));
return ToCPU(devID, value);
}
......@@ -1092,8 +1141,8 @@ int XTensor::GetKeyInSparse(int i) const
CheckNTErrors(i >= 0 && i < unitNum, "Index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
char * d = (char*)data + sizeof(int);
int * key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i);
char* d = (char*)data + sizeof(int);
int* key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i);
return ToCPUInt(devID, key);
}
......@@ -1121,12 +1170,27 @@ bool XTensor::Set(DTYPE value, int offset)
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
DTYPE * d = (DTYPE*)data + offset;
DTYPE* d = (DTYPE*)data + offset;
return SetToDevice(devID, d, value);
}
/*
set the value of a cell in a 0d tensor
>> value - value we tend to set
<< return - succeeded or not
*/
bool XTensor::Set0D(DTYPE value)
{
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[1] = {0};
return SetToDevice(devID, GetCell(dims, 0), value);
}
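Paired with Get0D above, a scalar can now be written and read directly; a minimal sketch:
XTensor s;
InitTensor0D(&s, X_FLOAT);
s.Set0D(3.14F);
DTYPE v = s.Get0D();    /* v == 3.14F */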
/*
set the value of a cell in a 1d tensor
>> value - value we tend to set
>> i - item offset
......@@ -1134,7 +1198,7 @@ set the value of a cell in a 1d tensor
*/
bool XTensor::Set1D(DTYPE value, int i)
{
CheckNTErrors(order == 1, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(order == 1, "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
......@@ -1172,7 +1236,7 @@ set the value of a cell in a 3d tensor in default type
*/
bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
{
CheckNTErrors(order == 3, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
......@@ -1193,7 +1257,7 @@ bool XTensor::SetInt(int value, int offset)
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
int * d = (int*)data + offset;
int* d = (int*)data + offset;
return SetToDeviceInt(devID, d, value);
}
......@@ -1216,12 +1280,27 @@ bool XTensor::SetInt(int value, int index[], int size)
/*
set the integer value of a cell in a 0d tensor
>> value - value we tend to set
<< return - succeeded or not
*/
bool XTensor::Set0DInt(int value)
{
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == X_INT, "The tensor is not in integer type.");
int dims[1] = {0};
return SetToDeviceInt(devID, GetCell(dims, 0), value);
}
/*
set the integer value of a cell in a 1d tensor
>> value - value we tend to set
>> i - item offset
<< return - succeeded or not
*/
bool XTensor::Set1DInt(int value, int i)
{
CheckNTErrors(order == 1, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(order == 1, "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in integer type.");
......@@ -1259,7 +1338,7 @@ set the integer value of a cell in a 3d tensor in default type
*/
bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
{
CheckNTErrors(order == 3, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
......@@ -1277,7 +1356,7 @@ increase the value of a cell in a 2d tensor
>> mi - column index
<< return - succeeded or not
*/
bool XTensor::Add2D(DTYPE value, int ni, int mi)
bool XTensor::Add2D(DTYPE value, int ni, int mi)
{
CheckNTErrors(ni >= 0 && ni < dimSize[0], "the row index is out of range!");
CheckNTErrors(mi >= 0 && mi < dimSize[1], "the column index is out of range!");
......@@ -1285,7 +1364,7 @@ increase the value of a cell in a 2d tensor
CheckNTErrors(isSparse == false, "TODO!");
if(devID < 0){
DTYPE * p = (DTYPE*)data + ni * dimSize[1] + mi;
DTYPE* p = (DTYPE*)data + ni * dimSize[1] + mi;
CheckNTErrors((p != NULL), "No data array is found!");
......@@ -1362,7 +1441,7 @@ resize a tensor with a specified tensor size
>> myDenseRatio - the proportion of elements that have non-zero values
<< return - succeeded or not
*/
bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool XTensor::Resize(const int myOrder, const int* myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{
/* free old mem */
......@@ -1426,11 +1505,11 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
*/
int num = int(unitNum * denseRatio + 1);
int tupleSize = sizeof(int)+sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize * (num);
if(filledData){
int * d = NULL;
int* d = NULL;
if(mem == NULL){
d = new int[size];
......@@ -1478,7 +1557,7 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
resize a tensor by another
>> myTensor - tensor for reference
*/
bool XTensor::Resize(const XTensor * myTensor)
bool XTensor::Resize(const XTensor* myTensor)
{
denseRatio = myTensor->denseRatio;
TENSOR_DATA_TYPE myDataType = myTensor->dataType;
......@@ -1499,12 +1578,12 @@ binary search to find an element in a sparse tensor
it is the previous one if there is no hit
<< return - found it or not?
*/
bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const
{
CheckNTErrors((isSparse), "A sparse tensor is required!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in the default type.");
int * d = (int*)data;
int* d = (int*)data;
if(key < 0 || *d == 0){
value = 0;
......@@ -1516,30 +1595,30 @@ bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
int high = *d - 1;
int last = -1;
bool ok = false;
int * k = NULL;
int* k = NULL;
int headSize = sizeof(int);
int tupleSize = sizeof(int)+sizeof(DTYPE);
char * p = (char*)data + headSize;
int tupleSize = sizeof(int) + sizeof(DTYPE);
char* p = (char*)data + headSize;
while (low <= high){
int mid = low + (high-low)/2;
int mid = low + (high - low)/2;
k = (int*)(p + tupleSize * mid);
if (*k == key){
if(*k == key){
ok = true;
high = mid -1;
high = mid - 1;
break;
}
else if(*k > key){
high = mid -1;
high = mid - 1;
}
else{
low = mid +1;
low = mid + 1;
last = mid;
}
}
if(ok){
DTYPE * p = (DTYPE*)((char*)k + sizeof(int));
DTYPE* p = (DTYPE*)((char*)k + sizeof(int));
value = *p;
position = k;
return true;
......@@ -1562,12 +1641,12 @@ dump data to a file
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, const int verbose)
{
if (verbose > verboseLevel)
return;
void * d = data;
void* d = data;
bool isNewData = false;
#ifdef USE_CUDA
......@@ -1585,7 +1664,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
num *= dimSize[i];
num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
int size = sizeof(int) + tupleSize * (num);
d = new char[size];
memset(d, 0, size);
......@@ -1602,6 +1681,9 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
if(isInit){
fprintf(file, "order=%d dimsize=", order);
if(order == 0) {
fprintf(file, "%d,", dimSize[0]);
}
for (int i = 0; i < order; i++) {
fprintf(file, "%d", dimSize[i]);
if (i < order - 1)
......@@ -1618,8 +1700,8 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, "NULL");
}
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
if (dataType == X_FLOAT) {
for(int i = beg; i < end; i++){
DTYPE f = ((DTYPE*)d)[i];
if(i == beg)
......@@ -1630,7 +1712,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
}
else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
int f = ((int*)d)[i];
if(i == beg)
......@@ -1639,6 +1720,16 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, " %d", f);
}
}
else if (dataType == X_FLOAT16) {
float16* f = (float16*)d;
for (int i = beg; i < end; i++) {
float v = f[i].Float();
if (i == beg)
fprintf(file, "%e", v);
else
fprintf(file, " %e", v);
}
}
else
ShowNTErrors("TODO!");
}
......@@ -1673,7 +1764,7 @@ dump data to a file
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
void XTensor::Dump(const XTensor* tensor, FILE* file, const char* label, const int n, const int beg, const int verbose)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
......@@ -1705,7 +1796,7 @@ read data from a file
>> file - where to load the data
>> label - label of the tensor
*/
void XTensor::Read(FILE * file, const char * label)
void XTensor::Read(FILE* file, const char* label)
{
char typeName[32] = "";
char dimSizeName[128] = "";
......@@ -1738,7 +1829,7 @@ void XTensor::Read(FILE * file, const char * label)
int o = 0;
bool sameSize = true;
char * p = dimSizeName;
char* p = dimSizeName;
while (*p != 0) {
while (*p == ' ' || *p == '\t')
p++;
......@@ -1762,14 +1853,14 @@ void XTensor::Read(FILE * file, const char * label)
if (!sameSize || dRatio > denseRatio || GetDataType(typeName) != dataType)
Resize(dimNum, dims, GetDataType(typeName), dRatio);
void * dataBuf = XMemAlloc(-1, GetDataSizeInChar());
void * dataBackup = data;
void* dataBuf = XMemAlloc(-1, GetDataSizeInChar());
void* dataBackup = data;
data = dataBuf;
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
for (int i = 0; i < unitNum; i++) {
DTYPE * f = ((DTYPE*)data) + i;
DTYPE* f = ((DTYPE*)data) + i;
if (fscanf(file, "%e", f) < 1) {
ShowNTErrors("Incorrect tensor format!");
}
......@@ -1822,13 +1913,13 @@ void XTensor::BinaryRead(FILE* file, size_t offset)
fseek(file, offset, 0);
switch (dataType) {
case X_INT: {
int * d = new int[unitNum];
int* d = new int[unitNum];
fread(d, sizeof(int), unitNum, file);
SetData(d, unitNum);
delete[] d;
}
break;
default: {
float * d = new float[unitNum];
float* d = new float[unitNum];
fread(d, sizeof(float), unitNum, file);
SetData(d, unitNum);
delete[] d;
......@@ -1840,7 +1931,7 @@ void XTensor::BinaryRead(FILE* file, size_t offset)
flush the data to the target device
>> targetMem - memory pool on the target device
*/
void XTensor::FlushToMem(XMem * targetMem)
void XTensor::FlushToMem(XMem* targetMem)
{
if (targetMem == NULL)
return;
......@@ -1853,8 +1944,9 @@ void XTensor::FlushToMem(XMem * targetMem)
CudaCPUToGPUFlush(&l, targetMem->devID, targetMem);
}
else if (mem != targetMem) {
void * tmpData = targetMem->Alloc(targetMem->devID, GetDataSizeInChar());
void* tmpData = targetMem->Alloc(targetMem->devID, GetDataSizeInChar());
XMemCopy(tmpData, targetMem->devID, data, devID, GetDataSizeInChar());
mem->Release(data, GetDataSizeInChar(), signature);
data = tmpData;
mem = targetMem;
devID = mem->devID;
......@@ -1866,13 +1958,22 @@ void XTensor::FlushToMem(XMem * targetMem)
else {
if (devID >= 0) {
#ifdef USE_CUDA
CudaGPUToCPUFlush(this);
mem = targetMem;
devID = mem->devID;
CudaGPUToCPUFlush(this, targetMem->devID, targetMem);
#else
ShowNTErrors("Recompile the code with USE_CUDA!");
#endif
}
else if (mem != targetMem) {
void* tmpData = targetMem->Alloc(targetMem->devID, GetDataSizeInChar());
XMemCopy(tmpData, targetMem->devID, data, devID, GetDataSizeInChar());
if (mem != NULL)
mem->Release(data, GetDataSizeInChar(), signature);
else
XMemFree(devID, data);
data = tmpData;
mem = targetMem;
devID = mem->devID;
}
}
}
......@@ -1882,7 +1983,7 @@ allocate the memory space of the tensor (in the global memory)
>> myMem - the memory pool we are using
>> useBuf - indicates whether we use the buffer in the memory pool
*/
void XTensor::AllocateData(XTensor * tensor, XMem * myMem, bool useBuf)
void XTensor::AllocateData(XTensor* tensor, XMem* myMem, bool useBuf)
{
if(tensor == NULL)
return;
......@@ -1914,7 +2015,7 @@ free the memory space of the tensor (in the global memory)
>> myMem - the memory pool we are using
>> useBuf - indicates whether we use the buffer in the memory pool
*/
void XTensor::FreeData(XTensor * tensor, XMem * myMem, bool useBuf)
void XTensor::FreeData(XTensor* tensor, XMem* myMem, bool useBuf)
{
if(tensor == NULL)
return;
......@@ -1934,27 +2035,27 @@ void XTensor::FreeData(XTensor * tensor, XMem * myMem, bool useBuf)
}
/* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift, const XTensor &tensor)
XTensor operator+ (const DTYPE shift, const XTensor& tensor)
{
return ScaleAndShift(tensor, 1, shift);
}
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift, const XTensor &tensor)
XTensor operator- (const DTYPE shift, const XTensor& tensor)
{
return ScaleAndShift(tensor, 1, -shift);
}
/* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale, const XTensor &tensor)
XTensor operator* (const DTYPE scale, const XTensor& tensor)
{
return ScaleAndShift(tensor, scale, 0);
}
/* overloading of the division-sign */
XTensor operator/ (const DTYPE scale, const XTensor &tensor)
XTensor operator/ (const DTYPE scale, const XTensor& tensor)
{
return ScaleAndShift(tensor, (DTYPE)1/scale, 0);
return ScaleAndShift(tensor, (DTYPE)1.0F / scale, 0);
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -29,7 +29,6 @@
#define __XTENSOR_H__
#include "XGlobal.h"
#include "XMem.h"
#include "XPRunner.h"
#include "XStream.h"
#include "XHeap.h"
......@@ -276,6 +275,18 @@ public:
/* return a tensor that datatype is same as the special tensor */
XTensor TypeAs(const XTensor input);
/* return a tensor that datatype is integer */
XTensor Int();
/* return a tensor that datatype is float */
XTensor Float();
/* return a tensor that datatype is float16 */
XTensor Float16();
/* return a tensor that datatype is double */
XTensor Double();
/* get the number of items in the data array */
int GetSize() const;
......@@ -331,6 +342,9 @@ public:
/* get the pointer to a cell */
void * GetCell(int index[], int size = -1) const;
/* get the default type value of a cell in a 0d tensor */
DTYPE Get0D() const;
/* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i) const;
......@@ -343,6 +357,9 @@ public:
/* get the int value of a cell by its offset */
int GetInt(int offset) const;
/* get the int value of a cell in a 0d tensor */
int Get0DInt() const;
/* get the int value of a cell in a 1d tensor */
int Get1DInt(int i) const;
......@@ -364,6 +381,9 @@ public:
/* set the value of a cell with its offset in the array */
bool Set(DTYPE value, int offset);
/* set the value of a cell in a 0d tensor */
bool Set0D(DTYPE value);
/* set the value of a cell in a 1d tensor */
bool Set1D(DTYPE value, int i);
......@@ -379,6 +399,9 @@ public:
/* set the integer value of a cell */
bool SetInt(int value, int index[], int size = -1);
/* set the integer value of a cell in a 0d tensor */
bool Set0DInt(int value);
/* set the integer value of a cell in a 1d tensor */
bool Set1DInt(int value, int i);
......
......@@ -37,7 +37,6 @@
#include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h"
......
......@@ -23,6 +23,8 @@
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "../math/ScaleAndShift.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
......@@ -127,7 +129,7 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a (where keep the result)
......@@ -144,7 +146,7 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a (where keep the result)
......@@ -152,45 +154,35 @@ where i is the index of the item
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
*/
void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
void DivMe(XTensor & a, const XTensor & b, DTYPE alpha, int leadingDim)
{
_Div(&a, &b, &a, alpha, leadingDim);
}
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D() + alpha;
/*
return a dimension if the division is performed as DivDim (in more details in DivDim.h)
>> a - a tensor
>> b - another tensor for division
*/
int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
_ScaleAndShift(&a, &a, scale, 0.0F);
}
else {
int n = GetBroadcastDimIndex(a, b);
if(hitCount == 1)
return hitDim;
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &a, alpha, leadingDim);
}
else if (n >= 0 && n < a.order)
/* call _DivDim function */
_DivDim(&a, &b, &a, n, alpha);
else
return -1;
ShowNTErrors("Something is wrong!");
}
}
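The scalar and broadcasting branches above rely on GetBroadcastDimIndex; its behavior, inferred from the GetDivDimIndex helper this commit removes (a reading of the code, not a specification):
/* a = (4, 5, 6), b = (1, 5, 1)  ->  returns 1: b broadcasts along dimension 1 of a
   a = (4, 5, 6), b = (4, 5, 6)  ->  returns -1: same shape, no single broadcasting dimension
   b.order == 0 is handled separately before the call, as in the branches above */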
/*
element-wise division of two tensors (return an XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)*b(i)
c(i) = a(i)/b(i)
where i is the index of the item
>> a - tensor a
......@@ -199,12 +191,18 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
*/
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
XTensor Div(const XTensor & a, const XTensor & b, int leadingDim)
{
XTensor c(&a);
c.SetTMPFlag();
int n = GetDivDimIndex(a, b);
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
ScaleAndShift(a, c, scale, 0.0F);
}
else {
DTYPE alpha = 0.0F;
int n = GetBroadcastDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
......@@ -215,8 +213,6 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if(n >= 0 && n < a.order){
......@@ -227,12 +223,12 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
return c;
}
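A small sketch of the new scalar branch in Div; the tensor setup is illustrative:
XTensor a, b;
InitTensor2D(&a, 2, 3);
a.SetZeroAll();
InitTensor0D(&b);
b.Set0D(4.0F);
XTensor c = Div(a, b);    /* every a(i) is scaled by 1/4 through ScaleAndShift */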
......@@ -249,25 +245,36 @@ where i is the index of the item
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetDivDimIndex(a, b);
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
_Div(&a, &b, &c, alpha, leadingDim);
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
......@@ -278,13 +285,12 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)/b(i)
where i is the index of the element
*/
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim = 0);
/*
element-wise division of two tensors:
......
......@@ -22,8 +22,8 @@
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -39,7 +39,7 @@ c = a - b * \beta
>> alpha - value
*/
__global__
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......
......@@ -159,6 +159,10 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"The code must be run on the same GPU!");
int devIDBackup;
if (beta == 0)
c->SetZeroAll();
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
......
......@@ -156,6 +156,9 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType,
......
......@@ -54,6 +54,9 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int aColNum = am;
int bColNum = bm;
if (beta == 0)
c->SetZeroAll();
/* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm,
......
......@@ -118,6 +118,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSize[i];
}
if (beta == 0)
c->SetZeroAll();
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
......
......@@ -27,36 +27,6 @@
#include "Sum.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
*/
int GetSumIndex(const XTensor &a, const XTensor &b)
{
if (a.order < b.order)
return -1;
if (IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
for (int i = 0; i < b.order; i++) {
if (b.dimSize[b.order - 1 - i] == 1)
continue;
else if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]) {
hitCount++;
hitDim = a.order - b.order + i;
}
}
if (hitCount == 1)
return hitDim;
else
return -1;
}
/*
operation c = x * w + b MulAndShift
>> x - tensor x
......@@ -99,7 +69,10 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
XTensor c(tmp);
c.SetTMPFlag();
int n = GetSumIndex(tmp, b);
if (b.order == 0)
ScaleAndShift(*tmp, c, 1.0F, b.Get0D());
else {
int n = GetBroadcastDimIndex(tmp, b);
if (n == -1) {
/* call _Sum function */
......@@ -107,23 +80,22 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
// TODO!!
ShowNTErrors("TODO!");
}
else if (n >= 0 && n < tmp->order) {
/* call _SumDim function */
_SumDim(tmp, &b, &c, n);
}
else {
ShowNTErrors("Something is wrong!");
}
/* tensor connections */
if (w.enableGrad && b.enableGrad) {
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
}
}
/* destroy variables */
......@@ -174,7 +146,7 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
XTensor c(tmp);
c.SetTMPFlag();
int n = GetSumIndex(tmp, b);
int n = GetBroadcastDimIndex(tmp, b);
if (n == -1) {
/* call _Sum function */
......
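MulAndShift gains the same treatment for a 0d bias; a hedged sketch (the shapes of x and w and the trailing default arguments of MulAndShift are assumptions):
/* y = x * w + 2, with the bias stored as a scalar tensor */
XTensor y = MulAndShift(x, w, bias);    /* bias.order == 0, bias.Get0D() == 2.0F */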
......@@ -23,6 +23,8 @@
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "../math/ScaleAndShift.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
......@@ -155,36 +157,28 @@ where i is the index of the item
*/
void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{
_Multiply(&a, &b, &a, alpha, leadingDim);
}
if (b.order == 0){
DTYPE scale = b.Get0D() + alpha;
/*
return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h)
>> a - a tensor
>> b - another tensor for multiplication
*/
int GetMultiplyDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
_ScaleAndShift(&a, &a, scale, 0.0F);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */
_Multiply(&a, &b, &a, alpha, leadingDim);
}
else if (n >= 0 && n < a.order) {
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &a, n, alpha);
}
else {
ShowNTErrors("Something is wrong!");
}
}
}
/*
......@@ -199,25 +193,28 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
*/
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
{
XTensor c(&a);
c.SetTMPFlag();
int n = GetMultiplyDimIndex(a, b);
if (b.order == 0){
DTYPE scale = b.Get0D();
ScaleAndShift(a, c, scale, 0.0F);
}
else {
DTYPE alpha = 0.0F;
int n = GetBroadcastDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
_Multiply(&a, &b, &c, alpha, leadingDim);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if(n >= 0 && n < a.order){
......@@ -228,12 +225,12 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
return c;
}
......@@ -256,19 +253,30 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
InitTensorV2(&c, &a);
}
int n = GetMultiplyDimIndex(a, b);
if (b.order == 0){
DTYPE scale = b.Get0D();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
_Multiply(&a, &b, &c, alpha, leadingDim);
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
......@@ -279,13 +287,12 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -48,7 +48,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)*b(i)
where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
/*
element-wise product of two tensors:
......
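A minimal usage sketch (not from this commit) of the updated Multiply interface, which drops the alpha argument. Shapes, the CPU device ID and the zero-filled data are illustrative only, and the NiuTrans.Tensor headers are assumed to be included.
void MultiplyUsageSketch()
{
    int dimsA[2] = {2, 3};
    int dimsB[1] = {3};
    int dims0[1] = {1};                                   /* unused for an order-0 tensor */
    XTensor a(2, dimsA, DEFAULT_DTYPE, 1.0F, -1, NULL);   /* 2 x 3 tensor on the CPU */
    XTensor b(1, dimsB, DEFAULT_DTYPE, 1.0F, -1, NULL);   /* length-3 vector */
    XTensor s(0, dims0, DEFAULT_DTYPE, 1.0F, -1, NULL);   /* 0-order scalar */
    a.SetZeroAll(); b.SetZeroAll(); s.SetZeroAll();
    XTensor c1 = Multiply(a, a);        /* same shape: element-wise product */
    XTensor c2 = Multiply(a, b);        /* b broadcasts along dim 1 (the MultiplyDim path) */
    XTensor c3 = Multiply(a, s);        /* 0-order b: reduces to ScaleAndShift(a, c3, s.Get0D(), 0) */
}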
......@@ -233,7 +233,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
{
CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!");
CheckNTErrors(a->order >= 0, "TODO!");
int order = a->order;
int count = 0;
......
......@@ -16,16 +16,16 @@
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
* $Created by: Li Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-02-11
* Paper review rebuttal of ACL2020 will start this Thursday. So nervous :(
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "SumDim.h"
#include "../math/ScaleAndShift.h"
#include "Sub.h"
#include "Sub.cuh"
#include "SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -39,80 +39,7 @@ tensor subtraction c = a - b * \beta
*/
void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
CheckDev(a->devID, b->devID);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, b->devID);
#endif
if ((a->devID < 0 && b->devID >= 0) ||
(a->devID >= 0 && b->devID < 0) ||
(a->devID >= 0 && b->devID >= 0 && a->devID != b->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaSub(a, b, c, beta);
}
else
_CudaSub(a, b, c, beta);
#endif
}
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
DTYPE * ap = (DTYPE*)a->data;
DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] - bp[i] * beta;
cp[i + 1] = ap[i + 1] - bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] - bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] - bp[i + 3] * beta;
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] - bp[i] * beta;
cp[i + 1] = ap[i + 1] - bp[i + 1] * beta;
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] - bp[i] * beta;
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
_Sum(a, b, c, -beta);
}
/*
......@@ -136,38 +63,24 @@ keep the result in the tensor a and return nothing
>> b - another tensor
>> beta - the scaling factor
*/
void SubMe(XTensor& a, const XTensor& b, DTYPE beta)
{
_Sub(&a, &b, &a, beta);
}
/*
return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
>> a - a tensor
>> b - another tensor for subtraction
*/
int GetSubDimIndex(const XTensor &a, const XTensor &b)
void SubMe(XTensor & a, const XTensor & b, DTYPE beta)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
if (b.order == 0){
DTYPE shift = -(b.Get0D() * beta);
_ScaleAndShift(&a, &a, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if(hitCount == 1)
return hitDim;
if (n == -1)
/* call _Sub function */
_Sub(&a, &b, &a, beta);
else if (n >= 0 && n < a.order)
/* call _SumDim function to do the SubDim operation */
_SumDim(&a, &b, &a, n, -beta);
else
return -1;
ShowNTErrors("Something is wrong!");
}
}
/*
......@@ -179,12 +92,17 @@ make a new tensor c to keep the result and return it
>> beta - the scaling factor
<< return - the result of tensor subtraction
*/
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
XTensor Sub(const XTensor & a, const XTensor & b, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
int n = GetSubDimIndex(a, b);
if (b.order == 0){
DTYPE shift = -(b.Get0D() * beta);
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if(n == -1){
/* call _Sub function */
......@@ -197,8 +115,8 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
}
}
else if(n >= 0 && n < a.order){
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
/* call _SumDim function to do the SubDim operation */
_SumDim(&a, &b, &c, n, -beta);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
......@@ -210,7 +128,7 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
else{
ShowNTErrors("Something is wrong!");
}
}
return c;
}
......@@ -222,13 +140,18 @@ tensor subtraction c = a - b * \beta
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> beta - the scaling factor
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
void Sub(const XTensor & a, const XTensor & b, XTensor & c, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetSubDimIndex(a, b);
if (b.order == 0){
DTYPE shift = -(b.Get0D() * beta);
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) {
/* call _Sub function */
......@@ -241,8 +164,8 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
}
}
else if (n >= 0 && n < a.order) {
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
/* call _SumDim function to do the SubDim operation */
_SumDim(&a, &b, &c, n, -beta);
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
......@@ -254,6 +177,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
else {
ShowNTErrors("Something is wrong!");
}
}
}
} // namespace nts(NiuTrans.Tensor)
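A minimal usage sketch (not from this commit) of the rewritten Sub/SubMe dispatch. Shapes and scaling factors are illustrative, and the tensors are only zero-filled because the data values do not matter for the call pattern.
void SubDispatchSketch()
{
    int dimsA[2] = {2, 3};
    int dimsB[1] = {3};
    int dims0[1] = {1};                                   /* unused for an order-0 tensor */
    XTensor a(2, dimsA, DEFAULT_DTYPE, 1.0F, -1, NULL);   /* 2 x 3 tensor on the CPU */
    XTensor b(1, dimsB, DEFAULT_DTYPE, 1.0F, -1, NULL);   /* length-3 vector */
    XTensor s(0, dims0, DEFAULT_DTYPE, 1.0F, -1, NULL);   /* 0-order scalar */
    a.SetZeroAll(); b.SetZeroAll(); s.SetZeroAll();
    XTensor c1 = Sub(a, b, 1.0F);  /* b broadcasts along dim 1; internally _SumDim(&a, &b, &c1, 1, -1.0F) */
    XTensor c2 = Sub(a, s, 2.0F);  /* scalar path; internally ScaleAndShift(a, c2, 1.0F, -(s.Get0D() * 2.0F)) */
    SubMe(a, b, 1.0F);             /* the in-place variant takes the same two branches */
}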
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
subtraction of data arrays (CUDA Kernel)
c = a - b * \beta
>> a - A matrix
>> b - another matrix
>> c - where we put a-b
>> size - the size of a/b/c
>> beta - the coefficient
*/
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] - b[i] * beta;
}
/*
tensor subtraction c = a - b * \beta (cuda version)
>> a - a tensor
>> b - another tensor
>> c - where we put a-b*\beta.
>> beta - the scaling factor
*/
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in addition!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Unmatched tensors in addition!");
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The tensors must be on the same!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in subtraction!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
/* subtraction over arrays
tensor subtraction c = a - b * \beta (cuda version) with an input handle
>> devID - device ID (MUST >= 0)
>> handle - cuda handle
>> a - an array
>> b - another array
>> c - where we put a-b
>> size - size of the array
>> beta - the coefficient
*/
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
if (size == 0)
return;
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty arrays in subtraction!");
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if (c == a) {
#ifdef DOUBELPRICSION
cublasDaxpy(*handle, size, &beta, b, 1, a, 1);
#else
cublasSaxpy(*handle, size, &beta, b, 1, a, 1);
#endif
}
else {
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(devID, size, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB<<<blocks, threads>>>((DTYPE*)a, (DTYPE*)b, (DTYPE*)c, size, beta);
}
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#ifndef __SUB_CUH__
#define __SUB_CUH__
#include "Sub.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* subtraction of data arrays (CUDA Kernel) */
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) */
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) with an input handle */
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SUB_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include <math.h>
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if (beta == 0) {
_CopyValues(a, c);
return;
}
if (_IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSubDim(a, b, c, n, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE) {
int num = a->unitNum;
if (stride > 1) {
for (int i = 0, j = 0; i < num; i += stride, j++) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
DTYPE * cp = (DTYPE*)c->data + i;
for (int k = 0; k < stride; k++)
cp[k] = ap[k] - bv;
}
}
else if (stride == 1) {
DTYPE * bp = (DTYPE*)b->data;
for (int i = 0; i < num; i += blockSize) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if (beta == 1.0F) {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j];
}
else {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j] * beta;
}
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor subtraction (do it on site)
keep the result in the input tensor and return nothing
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
{
_SubDim(a, b, a, n, beta);
}
/*
tensor subtraction (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
<< return - the result tensor by tensor subtraction
*/
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
return c;
}
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
}
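A small usage sketch (not from this commit) of SubDim with row-wise broadcasting; the shapes are illustrative and the NiuTrans.Tensor headers are assumed to be included.
void SubDimSketch()
{
    int dimsA[2] = {2, 3};
    int dimsB[1] = {3};
    XTensor a(2, dimsA, DEFAULT_DTYPE, 1.0F, -1, NULL);
    XTensor b(1, dimsB, DEFAULT_DTYPE, 1.0F, -1, NULL);
    a.SetZeroAll(); b.SetZeroAll();
    /* subtract b from every row of a: c(i, j) = a(i, j) - b(j) * 0.5 */
    XTensor c = SubDim(a, b, 1, 0.5F);
    /* negative indices wrap around via MODX, so n = -1 also addresses the last dimension */
    XTensor d = SubDim(a, b, -1, 0.5F);
}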
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "SubDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor subtraction of a tensor and a row vector
c = a - b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
/*
tensor subtraction (cuda version)
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_CUH__
#define __SUBDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting (cuda version) */
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_H__
#define __SUBDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
......@@ -25,6 +25,7 @@
#include "../../XBLAS.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
#include "../math/ScaleAndShift.h"
#include "Sum.h"
#include "Sum.cuh"
#include "SumDim.h"
......@@ -93,7 +94,38 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
AXPY(a->unitNum, beta, bp, 1, cp, 1);
return;
}
#else
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] + bp[i] * beta;
}
}
#endif
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int * ap = (int*)a->data;
int * bp = (int*)b->data;
int * cp = (int*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
......@@ -149,38 +181,58 @@ keep the result in the tensor a and return nothing
>> b - another tensor
>> beta - the scaling factor
*/
void SumMe(XTensor& a, const XTensor& b, DTYPE beta)
void SumMe(XTensor & a, const XTensor & b, DTYPE beta)
{
if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
_ScaleAndShift(&a, &a, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1)
/* call _Sum function */
_Sum(&a, &b, &a, beta);
else if (n >= 0 && n < a.order)
/* call _SumDim function */
_SumDim(&a, &b, &a, n, beta);
else
ShowNTErrors("Something is wrong!");
}
}
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
return a dimension if the operation is performed by broadcasting (e.g., the SumDim function)
>> a - a tensor
>> b - another tensor for sum
>> b - another tensor for the operation
*/
int GetSumDimIndex(const XTensor &a, const XTensor &b)
int GetBroadcastDimIndex(const XTensor & a, const XTensor & b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
bool isHit = false;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
else {
if (isHit == true)
return -1;
else
isHit = true;
for (int j = 0; j < a.order; j++){
if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - j]){
hitDim = a.order - 1 - j;
break;
}
}
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
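A short sketch (not from this commit) of what GetBroadcastDimIndex returns for a few shape pairs; the shapes are illustrative and <cstdio> is pulled in only for the printout.
#include <cstdio>
void BroadcastDimSketch()
{
    int d23[2]  = {2, 3};
    int d234[3] = {2, 3, 4};
    int d3[1]   = {3};
    int d4[1]   = {4};
    XTensor a(2, d23,  DEFAULT_DTYPE, 1.0F, -1, NULL);
    XTensor t(3, d234, DEFAULT_DTYPE, 1.0F, -1, NULL);
    XTensor v3(1, d3,  DEFAULT_DTYPE, 1.0F, -1, NULL);
    XTensor v4(1, d4,  DEFAULT_DTYPE, 1.0F, -1, NULL);
    int n1 = GetBroadcastDimIndex(a, v3);   /* 1: the length-3 vector matches dim 1 of (2, 3) */
    int n2 = GetBroadcastDimIndex(t, v4);   /* 2: matches the last dimension of (2, 3, 4) */
    int n3 = GetBroadcastDimIndex(t, v3);   /* 1: matches dim 1 of (2, 3, 4) */
    int n4 = GetBroadcastDimIndex(a, a);    /* -1: same shape, no broadcasting needed */
    printf("%d %d %d %d\n", n1, n2, n3, n4);
}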
/*
......@@ -192,12 +244,17 @@ make a new tensor c to keep the result and return it
>> beta - the scaling factor
<< return - the result of tensor summation
*/
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
XTensor Sum(const XTensor & a, const XTensor & b, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
int n = GetSumDimIndex(a, b);
if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if(n == -1){
/* call _Sum function */
......@@ -223,7 +280,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
else{
ShowNTErrors("Something is wrong!");
}
}
return c;
}
......@@ -234,13 +291,18 @@ tensor summation c = a + b * \beta
>> b - another tensor
>> beta - the scaling factor
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
void Sum(const XTensor & a, const XTensor & b, XTensor & c, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetSumDimIndex(a, b);
if (b.order == 0){
DTYPE shift = b.Get0D() * beta;
ScaleAndShift(a, c, 1.0F, shift);
}
else {
int n = GetBroadcastDimIndex(a, b);
if (n == -1) {
/* call _Sum function */
......@@ -266,6 +328,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
else {
ShowNTErrors("Something is wrong!");
}
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -45,6 +45,15 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
c[i] = a[i] + b[i] * beta;
}
__global__
void KernelADD(int * a, int * b, int * c, int size, int beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * beta;
}
/*
tensor summation c = a + b * \beta (cuda version)
>> a - a tensor
......@@ -100,6 +109,17 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelADD << <blocks, threads >> >((int*)a->data, (int*)b->data, (int*)c->data, a->unitNum, (int)beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
......
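A minimal sketch (not from this commit) of summing X_INT tensors, which the new CPU branch and the int overload of KernelADD now support; the shapes are illustrative and beta is truncated to int on the CUDA path.
void IntSumSketch()
{
    int dims[2] = {2, 2};
    XTensor a(2, dims, X_INT, 1.0F, -1, NULL);
    XTensor b(2, dims, X_INT, 1.0F, -1, NULL);
    XTensor c(2, dims, X_INT, 1.0F, -1, NULL);
    a.SetZeroAll(); b.SetZeroAll();
    /* integer element-wise summation: c = a + b * 2 */
    _Sum(&a, &b, &c, 2.0F);
}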
......@@ -26,6 +26,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* return a dimension if the operation is performed by broadcasting (e.g., the SumDim function) */
int GetBroadcastDimIndex(const XTensor & a, const XTensor & b);
/* tensor summation c = a + b * \beta */
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......
......@@ -220,7 +220,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
{
CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!");
CheckNTErrors(a->order >= 0, "TODO!");
int order = a->order;
int count = 0;
......
......@@ -45,10 +45,10 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
#endif
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same shape!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
if (a->dataType == DEFAULT_DTYPE) {
DTYPE* d = (DTYPE*)a->data;
DTYPE* db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
......@@ -57,6 +57,21 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
else
db[i] = d[i];
}
}
else if (a->dataType == X_INT) {
int* d = (int*)a->data;
int* db = (int*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
}
}
else
ShowNTErrors("TODO!");
}
/*
......
......@@ -36,8 +36,9 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
template <class T>
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
void KernelClip(T * a, T * b, T lower, T upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -90,10 +91,16 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
KernelClip<DTYPE> << <blocks, threads >> >((DTYPE *)a->data, (DTYPE *)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip<int> << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
ShowNTErrors("TODO!");
}
else {
ShowNTErrors("TODO!");
......
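A minimal sketch (not from this commit) of clipping an X_INT tensor, which the new CPU branch and the templated KernelClip now cover; the bounds and shapes are illustrative.
void ClipIntSketch()
{
    int dims[2] = {2, 3};
    XTensor a(2, dims, X_INT, 1.0F, -1, NULL);
    XTensor b(2, dims, X_INT, 1.0F, -1, NULL);
    a.SetZeroAll();
    /* clamp every integer entry of a into [-5, 5]; the CUDA path dispatches to KernelClip<int> */
    _Clip(&a, &b, -5.0F, 5.0F);
}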
......@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
template <class T> __global__
void KernelClip(T * a, T * b, T lower, T upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__
......
......@@ -47,11 +47,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
return;
}
#endif
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
if (a->dataType == DEFAULT_DTYPE) {
/* sparse tensor */
if(a->isSparse){
if(a->isSparse) {
int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
......@@ -66,7 +64,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
}
}
/* dense tensor */
else{
else {
DTYPE * va = (DTYPE*)a->data;
DTYPE * vb = (DTYPE*)b->data;
for(int i = 0; i < b->unitNum; i++){
......@@ -75,6 +73,36 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
vb++;
}
}
}
else if (a->dataType == X_INT) {
/* sparse tensor */
if(a->isSparse) {
int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
char * db = (char*)b->data + sizeof(int);
char * fb = db + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
for(int i = 0; i < num; i++){
int * v = (int*)f;
int * vb = (int*)fb;
*vb = *v * scale + shift;
f += sizeof(int) + sizeof(int);
fb += sizeof(int) + sizeof(int);
}
}
/* dense tensor */
else {
int * va = (int*)a->data;
int * vb = (int*)b->data;
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
va++;
vb++;
}
}
}
else
ShowNTErrors("TODO!");
}
/*
......
......@@ -34,9 +34,9 @@ scale and shift all tensor entires b = a * scale + shift (CUDA Kernel)
>> scale - how much we want to scale it
>> shift - how much we want to shift it
*/
template<bool isUnitScale, bool isZeroShift>
template<class T, bool isUnitScale, bool isZeroShift>
__global__
void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -108,13 +108,26 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
if(a->dataType == DEFAULT_DTYPE){
if(scale == 1.0F && shift == 0)
KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if(scale != 1.0F && shift == 0)
KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else
KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
}
else if (a->dataType == X_INT) {
int scale2 = int(scale);
int shift2 = int(shift);
if (scale == 1.0F && shift == 0)
KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale != 1.0F && shift == 0)
KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else
KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
}
else if(a->dataType == X_FLOAT16){
unsigned short scale2 = FloatToFloat16(scale);
......
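A minimal sketch (not from this commit) of _ScaleAndShift on X_INT data, now handled both by the new CPU branch and by the templated CUDA kernel; on the GPU path scale and shift are truncated to int.
void ScaleAndShiftIntSketch()
{
    int dims[2] = {2, 3};
    XTensor a(2, dims, X_INT, 1.0F, -1, NULL);
    XTensor b(2, dims, X_INT, 1.0F, -1, NULL);
    a.SetZeroAll();
    /* b = a * 3 + 1 on integer data */
    _ScaleAndShift(&a, &b, 3.0F, 1.0F);
}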
......@@ -146,7 +146,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions");
}
else {
CheckNTErrors(t->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
CheckNTErrors(t->GetDim(i) >= indexSize * copyNum, "Unmatched dimensions");
}
}
......
......@@ -43,12 +43,43 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->dataType == X_INT), "The index tensor should be INT type!");
CheckNTErrors((srcIndex->order == s->order), "index's order should be the same as source's");
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim);
return;
}
#endif
int stride = 1;
int blockNum = 1;
for (int i = dim + 1; i < s->order; ++i)
{
stride *= s->GetDim(i);
}
for (int i = 0; i < dim; ++i)
{
blockNum *= s->GetDim(i);
}
int indexStrideNum = srcIndex->GetDim(dim);
int srcStrideNum = stride * s->GetDim(dim);
int tgtBlockSize = stride * indexStrideNum;
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndexData = (int*)srcIndex->data;
for (int blockIndex = 0; blockIndex < blockNum; ++blockIndex)
{
for (int i = 0; i < indexStrideNum; i++) {
for (int j = 0; j < stride; j++)
{
int sIndex = sIndexData[i * stride + blockIndex * indexStrideNum + j] * stride + blockIndex * srcStrideNum + j;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
int tIndex = i * stride + blockIndex * tgtBlockSize + j;
tData[tIndex] = sData[sIndex];
}
}
}
}
/*
......@@ -64,13 +95,14 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
if (s->devID >= 0) {
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex);
return;
}
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int indexSize = 1;
......@@ -83,9 +115,11 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j];
}
}
}
/*
......
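A minimal sketch (not from this commit) of the new dimension-wise _Gather on the CPU; the index tensor must be X_INT and have the same order as the source, and the shapes below are illustrative. The index values are written directly through the data pointer since no setter is shown in this patch.
void GatherDimSketch()
{
    int sDims[2]   = {4, 3};
    int idxDims[2] = {2, 3};
    XTensor s(2, sDims,   DEFAULT_DTYPE, 1.0F, -1, NULL);   /* source: 4 x 3 */
    XTensor t(2, idxDims, DEFAULT_DTYPE, 1.0F, -1, NULL);   /* target: 2 x 3 */
    XTensor idx(2, idxDims, X_INT, 1.0F, -1, NULL);         /* index has the same order as s */
    s.SetZeroAll();
    /* pick rows 0 and 2 of s, column by column (dim = 0) */
    int * ip = (int*)idx.data;
    for (int j = 0; j < 3; j++) {
        ip[j]     = 0;   /* first output row  <- row 0 of s */
        ip[3 + j] = 2;   /* second output row <- row 2 of s */
    }
    _Gather(&s, &t, &idx, 0);
}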
......@@ -77,7 +77,7 @@ gather indexed sub-tensors(cuda version)
>> blockNum - block size of data
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum, int srcStrideNum)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -90,7 +90,7 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int st
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]];
tData[i] = sData[sIndex[i] * stride + stride * srcStrideNum * blockIndex + offsetInBlock];
}
}
......@@ -126,14 +126,30 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
int * sIndex = NULL;
if (srcIndex->devID < 0) {
int * sIndexData = (int*)srcIndex->data;
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else
else {
int * sIndexData = new int[sizeof(int) * indexSize];
XMemCopy(sIndexData, -1, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = (int *)srcIndex->data;
delete[] sIndexData;
}
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
......@@ -163,6 +179,7 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int blockNum = 1;
int indexSize = srcIndex->unitNum;
int strideNum = srcIndex->dimSize[dim];
int srcStrideNum = s->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= srcIndex->dimSize[i];
for (int i = dim + 1; i < srcIndex->order; i++)
......@@ -170,19 +187,33 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int * sIndex = NULL;
if (srcIndex->devID < 0) {
int * sIndexData = (int*)srcIndex->data;
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else
else {
int * sIndexData = new int[sizeof(int) * indexSize];
XMemCopy(sIndexData, -1, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
for (int i = 0; i < indexSize; i++) {
int srcIndexValue = sIndexData[i] * stride;
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = (int *)srcIndex->data;
delete[] sIndexData;
}
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum, srcStrideNum);
}
#endif // USE_CUDA
......
......@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength); \
} \
for (int j = 1; j < strideNum / 32; j++) { \
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); \
const DTYPE* ptr = (DTYPE*)(ip + j * 4 * vecBufLength); \
vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
......@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
else { \
/* data is separated */ \
for(int i = 0; i < blockNum; i++){ \
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ \
for(int j = 0; j < stride / 32; j++){ \
DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
DTYPE * op = (DTYPE*)output->data + stride * i; \
VectorBuffer vecBuf[4]; \
......
......@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
int num = input->dimSize[dim];
_ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
_ScaleAndShiftMe(output, 1.0F/(DTYPE)(num), 0);
}
/*
......
......@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias);
}
for(int j = 1; j < strideNum / 32; j++){
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
const DTYPE* ptr = (DTYPE*)(ip + (j * 4) * vecBufLength);
vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias);
vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias);
......@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
for(int j = 0; j < stride / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
}
VectorBuffer vecBuf[4];
for(int k = 0; k < 4; k++){
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + j * 32 / sizeof(DTYPE));
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + k * 32 / sizeof(DTYPE));
}
for(int k = 1; k < strideNum; k++){
DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
......
......@@ -16,11 +16,12 @@
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27
* $Created by: Li Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-01-09
*/
#include "ReduceSumAll.h"
#include "ReduceSum.h"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -42,55 +43,70 @@ int * getDimSize(const XTensor * tensor, int n)
/*
sum all the items of the tensor (It should be optimized!)
>> source - the input tensor
<< return - the total summation
<< target - the total summation
*/
DTYPE _ReduceSumAll(const XTensor * source)
void _ReduceSumAll(const XTensor * source, XTensor * target)
{
int dims[2] = {1, source->unitNum};
int one = 1;
CheckNTErrors((source->devID == target->devID || (source->devID < 0 && target->devID < 0)),
"This code must be run on the same device!");
CheckNTErrors((source && target), "Empty input or output tensors!");
CheckNTErrors((target->order == 0), "Incorrect target tensor sizes!");
CheckNTErrors((target->unitNum == 1), "Illegal dimension to reduce!");
CheckNTErrors((source->dataType == target->dataType), "Unmatched data types!");
XTensor * all = NewTensorBufV2(2, dims, source->dataType, source->denseRatio, source->devID, source->mem);
XTensor * result = NewTensorBufV2(1, &one, source->dataType, 1.0F, source->devID, source->mem);
int dims[1] = {source->unitNum};
_CopyValues(source, all);
_ReduceSum(all, result, 1);
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
DTYPE r = result->Get1D(0);
_CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(result);
DelTensorBuf(all);
}
return r;
/*int order = source->order;
DTYPE summation;
XTensor * big = NewTensor(source);
_CopyValues(source, big);
for(int i = order - 1; i >= 0; i--) {
if(i == 0)
big->Reshape(1, big->unitNum);
/*
sum all the items of the tensor (It should be optimized!)
>> source - the input tensor
<< value - the total summation
*/
void _ReduceSumAll(const XTensor * source, DTYPE * value)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source->isSparse) ? 1.0F : source->denseRatio;
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
int leadingDim = big->order - 1;
int * dimSize;
dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensorV2(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
/* call _ReduceSum function */
_ReduceSumAll(source, target);
*value = target->Get0D();
_ReduceSum(big, little, leadingDim);
DelTensorBuf(target);
}
delete big;
delete dimSize;
/*
sum all the items of the tensor
>> source - the input tensor
<< return - the total summation
*/
XTensor ReduceSumAll(const XTensor & source)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source.isSparse) ? 1.0F : source.denseRatio;
XTensor target(0, dimSize, source.dataType, dr, source.devID, source.mem);
target.SetTMPFlag();
big = NewTensor(little);
_CopyValues(little, big);
/* call _ReduceSum function */
_ReduceSumAll(&source, &target);
delete little;
/* tensor connection */
if (source.enableGrad) {
XLink::MakeLink(&source, NULL, &target, REDUCE_REDUCESUMALL);
}
summation = big->Get1D(0);
delete big;
return summation;*/
/* destroy variables */
delete[] dimSize;
return target;
}
/*
......@@ -98,9 +114,11 @@ sum all the items of the tensor
>> source - the input tensor
<< return - the total summation
*/
DTYPE ReduceSumAll(const XTensor & source)
DTYPE ReduceSumAllValue(const XTensor & source)
{
return _ReduceSumAll(&source);
XTensor target;
target = ReduceSumAll(source);
return target.Get0D();
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -16,7 +16,7 @@
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27
* $Created by: Li Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2020-01-09
*/
......@@ -28,10 +28,16 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */
DTYPE _ReduceSumAll(const XTensor * source);
void _ReduceSumAll(const XTensor * source, XTensor * target);
/* sum all the items of the tensor */
DTYPE ReduceSumAll(const XTensor & source);
void _ReduceSumAll(const XTensor * source, DTYPE * target);
/* sum all the items of the tensor */
XTensor ReduceSumAll(const XTensor & source);
/* sum all the items of the tensor */
DTYPE ReduceSumAllValue(const XTensor & source);
} // namespace nts(NiuTrans.Tensor)
......
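A minimal sketch (not from this commit) of the reworked interface: ReduceSumAll now returns a 0-order XTensor that stays in the computation graph, while ReduceSumAllValue returns the plain scalar; <cstdio> is pulled in only for the printout.
#include <cstdio>
void ReduceSumAllSketch()
{
    int dims[2] = {2, 3};
    XTensor a(2, dims, DEFAULT_DTYPE, 1.0F, -1, NULL);
    a.SetZeroAll();
    XTensor total = ReduceSumAll(a);        /* 0-order tensor; linked into the graph when a.enableGrad */
    DTYPE value   = ReduceSumAllValue(a);   /* plain scalar, e.g. for logging a batch loss */
    printf("sum = %f (tensor holds %f)\n", (float)value, (float)total.Get0D());
}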
......@@ -95,7 +95,7 @@ XTensor Stack(const TensorList &smalls, int dim)
if (i < dim)
dimSize[i] = tensor->GetDim(i);
else if (i > dim)
dimSize[i] = tensor->GetDim(i-1);
dimSize[i] = tensor->GetDim(i - 1);
else if (i == dim)
dimSize[i] = count;
}
......@@ -160,7 +160,7 @@ void Stack(const TensorList &smalls, XTensor &t, int dim)
if (i < dim)
dimSize[i] = tensor->GetDim(i);
else if (i > dim)
dimSize[i] = tensor->GetDim(i-1);
dimSize[i] = tensor->GetDim(i - 1);
else if (i == dim)
dimSize[i] = count;
}
......
......@@ -31,7 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _Stack(const TensorList * smalls, XTensor * t, int dim);
/* stack small tensors into a big tensor along with a dimension (return an XTensor structure) */
XTensor Stack(const TensorList &list, int leadingDim);
XTensor Stack(const TensorList &list, int dim);
/* stack small tensors into a big tensor along with a dimension */
void Stack(const TensorList &smalls, XTensor &t, int dim);
......
......@@ -35,8 +35,9 @@ get the top-k items along a given dimension
>> index - index of the top-k items
>> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting
>> isSorted - indicates whether the k items are sorted
*/
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted)
{
dim = MODX(dim, a->order);
......@@ -58,7 +59,7 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
if (a->devID >= 0 || b->devID >= 0) {
#ifdef USE_CUDA
_CudaTopK(a, b, index, dim, k);
_CudaTopK(a, b, index, dim, k, isSorted);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......@@ -116,15 +117,16 @@ get the top-k items along a given dimension
>> index - index of the top-k items
>> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting
>> isSorted - indicates whether the k items are sorted
*/
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k)
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k, bool isSorted)
{
dim = MODX(dim, a.order);
if(a.dimSize[dim] <= k)
_Sort(&a, &b, &index, dim);
else
_TopK(&a, &b, &index, dim, k);
_TopK(&a, &b, &index, dim, k, isSorted);
/* tensor connection */
//TensorList list(2);
......
......@@ -374,9 +374,10 @@ get the top-k items
>> minValue - min value of an item
>> output - the output data array
>> index - the output index array
>> isSorted - indicates whether the k items are sorted
*/
template<class T> __global__
void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int * index)
void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int * index, bool isSorted)
{
__shared__ CudaHeapNode<T> heapData[(SHARED_MEMORY_SIZE - 512 * sizeof(T)) / sizeof(CudaHeapNode<T>)];
__shared__ T eachHeapMaxValue[512];
......@@ -479,11 +480,24 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
int offset = stride * k * blockIndex + offsetInBlock;
T * dOutput = output + offset;
int * indexOutput = index + offset;
for (int q = 0; q < k; ++q){
if (isSorted)
{
for (int q = k - 1; q >= 0; q--) {
dOutput[stride * q] = ansHeapData.items[0].value;
indexOutput[stride * q] = ansHeapData.items[0].index;
ansHeapData.items[0] = ansHeapData.items[ansHeapData.count - 1];
ansHeapData.count--;
ansHeapData.Down(0);
}
}
else
{
for (int q = 0; q < k; ++q) {
dOutput[stride * q] = ansHeapData.items[q].value;
indexOutput[stride * q] = ansHeapData.items[q].index;
}
}
}
}
......@@ -803,8 +817,9 @@ get the top-k items along a given dimension
>> index - index of the top-k items
>> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting
>> isSorted - indicates whether the k items are sorted
*/
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted)
{
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched input tensors!");
CheckNTErrors((a->order == b->order), "Unmatched input tensors!");
......@@ -846,7 +861,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
if (a->dataType == DEFAULT_DTYPE) {
KernelTopK3<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)a->data, stride, strideNumA, blockNum, k, DTYPE_MIN,
(DTYPE*)b->data, (int*)index->data);
(DTYPE*)b->data, (int*)index->data, isSorted);
}
else {
ShowNTErrors("TODO!");
......@@ -882,6 +897,10 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
KernelTopKRadixSelect<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> (goutput, stride, strideNumA, blockNum, k, DTYPE_MIN, (DTYPE *)b->data, (int *)index->data, stride * strideNumA * blockNum);
deconvert2floatV2 <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> ((unsigned int *)a->data, (float *)goutput, stride, strideNumA, blockNum, strideNumA*blockNum*stride);
if (isSorted)
{
ShowNTErrors("TODO!");
}
}
}
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* get the top-k items along a given dimension */
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted);
#endif // USE_CUDA
......
......@@ -27,10 +27,10 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the top-k items along a given dimension */
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k, bool isSorted = false);
/* get the top-k items along a given dimension */
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k);
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k, bool isSorted = false);
} // namespace nts(NiuTrans.Tensor)
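/*
A hedged usage sketch (not part of the commit) of the new isSorted flag in the
declarations above. Shapes, data, and the descending order of the sorted output
are illustrative assumptions based on the kernel change in this diff.
*/
void TopKSortedExample()
{
    int srcDimSize[1] = {4};
    int topDimSize[1] = {2};
    XTensor * a = NewTensorV2(1, srcDimSize);
    XTensor * b = NewTensorV2(1, topDimSize);
    XTensor * index = NewTensorV2(1, topDimSize, X_INT);
    DTYPE values[4] = {0.0F, 5.0F, 2.0F, 3.0F};
    a->SetData(values, 4);
    /* with isSorted = true the two largest items are expected in sorted order,
       e.g. {5, 3} with indices {1, 3}; with the default false the order is arbitrary */
    TopK(*a, *b, *index, 0, 2, true);
    delete a;
    delete b;
    delete index;
}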
......
......@@ -78,6 +78,8 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
if(m->mem == NULL)
delete[] (char*)m->data;
else
m->mem->Release(m->data, m->GetDataSizeInChar(), m->signature);
m->dataHost = NULL;
m->data = GPUData + p;
......@@ -94,7 +96,36 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
#endif
}
/* copy the data from GPU memory to CPU memory */
/* copy the data from GPU memory to CPU memory (memory pool) */
void CudaGPUToCPUFlush(XTensor * tensor, int devID, XMem * CPUMem)
{
#ifdef USE_CUDA
CheckNTErrors((tensor->devID >= 0), "Cannot do cpu-flush on matrices that are already on CPU.");
/* compute the required memory size */
int size = 0;
if (tensor->isSparse)
size = sizeof(int) + (sizeof(int) + tensor->unitSize) * tensor->unitNumNonZero;
else
size = tensor->unitSize * tensor->unitNum;
char * CPUData = CPUMem != NULL ? (char*)CPUMem->Alloc(CPUMem->devID, size):
(char*)XMemAlloc(devID, size);
/* copy from GPU memory to CPU memory */
cudaMemcpy(CPUData, tensor->data, size, cudaMemcpyDeviceToHost);
if (tensor->dataHost != NULL)
delete[](char*)tensor->dataHost;
tensor->dataHost = NULL;
tensor->mem->Release(tensor->data, tensor->GetDataSizeInChar(), tensor->signature);
tensor->data = CPUData;
tensor->devID = CPUMem != NULL ? CPUMem->devID : devID;
tensor->mem = CPUMem;
#endif
}
/* copy the data from GPU memory to CPU memory (dataHost) and do not delete the data */
void CudaGPUToCPUFlush(XTensor * tensor)
{
CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
......
......@@ -31,7 +31,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* flush a list of XTensor to GPU memory */
void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem);
/* copy the data from GPU memory to CPU memory */
/* copy the data from GPU memory to CPU memory (memory pool) */
void CudaGPUToCPUFlush(XTensor * tensor, int devID, XMem * CPUMem);
/* copy the data from GPU memory to CPU memory (dataHost) and do not delete the data */
void CudaGPUToCPUFlush(XTensor * tensor);
#endif // USE_CUDA
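/*
A hedged usage sketch (not from the commit): the one-argument overload only fills
tensor->dataHost with a host copy and keeps the GPU data, while the three-argument
overload re-homes the tensor itself and releases the GPU buffer through its memory
pool, so it assumes the tensor was allocated from a pool. The devID and shape
below are illustrative.
*/
#ifdef USE_CUDA
void GPUToCPUFlushExample()
{
    int dims[2] = {2, 4};
    XTensor * x = NewTensorV2(2, dims, X_FLOAT, 1.0F, 0);   /* resides on GPU 0 */
    CudaGPUToCPUFlush(x);   /* read the values on the host via x->dataHost */
    delete x;
}
#endif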
......
......@@ -293,7 +293,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
CheckNTErrors((gold != NULL), "The gold standard cannot be empty!");
CheckNTErrors((gold != NULL || lossName == NOLOSS), "The gold standard cannot be empty!");
if(leadDim < 0)
leadDim = y->order - 1;
......
......@@ -33,6 +33,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */
XTensor LogSoftmax(const XTensor &x, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both argument of x and y) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both argument of x and y) */
......
......@@ -358,21 +358,21 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossBuf);
_ReduceSumAll(lossBuf, &loss);
if(reduceWay == REDUCE_MEAN) {
int nonZeroNum;
DTYPE nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossBuf->unitNum;
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
}
loss = loss / (DTYPE)nonZeroNum;
loss = loss / nonZeroNum;
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
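/*
A minimal host-side sketch (illustrative names, not library code) of the
REDUCE_MEAN path above: the summed loss is divided by the number of non-padded
positions, which this commit now accumulates directly as a DTYPE via
_ReduceSumAll(tmp, &nonZeroNum).
*/
float MeanLossOnHost(const float * loss, const float * padding, int n)
{
    float sum = 0.0F;
    float nonZeroNum = 0.0F;
    for (int i = 0; i < n; i++) {
        sum += loss[i];                 /* _ReduceSumAll(lossBuf, &loss) */
        if (padding[i] != 0.0F)
            nonZeroNum += 1.0F;         /* _IsNonZero + _ReduceSumAll(tmp, &nonZeroNum) */
    }
    return sum / nonZeroNum;            /* loss = loss / nonZeroNum */
}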
......@@ -675,8 +675,9 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
DTYPE nonZeroNum;
_ReduceSumAll(tmp, &nonZeroNum);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/nonZeroNum);
delete tmp;
}
else {
......
......@@ -123,21 +123,21 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
_CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossBuf);
_ReduceSumAll(lossBuf, &loss);
if(reduceWay == REDUCE_MEAN) {
int nonZeroNum;
DTYPE nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossBuf->unitNum;
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
}
loss = loss / (DTYPE)nonZeroNum;
loss = loss / nonZeroNum;
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
......@@ -199,8 +199,9 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
DTYPE nonZeroNum;
_ReduceSumAll(tmp, &nonZeroNum);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/nonZeroNum);
delete tmp;
}
else {
......
......@@ -606,7 +606,7 @@ bool TestCopyIndexed4()
/*
case 5: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 1,
In this case, (3, 2, 3) -> (3, 2, 4), dim = 2, indexSize = 2,
srcIndex = [0, 1], tgtIndex = [0, 2], copyNum = 2.
*/
bool TestCopyIndexed5()
......@@ -622,7 +622,7 @@ bool TestCopyIndexed5()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output tensor of size (3, 2, 2) */
/* a output tensor of size (3, 2, 4) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
......@@ -749,6 +749,152 @@ bool TestCopyIndexed5()
#endif // USE_CUDA
}
/*
case 6: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 4), dim = 2, indexSize = 2,
srcIndex = [0, 2], tgtIndex = [3, 0], copyNum = 1.
*/
bool TestCopyIndexed6()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2, 4) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 4;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE tData[3][2][4] = { { {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F} },
{ {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F}},
{ {5.0F, 5.0F, 5.0F, 5.0F},
{5.0F, 5.0F, 5.0F, 5.0F} } };
DTYPE answer[3][2][4] = { { {2.0F, 5.0F, 5.0F, 0.0F},
{3.0F, 5.0F, 5.0F, 2.0F} },
{ {4.0F, 5.0F, 5.0F, 1.0F},
{2.0F, 5.0F, 5.0F, 3.0F}},
{ {2.0F, 5.0F, 5.0F, -1.0F},
{0.0F, 5.0F, 5.0F, 1.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {3, 0};
int copyNum = 1;
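/* with copyNum = 1, slice srcIndex[i] along dim 2 is copied to slice tgtIndex[i]:
   source slice 0 goes to target slice 3 and source slice 2 goes to target slice 0,
   which is exactly what the answer above encodes */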
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t1 = NewTensorV2(tOrder, tDimSize);
XTensor * t2 = NewTensorV2(tOrder, tDimSize);
XTensor * sIndex = NewTensorV2(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensorV2(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s->SetData(sData, sUnitNum);
t1->SetData(tData, tUnitNum);
t2->SetData(tData, tUnitNum);
sIndex->SetData(srcIndex, indexUnitNum);
tIndex->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t2, dim, sIndex, tIndex, copyNum);
/* check results */
cpuTest = _CheckData(t1, answer, tUnitNum) &&
_CheckData(t2, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensorV2(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensorV2(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensorV2(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensorV2(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetData(tData, tUnitNum);
tGPU2->SetData(tData, tUnitNum);
sIndexGPU->SetData(srcIndex, indexUnitNum);
tIndexGPU->SetData(tgtIndex, indexUnitNum);
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU1, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU2, dim, sIndexGPU, tIndexGPU, copyNum);
/* check results */
gpuTest = _CheckData(tGPU1, answer, tUnitNum) &&
_CheckData(tGPU2, answer, tUnitNum);
/* destroy variables */
delete s;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete sGPU;
delete tGPU1;
delete tGPU2;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t1;
delete t2;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -805,6 +951,15 @@ bool TestCopyIndexed()
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestCopyIndexed6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: matrix multiplication of the two tensors.
In this case, a=(2, 3), b=(2, 3) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMulBatched1()
{
......
......@@ -27,7 +27,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
In this case, (2, 2) * (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/
bool TestMultiply1()
{
......@@ -149,6 +149,131 @@ bool TestMultiply1()
#endif // USE_CUDA
}
/*
case 2: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 3, 4) * (2, 1, 1) -> (2, 3, 4), alpha=0.
*/
bool TestMultiply2()
{
/* a source tensor of size (2, 3, 4) */
int sOrder1 = 3;
int * sDimSize1 = new int[sOrder1];
sDimSize1[0] = 2;
sDimSize1[1] = 3;
sDimSize1[2] = 4;
int sUnitNum1 = 1;
for (int i = 0; i < sOrder1; i++)
sUnitNum1 *= sDimSize1[i];
/* a source tensor of size (2, 1, 1) */
int sOrder2 = 3;
int * sDimSize2 = new int[sOrder2];
sDimSize2[0] = 2;
sDimSize2[1] = 1;
sDimSize2[2] = 1;
int sUnitNum2 = 1;
for (int i = 0; i < sOrder2; i++)
sUnitNum2 *= sDimSize2[i];
/* a target tensor of size (2, 3, 4) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
tDimSize[1] = 3;
tDimSize[2] = 4;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3][4] = { { {0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F} },
{ {3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F} } };
DTYPE sData2[2][1][1] = { { {1.0F} },
{ {-1.0F} } };
DTYPE answer[2][3][4] = { { {0.0F, 1.0F, 2.0F, 3.0F},
{3.0F, 2.0F, 1.0F, 0.0F},
{0.0F, 1.0F, 2.0F, 3.0F} },
{ {-3.0F, -2.0F, -1.0F, 0.0F},
{0.0F, -1.0F, -2.0F, -3.0F},
{-3.0F, -2.0F, -1.0F, 0.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensorV2(sOrder1, sDimSize1);
XTensor * s2 = NewTensorV2(sOrder2, sDimSize2);
XTensor * tMe = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
tMe->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
/* call Multiply function */
MultiplyMe(*tMe, *s2, 0);
tUser = Multiply(*s1, *s2);
/* check results */
cpuTest = _CheckData(tMe, answer, tUnitNum, 1e-4F) &&
_CheckData(&tUser, answer, tUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU1 = NewTensorV2(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensorV2(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
/* call Multiply function */
MultiplyMe(*tMeGPU, *sGPU2, 0);
tUserGPU = Multiply(*sGPU1, *sGPU2);
/* check results */
gpuTest = _CheckData(tMeGPU, answer, tUnitNum, 1e-4F) &&
_CheckData(&tUserGPU, answer, tUnitNum, 1e-4F);
/* destroy variables */
delete s1;
delete s2;
delete tMe;
delete sGPU1;
delete sGPU2;
delete tMeGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete tMe;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -170,6 +295,16 @@ bool TestMultiply()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMultiply2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -141,6 +141,90 @@ bool TestReduceMax1()
#endif // USE_CUDA
}
/*
case 2: get the max value of the items along a dimension, reducing the tensor to a scalar.
In this case,
(4) -> scalar, dim = 0
*/
bool TestReduceMax2()
{
/* an input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 5.0F, 2.0F, 3.0F};
DTYPE answer[1] = {5.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceMax function */
_ReduceMax(s, t, 0);
tUser = ReduceMax(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceMax function */
_ReduceMax(sGPU, tGPU, 0);
tUserGPU = ReduceMax(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -161,6 +245,15 @@ bool TestReduceMax()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMax2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -136,6 +136,85 @@ bool TestReduceMean1()
#endif // USE_CUDA
}
/* case 2: get the mean value along a dimension, reducing the tensor to a scalar */
bool TestReduceMean2()
{
/* a tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE answer[1] = {1.5F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceMean function */
_ReduceMean(s, t, 0);
tUser = ReduceMean(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceMean function */
_ReduceMean(sGPU, tGPU, 0);
tUserGPU = ReduceMean(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -156,6 +235,15 @@ bool TestReduceMean()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMean2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* other cases test */
///*
//TODO!!
......
......@@ -607,6 +607,89 @@ bool TestReduceSum6()
#endif // USE_CUDA
}
/*
case 7: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
(4) -> scalar, dim = 0
*/
bool TestReduceSum7()
{
/* a tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a scalar */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE answer[1] = {6.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(s, t, 0);
tUser = ReduceSum(*s, 0);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 0);
tUserGPU = ReduceSum(*sGPU, 0);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
......@@ -673,6 +756,15 @@ bool TestReduceSum()
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* case 7 test */
caseFlag = TestReduceSum7();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 7 failed!\n");
}
else
XPRINT(0, stdout, ">> case 7 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -55,7 +55,7 @@ bool TestReduceSumAll1()
s->SetData(sData, sUnitNum);
/* call ReduceSumAll function */
summation = _ReduceSumAll(s);
summation = ReduceSumAllValue(*s);
/* check results */
cpuTest = (fabs(answer - summation) < 1e-4F);
......@@ -71,7 +71,7 @@ bool TestReduceSumAll1()
sGPU->SetData(sData, sUnitNum);
/* call ReduceSumAll function */
summation = _ReduceSumAll(sGPU);
summation = ReduceSumAllValue(*sGPU);
/* check results */
gpuTest = (fabs(answer - summation) < 1e-4F);
......
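/*
A hedged sketch (not from the commit) of the two flavors exercised above: the
in-place _ReduceSumAll writes the scalar into a DTYPE, while ReduceSumAllValue
returns it directly. The shape and data are illustrative.
*/
void ReduceSumAllExample()
{
    int dims[1] = {4};
    XTensor * s = NewTensorV2(1, dims);
    DTYPE data[4] = {0.0F, 1.0F, 2.0F, 3.0F};
    s->SetData(data, 4);
    DTYPE sum1;
    _ReduceSumAll(s, &sum1);              /* sum1 == 6.0F */
    DTYPE sum2 = ReduceSumAllValue(*s);   /* sum2 == 6.0F */
    delete s;
}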
......@@ -240,6 +240,104 @@ bool TestReduceSumSquared2()
#endif // USE_CUDA
}
/*
case 3: squared sum of the items along a dimension, reducing the tensor to a scalar.
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
In this case, (4) -> scalar, dim = 0.
*/
bool TestReduceSumSquared3()
{
/* an input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
/* a shift tensor of size (1) */
int shiftOrder = 0;
int * shiftDimSize = new int[MAX_TENSOR_DIM_NUM];
int shiftUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE shiftData[1] = {-1.0F};
DTYPE answer[1] = {30.0F};
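/* worked check: with shift = -1, (0+1)^2 + (1+1)^2 + (2+1)^2 + (3+1)^2 = 1 + 4 + 9 + 16 = 30 */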
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor * shift = NewTensorV2(shiftOrder, shiftDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
shift->SetData(shiftData, shiftUnitNum);
t->SetZeroAll();
/* call ReduceSumSquared function */
_ReduceSumSquared(s, t, 0, shift);
tUser = ReduceSumSquared(*s, 0, *shift);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensorV2(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU->SetData(shiftData, shiftUnitNum);
tGPU->SetZeroAll();
/* call ReduceSumSquared function */
_ReduceSumSquared(sGPU, tGPU, 0, shiftGPU);
tUserGPU = ReduceSumSquared(*sGPU, 0, *shiftGPU);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete shift;
delete sGPU;
delete tGPU;
delete shiftGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete shift;
delete[] sDimSize;
delete[] tDimSize;
delete[] shiftDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -264,10 +362,19 @@ bool TestReduceSumSquared()
caseFlag = TestReduceSumSquared2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestReduceSumSquared3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */
/*
......
......@@ -132,6 +132,104 @@ bool TestReduceVariance1()
#endif // USE_CUDA
}
/*
case 2: variance of the items along a dimension, reducing the tensor to a scalar.
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
In this case, (4) -> scalar, dim = 0.
*/
bool TestReduceVariance2()
{
/* an input tensor of size (4) */
int sOrder = 1;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output scalar tensor */
int tOrder = 0;
int * tDimSize = new int[MAX_TENSOR_DIM_NUM];
int tUnitNum = 1;
/* a mean scalar tensor */
int meanOrder = 0;
int * meanDimSize = new int[MAX_TENSOR_DIM_NUM];
int meanUnitNum = 1;
DTYPE sData[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE meanData[1] = {1.5F};
DTYPE answer[1] = {1.25F};
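/* worked check: with mean = 1.5, ((-1.5)^2 + (-0.5)^2 + (0.5)^2 + (1.5)^2) / 4 = 5 / 4 = 1.25 */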
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensorV2(sOrder, sDimSize);
XTensor * t = NewTensorV2(tOrder, tDimSize);
XTensor * mean = NewTensorV2(meanOrder, meanDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
mean->SetData(meanData, meanUnitNum);
t->SetZeroAll();
/* call ReduceVariance function */
_ReduceVariance(s, t, 0, mean);
tUser = ReduceVariance(*s, 0, *mean);
/* check results */
cpuTest = _CheckData(t, answer, tUnitNum) && _CheckData(&tUser, answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensorV2(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensorV2(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * meanGPU = NewTensorV2(meanOrder, meanDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
meanGPU->SetData(meanData, meanUnitNum);
tGPU->SetZeroAll();
/* call ReduceVariance function */
_ReduceVariance(sGPU, tGPU, 0, meanGPU);
tUserGPU = ReduceVariance(*sGPU, 0, *meanGPU);
/* check results */
gpuTest = _CheckData(tGPU, answer, tUnitNum) && _CheckData(&tUserGPU, answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete mean;
delete sGPU;
delete tGPU;
delete meanGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete mean;
delete[] sDimSize;
delete[] tDimSize;
delete[] meanDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -152,6 +250,15 @@ bool TestReduceVariance()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceVariance2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -32,7 +32,7 @@ spread a collection tensor to source tensor.
*/
bool TestSpread1()
{
/* a input tensor of size (2, 4, 3) */
/* a input tensor of size (4, 4, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
......
......@@ -215,6 +215,305 @@ bool TestSub2()
#endif // USE_CUDA
}
/* case 3: tensor subtraction c = a - b * \beta, where b is a scalar tensor */
bool TestSub3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a scalar */
int bOrder = 0;
int * bDimSize = new int[MAX_TENSOR_DIM_NUM];
int bUnitNum = 1;
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[1] = {-1.0F};
DTYPE beta = 2.0F;
DTYPE answer[2][4] = { {2.0F, 3.0F, 4.0F, 5.0F},
{6.0F, 7.0F, 8.0F, 9.0F} };
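/* worked check: c = a - b * beta = a - (-1) * 2, i.e. every element of a plus 2 */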
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: tensor subtraction c = a - b * \beta, where b is a 1d tensor */
bool TestSub4()
{
/* a tensor of size (3, 4, 2) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 4;
aDimSize[2] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (3, 4, 2) */
int cOrder = 3;
int * cDimSize = new int[cOrder];
cDimSize[0] = 3;
cDimSize[1] = 4;
cDimSize[2] = 2;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[3][4][2] = { { {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} },
{ {0.0F, -1.0F}, {-2.0F, -3.0F}, {-4.0F, -5.0F}, {-6.0F, -7.0F} },
{ {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[3][4][2] = { { {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F} },
{ {2.0F, 1.0F}, {-2.0F, -3.0F}, {-6.0F, -7.0F}, {-10.0F, -11.0F} },
{ {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F}, {2.0F, 3.0F} } };
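/* worked check: b is broadcast over dim 1, so c[i][j][k] = a[i][j][k] - b[j] * 2,
   e.g. a[0][0] = {0, 1} minus (-1) * 2 gives {2, 3} */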
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 5: tensor subtraction c = a - b * \beta, where b is a 1d tensor */
bool TestSub5()
{
/* a tensor of size (4, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 4;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (4, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 4;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F },
{4.0F, 5.0F, 6.0F, 7.0F },
{0.0F, -1.0F, -2.0F, -3.0F },
{-4.0F, -5.0F, -6.0F, -7.0F } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[4][4] = { {2.0F, 1.0F, 0.0F, -1.0F },
{6.0F, 5.0F, 4.0F, 3.0F },
{2.0F, -1.0F, -4.0F, -7.0F },
{-2.0F, -5.0F, -8.0F, -11.0F } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sub function */
cUser = Sub(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sub function */
cUserGPU = Sub(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -244,6 +543,33 @@ bool TestSub()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSub3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSub4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSub5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/*
TODO!!
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "../core/utilities/CheckData.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
#include "TSubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
*/
bool TestSubDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
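/* worked check: the answer corresponds to the default beta of 1, with b subtracted
   along dim 0, so row 0 loses 1 and row 1 gains 1 */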
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 0);
_SubDim(cMe, b, 0);
cUser = SubDim(*a, *b, 0);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call sub function */
_SubDim(aGPU, bGPU, cGPU, 0);
_SubDim(cMeGPU, bGPU, 0);
cUserGPU = SubDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
*/
bool TestSubDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 1);
_SubDim(cMe, b, 1);
cUser = SubDim(*a, *b, 1);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call sub function */
_SubDim(aGPU, bGPU, cGPU, 1);
_SubDim(cMeGPU, bGPU, 1);
cUserGPU = SubDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SubDim Function */
bool TestSubDim()
{
XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSubDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSubDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __TEST_SUBDIM_H__
#define __TEST_SUBDIM_H__
#include "../core/arithmetic/SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SubDim Function */
bool TestSubDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUBDIM_H__
......@@ -215,6 +215,305 @@ bool TestSum2()
#endif // USE_CUDA
}
/* case 3: tensor summation c = a + b * \beta, where b is a scalar tensor */
bool TestSum3()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a scalar */
int bOrder = 0;
int * bDimSize = new int[MAX_TENSOR_DIM_NUM];
int bUnitNum = 1;
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[1] = {-1.0F};
DTYPE beta = 2.0F;
DTYPE answer[2][4] = { {-2.0F, -1.0F, 0.0F, 1.0F},
{2.0F, 3.0F, 4.0F, 5.0F} };
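/* worked check: c = a + b * beta = a + (-1) * 2, i.e. every element of a minus 2 */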
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 4: tensor summation c = a + b * \beta, where b is a 1d tensor */
bool TestSum4()
{
/* a tensor of size (3, 4, 2) */
int aOrder = 3;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 4;
aDimSize[2] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (3, 4, 2) */
int cOrder = 3;
int * cDimSize = new int[cOrder];
cDimSize[0] = 3;
cDimSize[1] = 4;
cDimSize[2] = 2;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[3][4][2] = { { {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} },
{ {0.0F, -1.0F}, {-2.0F, -3.0F}, {-4.0F, -5.0F}, {-6.0F, -7.0F} },
{ {0.0F, 1.0F}, {2.0F, 3.0F}, {4.0F, 5.0F}, {6.0F, 7.0F} } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[3][4][2] = { { {-2.0F, -1.0F}, {2.0F, 3.0F}, {6.0F, 7.0F}, {10.0F, 11.0F} },
{ {-2.0F, -3.0F}, {-2.0F, -3.0F}, {-2.0F, -3.0F}, {-2.0F, -3.0F} },
{ {-2.0F, -1.0F}, {2.0F, 3.0F}, {6.0F, 7.0F}, {10.0F, 11.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 5: tensor summation c = a + b * \beta, where b is a 1d tensor */
bool TestSum5()
{
/* a tensor of size (4, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 4;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (4, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 4;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F },
{4.0F, 5.0F, 6.0F, 7.0F },
{0.0F, -1.0F, -2.0F, -3.0F },
{-4.0F, -5.0F, -6.0F, -7.0F } };
DTYPE bData[4] = {-1.0F, 0.0F, 1.0F, 2.0F};
DTYPE beta = 2.0F;
DTYPE answer[4][4] = { {-2.0F, 1.0F, 4.0F, 7.0F },
{2.0F, 5.0F, 8.0F, 11.0F },
{-2.0F, -1.0F, 0.0F, 1.0F },
{-6.0F, -5.0F, -4.0F, -3.0F } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call Sum function */
cUser = Sum(*a, *b, beta);
/* check results */
cpuTest = _CheckData(&cUser, answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call Sum function */
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = _CheckData(&cUserGPU, answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -244,6 +543,33 @@ bool TestSum()
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSum3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSum4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSum5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -64,7 +64,7 @@ bool TestXMemCase1()
}
for (int i = 0; i < testNum * scalar; i++) {
testxmemid++;
//testxmemid++;
int j = rand() % caseNum;
//fprintf(stderr, "%d %d %d\n", testxmemid, j, ok);
......
......@@ -35,7 +35,7 @@ bool Test()
wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong;
//wrong = !TestConvertDataType() || wrong;
wrong = !TestConvertDataType() || wrong;
wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong;
......