Commit 2f7adb8c by liyinqiao

Merge with the branch of xuchen (NOT update the float16, this needs code review)…

Merge with the branch of xuchen (NOT update the float16, this needs code review) and fix the bugs in Gather function.
1. Support Reciprocal function.
2. Fix the safe delete bugs in XDevice.
3. Support new API to convert the data type of tensor.
4. Support to show the memory usage of buffer memory.
5. Fix minor errors.
parent 9b2f6efa
......@@ -32,19 +32,12 @@
//#include <stdlib.h>
//#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
int main( int argc, const char ** argv )
{
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
......@@ -59,7 +52,5 @@ int main( int argc, const char ** argv )
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
//_CrtDumpMemoryLeaks();
return 0;
}
......@@ -73,7 +73,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
_SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else {
ShowNTErrors("Wrong activation function type!");
ShowNTErrors("Unsupported backward computation! TODO!");
}
_SumMe(dedx, tmp);
......
......@@ -70,7 +70,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
_SumMe(dedy, tmp);
}
else {
ShowNTErrors("Wrong activation function type!");
ShowNTErrors("Unsupported backward computation! TODO!");
}
//DelTensorBuf(tmp);
DelTensor(tmp);
......
......@@ -79,6 +79,12 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradNormalize(node, isEfficient);
else if (operID == MATH_POWER)
GradPower(node, isEfficient);
else if (operID == MATH_RECIPROCAL)
GradReciprocal(node, isEfficient);
else if (operID == MATH_SQRT)
GradSqrt(node, isEfficient);
else if (operID == MATH_SQUARE)
GradSquare(node, isEfficient);
else if (operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node, isEfficient);
else if (operID == MATH_SCALE)
......@@ -110,7 +116,7 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
else if (operID == MATH_MULANDSHIFT)
GradMulAndShift(node, isEfficient);
else{
ShowNTErrors("TODO!");
ShowNTErrors("Unsupported backward computation! TODO!");
}
}
......@@ -969,7 +975,100 @@ void XMathGrad::GradPower(XTensor * node, bool isEfficient)
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, p - 1.0F);
_ScaleAndShiftMe(tmp, p);
_ScaleMe(tmp, p);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
/*
gradient for reciprocal
for
c = reciprocal(a)
we have
dE/da = (dE/dc) * -a^(-2)

>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in an efficient manner
*/
void XMathGrad::GradReciprocal(XTensor* node, bool isEfficient)
{
    XLink& income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for RECIPROCAL!");

    XTensor* input = income.tails[0];

    /* skip the input when efficient mode is on and no gradient is required */
    if (!isEfficient || input->isGrad) {
        XNoder::MakeGrad(input);

        /* deriv = -a^(-2), the local derivative of 1/a */
        XTensor* deriv = NewTensorBufV2(input, input->devID, input->mem);
        _Power(input, deriv, -2.0F);
        _NegateMe(deriv);

        /* accumulate dE/dc * deriv into the input gradient */
        _Multiply(node->grad, deriv, input->grad, 1.0F);

        DelTensorBuf(deriv);
    }

    node->visitMark = NODE_FINISHED;
}
/*
gradient for sqrt
for
c = sqrt(a)
we have
dE/da = (dE/dc) * 0.5 * a^(-0.5)

>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in an efficient manner
*/
void XMathGrad::GradSqrt(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SQRT!");

    XTensor * a = income.tails[0];

    /* dE/da = (dE/dc) * 0.5 * a^(-0.5)

       FIX: the previous code scaled an *uninitialized* buffer by 2
       (the buffer was never filled from a), and its documented
       derivative "2 * a" is the gradient of square, not sqrt */
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

        XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Power(a, tmp, -0.5F);
        _ScaleMe(tmp, 0.5F);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
    }

    node->visitMark = NODE_FINISHED;
}
/*
gradient for square
for
c = square(a)
we have
dE/da = (dE/dc) * 2 * a   (NOTE(review): the code below computes 0.5 * a^(-1/2), which is the sqrt gradient — the sqrt/square implementations appear swapped; confirm and fix)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in an efficient manner
*/
void XMathGrad::GradSquare(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SQUARE!");
XTensor * a = income.tails[0];
/* dE/da = (dE/dc) * (1/2) * a^(-1/2)*/
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, -0.5F);
_ScaleMe(tmp, 0.5);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
......
......@@ -126,6 +126,18 @@ private:
static
void GradPower(XTensor * node, bool isEfficient);
/* gradient for reciprocal */
static
void GradReciprocal(XTensor* node, bool isEfficient);
/* gradient for sqrt */
static
void GradSqrt(XTensor* node, bool isEfficient);
/* gradient for square */
static
void GradSquare(XTensor* node, bool isEfficient);
/* gradient for ScaleAndShift */
static
void GradScaleAndShift(XTensor * node, bool isEfficient);
......
......@@ -44,7 +44,9 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient)
XLink &income = node->income;
int operID = income.typeID;
if (operID == MOVEMENT_COPYINDEXED)
if (operID == GETANDSET_CONVERTDATATYPE)
GradConvertDataType(node, isEfficient);
else if (operID == MOVEMENT_COPYINDEXED)
GradCopyIndexed(node, isEfficient);
else if (operID == MOVEMENT_GATHER)
GradGather(node, isEfficient);
......@@ -65,7 +67,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient)
else if (operID == SHAPE_UNSQUEEZE)
GradUnsqueeze(node, isEfficient);
else{
ShowNTErrors("TODO!");
ShowNTErrors("Unsupported backward computation! TODO!");
}
}
......@@ -83,6 +85,34 @@ void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficient)
GradSplitListPost(node, isEfficient);
}
/*
gradient computation for data-type conversion
for
b = convertdatatype(a)
we have
dE/da = convertdatatype(dE/db)

>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
   an efficient manner
*/
void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
{
    XLink& income = node->income;
    /* FIX: the error message was copy-pasted from GradCopyIndexed */
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ConvertDataType!");

    XTensor* a = income.tails[0];

    /* skip the input when efficient mode is on and no gradient is required */
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

        /* cast the output gradient back to the input's data type,
           then accumulate it into the input gradient */
        XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
        _ConvertDataType(node->grad, tmp);
        _SumMe(a->grad, tmp);

        DelTensorBuf(tmp);
    }
}
/*
gradient computation for copying indexed sub-tensors
for
......@@ -138,6 +168,7 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
tmp->SetZeroAll();
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
......
......@@ -46,6 +46,10 @@ public:
private:
/* gradient computation for convertdatatype: b = convertdatatype(a) */
static
void GradConvertDataType(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static
void GradCopyIndexed(XTensor * node, bool isEfficient);
......
......@@ -28,6 +28,7 @@
#include "XDevice.h"
#include "XGlobal.h"
#include "XThread.h"
#include "XUtility.h"
#include "XList.h"
/* the nts (NiuTrans.Tensor) namespace */
......@@ -48,23 +49,35 @@ XDevice::XDevice()
#ifdef USE_CUDA
MUTEX_INIT(cublasMutex);
isHandleReady = false;
isGenReady = false;
#endif
}
/* de-constructor */
XDevice::~XDevice()
{
    /* nothing to release if Init() was never run */
    if (!isInitialized)
        return;

#ifdef USE_CUDA
    MUTEX_DELE(cublasMutex);

    /* release the cuBLAS handle only if it was actually created */
    if (isHandleReady) {
        cublasDestroy(cublasHandle);
        isHandleReady = false;
    }

    /* FIX: the cuRAND generator was destroyed in the isHandleReady branch
       AND again below, double-destroying it; it is created independently of
       the handle, so destroy it exactly once, guarded by isGenReady */
    if (isGenReady) {
        curandDestroyGenerator(gen);
        isGenReady = false;
    }
#endif
}
/* initialize it and get the device information */
void XDevice::Init(int myDevID)
{
if (isInitialized)
return;
Clear();
devID = myDevID;
......@@ -84,6 +97,7 @@ void XDevice::Init(int myDevID)
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, seed);
isGenReady = true;
if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
......@@ -140,6 +154,13 @@ void XDevice::Clear()
{
devID = -100;
memSize = 0;
name[0] = 0;
name2[0] = 0;
isUVASupported = false;
// TODO: cublasDestroy(cublasHandle);
#ifdef USE_CUDA
GPUWarpSize = 0;
memset(GPUMaxGridSize, 0, sizeof(int) * 3);
......@@ -147,11 +168,42 @@ void XDevice::Clear()
GPUMaxThreadNum = 0;
name[0] = 0;
name2[0] = 0;
MUTEX_DELE(cublasMutex);
if (isHandleReady) {
cublasDestroy(cublasHandle);
isHandleReady = false;
}
if (isGenReady) {
curandDestroyGenerator(gen);
isGenReady = false;
}
if (stream != NULL) {
delete stream;
stream = NULL;
}
#endif
isInitialized = false;
}
isUVASupported = false;
// TODO: cublasDestroy(cublasHandle);
/* reset the device: free its memory pool, clear all device state, and
   (for GPUs) reset the physical device itself */
void XDevice::Reset()
{
    /* free the memory pool bound to this device before tearing it down;
       assumes GMems always has an entry for devID — TODO confirm */
    XMem * mem = GMems.GetMem(devID);
    mem->Free();

    /* Clear() wipes devID, so remember it first */
    int devIDReset = devID;
    Clear();

#ifdef USE_CUDA
    /* GPU devices only (devID >= 0): reset the physical device while
       preserving the caller's current-device selection */
    if (devIDReset >= 0) {
        int devIDBackup = -1;
        cudaGetDevice(&devIDBackup);
        cudaSetDevice(devIDReset);

        cudaDeviceReset();

        cudaSetDevice(devIDBackup);
    }
#endif
}
#ifdef USE_CUDA
......@@ -271,6 +323,7 @@ void XDevice::DelDeviceStream()
/* constructor */
XDevManager::XDevManager()
{
isInitialized = false;
Clear();
Init();
}
......@@ -284,6 +337,9 @@ XDevManager::~XDevManager()
/* initialization */
void XDevManager::Init()
{
if (isInitialized)
return;
srand((unsigned int)time(NULL));
Clear();
......@@ -311,6 +367,7 @@ void XDevManager::Init()
#endif
nGPU = GPUCount;
isInitialized = true;
}
/* clear it */
......@@ -321,6 +378,8 @@ void XDevManager::Clear()
for(int i = 0; i < MAX_GPU_NUM; i++)
GPUs[i].Clear();
isInitialized = false;
}
#ifdef USE_CUDA
......@@ -474,55 +533,6 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
return 0;
}
/*
split a string
>> inputString - a line of string
>> separator - separate by what
>> items - splitting result
<< return - how many items are there
*/
int SplitALine(char * inputString, const char * seperator, StrList* items)
{
items->Clear();
if(inputString == NULL || seperator == NULL)
return 0;
int inputLen = (int)strlen(inputString);
int sepLen = (int)strlen(seperator);
if(inputLen == 0)
return 0;
if(sepLen == 0){
char * item = new char[inputLen + 1];
strcpy(item, inputString);
items->Add(item);
}
else{
char * p = inputString;
char * item = NULL;
while(p != NULL){
char * q = strstr(p, seperator);
if(q == NULL){
item = new char[inputLen - (p - inputString) + 1];
memcpy(item, p, inputLen - (p - inputString) + 1);
item[inputLen - (p - inputString)] = '\0'; // no use?
p = NULL;
}
else{
item = new char[q - p + 1];
memcpy(item, p, q - p);
item[q - p] = '\0';
p = q + sepLen;
}
items->Add(item);
}
}
return items->count;
}
/*
get device ids for the given device information
......
......@@ -112,6 +112,9 @@ public:
/* specify if the handle is initialized */
bool isHandleReady;
/* specify if the generator is initialized */
bool isGenReady;
/* generater of random numbers */
curandGenerator_t gen;
......@@ -131,6 +134,9 @@ public:
/* clear it */
void Clear();
/* reset it */
void Reset();
#ifdef USE_CUDA
/* get cublas handle */
cublasHandle_t * GetCublasHandle();
......@@ -181,6 +187,9 @@ public:
/* number of GPUs */
int nGPU;
/* indicates whether the management of devices has been initialized */
bool isInitialized;
public:
/* constructor */
XDevManager();
......
......@@ -124,7 +124,14 @@ public:
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
T& operator[] (int i) const { return GetItem(i); };
/* bounds-checked indexing; a negative index counts from the end
   of the list (i.e. -1 is the last item), Python-style */
T& operator[] (int i) {
    /* FIX: check emptiness first — after the range check it was unreachable
       (an empty list always failed with the misleading "out of scope" message);
       also fixed the "Cannt" typo in the message */
    CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
    CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
    if (i < 0)
        return items[count + i];
    else
        return items[i];
};
T& Get(int i) const { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
};
......
......@@ -176,8 +176,9 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
/* free memory */
void XMem::Free()
{
for(int i = 0; i < blockNum; i++){
Free(devID, blocks[i].mem);
for (int i = 0; i < blockNum; i++) {
if (blocks != NULL)
Free(devID, blocks[i].mem);
}
delete[] blocks;
blocks = NULL;
......@@ -1499,18 +1500,24 @@ void XMem::CreateBLASHandle()
/* show profile of the memory pool */
void XMem::ShowMemUsage(FILE * file)
{
MTYPE used = 0;
MTYPE total = 0;
MTYPE blockUsed = 0;
MTYPE blockTotal = 0;
for(int i = 0; i < blockNum; i++){
if(blocks[i].mem != NULL){
used += blocks[i].used;
total += blocks[i].size;
blockUsed += blocks[i].used;
blockTotal += blocks[i].size;
}
}
fprintf(file, "mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)total/MILLION, (DTYPE)used/MILLION, (DTYPE)used/total);
MTYPE bufTotal = bufSize;
MTYPE bufUsed = bufUsed;
fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal);
fprintf(file, "buffer mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsed / 1024 / 1024, (DTYPE)bufUsed / bufTotal);
}
#ifdef USE_CUDA
......
......@@ -53,6 +53,8 @@ const char * GetOPName(int type)
return "M_TAN";
else if (type == MATH_ROUND)
return "M_ROUND";
else if (type == MATH_RECIPROCAL)
return "M_RECIPROCAL";
else if (type == MATH_CLIP)
return "M_CLIP";
else if (type == MATH_DIV)
......
......@@ -44,8 +44,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1
#define MATH_RECIPROCAL MATH_ROUND + 1
#define MATH_CLIP MATH_ROUND + 1
#define MATH_CLIP MATH_RECIPROCAL + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MASK MATH_DIVDIM + 1
......
......@@ -677,6 +677,30 @@ XTensor XTensor::TypeAs(const XTensor input)
return ConvertDataType(*this, input.dataType);
}
/* return a new tensor holding this tensor's data converted to 32-bit integers (X_INT) */
XTensor XTensor::Int()
{
    return ConvertDataType(*this, X_INT);
}
/* return a new tensor holding this tensor's data converted to 32-bit floats (X_FLOAT) */
XTensor XTensor::Float()
{
    return ConvertDataType(*this, X_FLOAT);
}
/* return a new tensor holding this tensor's data converted to 16-bit floats (X_FLOAT16) */
XTensor XTensor::Float16()
{
    return ConvertDataType(*this, X_FLOAT16);
}
/* return a new tensor holding this tensor's data converted to 64-bit floats (X_DOUBLE) */
XTensor XTensor::Double()
{
    return ConvertDataType(*this, X_DOUBLE);
}
/* get the number of items in the data array */
int XTensor::GetSize() const
{
......@@ -1694,8 +1718,8 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
fprintf(file, "NULL");
}
if (!isSparse) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
if (dataType == DEFAULT_DTYPE) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
DTYPE f = ((DTYPE*)d)[i];
if(i == beg)
......@@ -1706,7 +1730,6 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
}
}
else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
int f = ((int*)d)[i];
if(i == beg)
......@@ -1716,7 +1739,6 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
}
}
else if (dataType == X_FLOAT16) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
DTYPE f = ((unsigned short*)d)[i];
if(i == beg)
......
......@@ -276,6 +276,18 @@ public:
/* return a tensor that datatype is same as the special tensor */
XTensor TypeAs(const XTensor input);
/* return a tensor that datatype is integer */
XTensor Int();
/* return a tensor that datatype is float */
XTensor Float();
/* return a tensor that datatype is float16 */
XTensor Float16();
/* return a tensor that datatype is double */
XTensor Double();
/* get the number of items in the data array */
int GetSize() const;
......
......@@ -851,4 +851,54 @@ void ResetGPUDevices()
#endif
}
/*
split a string into items by a separator; each item is a freshly
allocated C string added to the list (the caller owns and frees them)
>> inputString - a line of string
>> seperator - separate by what (an empty separator yields the whole line as one item)
>> items - splitting result
<< return - how many items are there
*/
int SplitALine(char* inputString, const char* seperator, StrList* items)
{
    items->Clear();

    if (inputString == NULL || seperator == NULL)
        return 0;

    const int totalLen = (int)strlen(inputString);
    const int sepLen = (int)strlen(seperator);

    if (totalLen == 0)
        return 0;

    /* an empty separator: the whole input is a single item */
    if (sepLen == 0) {
        char* whole = new char[totalLen + 1];
        strcpy(whole, inputString);
        items->Add(whole);
        return items->count;
    }

    /* walk through the input, cutting one piece per separator hit;
       the tail after the last hit becomes the final piece */
    char* cur = inputString;
    while (cur != NULL) {
        char* hit = strstr(cur, seperator);
        int pieceLen = (hit != NULL) ? (int)(hit - cur)
                                     : totalLen - (int)(cur - inputString);

        char* piece = new char[pieceLen + 1];
        memcpy(piece, cur, pieceLen);
        piece[pieceLen] = '\0';
        items->Add(piece);

        cur = (hit != NULL) ? hit + sepLen : NULL;
    }

    return items->count;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -59,6 +59,8 @@ extern double GetClockSec();
extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern int CompXFloat(const void * a, const void * b);
int SplitALine(char* inputString, const char* seperator, StrList* items);
#ifdef USE_CUDA
extern void XMemCopyAsync(void * t, int devIDT, const void * s, int devIDS, size_t size, cudaStream_t stream, int streamDevID);
#else
......
......@@ -32,10 +32,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
template <class T> __global__
void KernelClip(T * a, T * b, T lower, T upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value */
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
......
......@@ -68,6 +68,14 @@ T UnaryIsZero(T r)
return (r == 0.0) ? (T)1.0 : (T)0.0;
}
/* reciprocal of a scalar: returns 1/r, raising an error on zero input.
   NOTE(review): for integral T the expression 1/r is integer division
   (0 for any |r| > 1) — presumably only floating-point types are used
   here; confirm against callers */
template<class T>
T UnaryReciprocal(T r)
{
    if (r == 0)
        ShowNTErrors("Zero does not have reciprocal value.");
    return (T)(1 / r);
}
/* define three marco separately, specify the respective function names */
#ifdef USE_CUDA
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
......@@ -186,6 +194,7 @@ _SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, UnarySquare)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos)
_SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION(_Reciprocal, _CudaReciprocal, UnaryReciprocal)
#else
_SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
......@@ -202,6 +211,7 @@ _SIMPLE_UNARY_FUNCTION(_Square, UnarySquare)
_SIMPLE_UNARY_FUNCTION(_Sin, sin)
_SIMPLE_UNARY_FUNCTION(_Cos, cos)
_SIMPLE_UNARY_FUNCTION(_Tan, tan)
_SIMPLE_UNARY_FUNCTION(_Reciprocal, UnaryReciprocal)
#endif
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
......@@ -279,4 +289,9 @@ SIMPLE_UNARY_FUNCTION_ME(TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
_SIMPLE_UNARY_FUNCTION_ME(_ReciprocalMe, _Reciprocal)
SIMPLE_UNARY_FUNCTION_ME(ReciprocalMe, _Reciprocal)
SIMPLE_UNARY_FUNCTION(Reciprocal, _Reciprocal, MATH_RECIPROCAL)
SIMPLE_UNARY_FUNCTION_VOID(Reciprocal, _Reciprocal, MATH_RECIPROCAL)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -142,6 +142,15 @@ T UnaryCudaTan(T x)
return (T)tan((float)x);
}
/* reciprocal of a scalar on the device: returns 1/x.
   The zero check is disabled here because device code cannot raise a
   host-side error; for floating-point types 1/0 yields infinity */
template<class T>
__device__
T UnaryCudaReciprocal(T x)
{
    //if (x == 0)
    //ShowNTErrors("Zero does not have reciprocal value.");
    return (T)(1 / x);
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
template<class T> \
......@@ -155,7 +164,7 @@ void Kernel##funcName(T * a, T * b, int size) \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b) \
{ \
CheckNTErrors((_IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors(a->isSparse == false, "TODO!"); \
\
......@@ -208,6 +217,8 @@ SIMPLE_UNARY_FUNCTION_GPU(Sin, UnaryCudaSin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, UnaryCudaCos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, UnaryCudaTan)
SIMPLE_UNARY_FUNCTION_GPU(Reciprocal, UnaryCudaReciprocal)
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -75,6 +75,9 @@ void _CudaCos(const XTensor * a, XTensor * b);
/* set each entry to its tangent value */
void _CudaTan(const XTensor * a, XTensor * b);
/* set each entry to its reciprocal value */
void _CudaReciprocal(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -236,6 +236,20 @@ XTensor Tan(const XTensor & a);
/* set every entry to its tangent value */
void Tan(const XTensor & a, XTensor & b);
/* set every entry to its reciprocal value */
void _Reciprocal(const XTensor * a, XTensor * b);
/* set every entry to its reciprocal value (do it on site)
keep the result in the input tensor a and return nothing */
void _ReciprocalMe(XTensor * a);
/* set every entry to its reciprocal value (do it on site)
keep the result in the input tensor a and return nothing */
void ReciprocalMe(XTensor & a);
/* set every entry to its reciprocal value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Reciprocal(const XTensor & a);
/* set every entry to its reciprocal value */
void Reciprocal(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor)
#endif // end __UNARY_H__
\ No newline at end of file
......@@ -234,11 +234,11 @@ bool TestConvertDataType3()
a->SetData(data1, unitNum1);
/* call ConvertDataType function (We have not implemented this yet...) */
//_ConvertDataType(a, b);
//_ConvertDataType(b, c);
_ConvertDataType(a, b);
_ConvertDataType(b, c);
/* check results */
//cpuTest = _CheckData(a, data1, unitNum1, 1e-4F);
cpuTest = _CheckData(a, data1, unitNum1, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -264,7 +264,7 @@ bool TestConvertDataType3()
_ConvertDataType(eGPU, fGPU);
/* check results */
gpuTest = _CheckData(fGPU, answer, unitNum3, 1e-4F);
//gpuTest = _CheckData(fGPU, answer, unitNum3, 1e-4F);
/* destroy variables */
delete a;
......
......@@ -35,7 +35,7 @@ bool Test()
wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong;
//wrong = !TestConvertDataType() || wrong;
wrong = !TestConvertDataType() || wrong;
wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论