Commit 4336f2f9 by xuchen

1. add dropout function 2. add some functions in the tensor/core/math/unary files 3. merge with xiaotong-working branch
parent 98db6f24
......@@ -39,7 +39,6 @@ void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
using namespace GAN;
int main( int argc, const char ** argv )
{
......@@ -47,9 +46,7 @@ int main( int argc, const char ** argv )
//BackwardTest();
//return 0;
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
......
......@@ -451,10 +451,10 @@ void XMathGrad::GradDivDim(XTensor * node)
DelTensorBuf(interGrad);
}
DelTensorBuf(aTMP1);
DelTensorBuf(aTMP2);
DelTensorBuf(bTMP);
DelTensorBuf(interGradTMP);
DelTensorBuf(bTMP);
DelTensorBuf(aTMP2);
DelTensorBuf(aTMP1);
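/* hedged note on the reordering above: tensors obtained from NewTensorBuf live
   in a stack-like buffer, so they are released in the reverse order of their
   allocation */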
node->visitMark = NODE_FINISHED;
}
......@@ -499,8 +499,8 @@ void XMathGrad::GradMatrixMul(XTensor * node)
a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
deda->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
dedc->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
......@@ -760,7 +760,7 @@ void XMathGrad::GradMultiplyDim(XTensor * node)
DelTensorBuf(interGrad);
}
DelTensor(bGradTMP);
DelTensorBuf(bGradTMP);
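/* hedged note: bGradTMP is presumably allocated with NewTensorBuf, so it must
   be released with DelTensorBuf rather than DelTensor */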
node->visitMark = NODE_FINISHED;
}
......@@ -796,6 +796,8 @@ gradient for normalize
*/
void XMathGrad::GradNormalize(XTensor * node)
{
ShowNTErrors("This is really a bad piece of code!!!");
XLink &income = node->income;
CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
......@@ -902,7 +904,7 @@ void XMathGrad::GradPower(XTensor * node)
_ScaleAndShiftMe(b, p);
_Multiply(node->grad, b, a->grad, 1.0F);
DelTensor(b);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
......@@ -1229,7 +1231,7 @@ void XMathGrad::GradReduceSum(XTensor * node)
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
DelTensor(b);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
......@@ -1274,10 +1276,10 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
DelTensorBuf(c);
DelTensorBuf(d);
DelTensorBuf(e);
DelTensorBuf(f);
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
node->visitMark = NODE_FINISHED;
}
......@@ -1323,10 +1325,10 @@ void XMathGrad::GradReduceVariance(XTensor * node)
_ScaleAndShiftMe(f, -2.0F /n);
_Multiply(node->grad, f, b->grad, 1.0F);
DelTensorBuf(c);
DelTensorBuf(d);
DelTensorBuf(e);
DelTensorBuf(f);
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
node->visitMark = NODE_FINISHED;
}
......
......@@ -145,14 +145,19 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
lossGrad.Compute(gold, root, root->grad, loss);
}
}
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i);;
XTensor * node = (XTensor*)nodes.Get(i);
if(node->mem != NULL){
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
}
if(node->visitMark == NODE_FINISHED)
continue;
BackwardNode(node);
BackwardNode(node);
}
}
......
......@@ -116,15 +116,25 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if(isMasked)
dot = dot + mask;
scalar = Softmax(Linear(dot, 1/(float)sqrt((float)dk)), -1);
if(ignored > 0)
_SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
//if(llnum == 1)
// dot.Dump(tf, "dot:");
scalar = Softmax(dot, -1);
//if(llnum == 1)
// scalar.Dump(tf, "scalar:");
//if(ignored > 0)
// _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
att = BMMul(scalar, vheads);
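/* hedged reading of the change above: the fused
       scalar = Softmax(Linear(dot, 1/(float)sqrt((float)dk)), -1);
   is split into separate Linear and Softmax steps so that the intermediate
   logits can be dumped for debugging (see the commented-out Dump calls) */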
......
......@@ -111,6 +111,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order - 1] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
bool match = (posEmbedding.order == input.order);
if(match){
for(int i = 0; i < input.order; i++){
......@@ -120,7 +123,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* we make positional embeddings first */
if(!match){
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
......@@ -130,8 +134,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
DelTensorBuf(posTMP);
}
XTensor wordEmbedding;
/* then we make word embeddings */
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
......
......@@ -63,9 +63,6 @@ public:
the embedding processing by re-loading. */
XTensor posEmbeddingBase;
/* positional embeddings */
XTensor posEmbedding;
public:
/* constructor */
T2TEmbedder();
......
......@@ -103,6 +103,10 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
XTensor fnn;
XTensor res;
llnum = -1;
/* we skip the residual connection for the first layer if
the encoder is used in language modeling. */
if(skipInputRes && i == 0){
/* self attention */
att = attentions[i].Make(x, x, x, mask);
......@@ -113,6 +117,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
x = attLayerNorms[i].Make(att);
}
else{
/* self attention */
att = attentions[i].Make(x, x, x, mask);
......@@ -123,6 +128,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
/* layer normalization */
x = attLayerNorms[i].Make(res);
llnum = -1;
}
/* fnn */
......
......@@ -27,7 +27,7 @@
namespace transformer
{
/* constructor */
T2TLN::T2TLN()
{
......
......@@ -51,19 +51,24 @@ initialize the model
void T2TModel::InitModel(int argc, const char ** argv)
{
bool useMem = false;
int memSize = 0;
bool isMemFreeOTF = false;
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamInt(argc, argv, "memsize", &memSize, 1024);
LoadParamBool(argc, argv, "lm", &isLM, true);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
if(useMem){
delete mem;
mem = new XMem(devID);
mem = new XMem(devID, isMemFreeOTF ? FREE_ON_THE_FLY : UNI_FREE, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
}
encoder.InitModel(argc, argv, isLM, isLM ? 1 : 0, devID, mem);
encoder.InitModel(argc, argv, isLM, 0, devID, mem);
outputLayer.InitModel(argc, argv, devID, mem);
}
......@@ -83,8 +88,9 @@ XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
make the entire network (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
*/
void T2TModel::Make(XTensor &input, XTensor &output)
void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
{
XTensor encoding;
......@@ -98,18 +104,118 @@ void T2TModel::Make(XTensor &input, XTensor &output)
dims[input.order] = len;
XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e-9 */
_SetDataLowTri(&mask, 1e9F, -1);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
this matrix can be used to prevent attention to the current or following words in
a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
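/* worked 3x3 example (hedged, assuming _SetDataLowTri(&mask, 1e9F, 0) fills the
   lower triangle including the diagonal with 1e9 and everything above with 0):
       after _SetDataLowTri:   1e9    0     0      after _ScaleAndShiftMe:   0  -1e9  -1e9
                               1e9   1e9    0                                0    0   -1e9
                               1e9   1e9   1e9                               0    0     0
   adding this mask to the attention logits leaves allowed positions unchanged
   and drives masked positions to -1e9 before the softmax */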
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
/* mask of the padding */
_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
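/* hedged shape sketch: for a padding tensor of shape (batch, len), padding2 has
   shape (batch, len, len) after the first _Unsqueeze, padding3 has shape
   (nhead, batch, len, len) after the second, and the scale-and-shift maps the
   1/0 padding indicators to 0/-1e9 additive mask values */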
//_Sum(&mask, padding3, &mask);
encoding = MakeEncoding(input, mask, true);
outputLayer.Make(encoding, output);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding2);
DelTensorBuf(padding3);
}
else{
ShowNTErrors("TODO!");
}
}
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
void T2TModel::GetParams(XList &list)
{
list.Clear();
list.Add(&outputLayer.w);
for(int i = 0; i < encoder.nlayer; i++){
list.Add(&encoder.fnns[i].w1);
list.Add(&encoder.fnns[i].b1);
list.Add(&encoder.fnns[i].w2);
list.Add(&encoder.fnns[i].b2);
list.Add(&encoder.attentions[i].wk);
list.Add(&encoder.attentions[i].wq);
list.Add(&encoder.attentions[i].wv);
list.Add(&encoder.fnnLayerNorms[i].w);
list.Add(&encoder.fnnLayerNorms[i].b);
list.Add(&encoder.attLayerNorms[i].w);
list.Add(&encoder.attLayerNorms[i].b);
}
list.Add(&encoder.embedder.w);
}
/*
dump the parameters
>> fn - where to keep the model
*/
void T2TModel::Dump(const char * fn)
{
FILE * file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
GetParams(params);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
p->Dump(file, "param:");
}
fclose(file);
XPRINT(0, stderr, "[INFO] model saved\n");
}
/* read the parameters */
void T2TModel::Read(const char * fn)
{
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
GetParams(params);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
p->Read(file, "param:");
}
fclose(file);
XPRINT(0, stderr, "[INFO] model loaded\n");
}
}
......@@ -72,9 +72,18 @@ public:
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
/* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output);
void Make(XTensor &input, XTensor &output, XTensor &padding);
/* get parameter matrices */
void GetParams(XList &list);
/* dump the parameters */
void Dump(const char * fn);
/* read the parameters */
void Read(const char * fn);
};
}
#endif
\ No newline at end of file
#endif
......@@ -37,12 +37,6 @@ namespace transformer
class T2TTrainer
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* buffer for loading words */
int * buf;
......@@ -75,6 +69,9 @@ public:
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* sentence batch size */
int sBatchSize;
......@@ -88,6 +85,22 @@ public:
/* training step number */
int nstep;
/* indicates whether we use adam */
bool useAdam;
/* hyper parameters of adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
/* list of the moments of the parameter matrices */
XList moments;
/* list of the 2nd-order moments of the parameter matrices */
XList moments2nd;
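/* for orientation, a hedged sketch of the standard Adam update these fields
   support (the actual implementation lives in the trainer source file):
       m = adamBeta1 * m + (1 - adamBeta1) * grad
       v = adamBeta2 * v + (1 - adamBeta2) * grad * grad
       e = lrate * sqrt(1 - adamBeta2T) / (1 - adamBeta1T)
       p = p - e * m / (sqrt(v) + adamDelta)
   where adamBeta1T and adamBeta2T accumulate the powers beta1^t and beta2^t
   over training steps */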
public:
/* constructor */
T2TTrainer();
......@@ -101,17 +114,34 @@ public:
/* train the model */
void Train(const char * fn, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* load data to buffer */
int LoadBuf(FILE * file);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount);
int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int step, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel * model);
/* do padding on the output */
void PadOutput(XTensor * output, XTensor * padding);
};
......
......@@ -27,6 +27,8 @@ namespace transformer
{
FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
{
......
......@@ -38,6 +38,9 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
/* show arguments */
void ShowParams(int argc, const char ** argv);
extern int llnum;
extern FILE * tf;
}
#endif
......@@ -38,20 +38,42 @@ int TransformerMain(int argc, const char ** argv)
ShowParams(argc, argv);
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
LoadParamString(argc, argv, "train", trainFN, "");
LoadParamString(argc, argv, "model", modelFN, "");
LoadParamString(argc, argv, "test", testFN, "");
LoadParamString(argc, argv, "output", outputFN, "");
T2TTrainer trainer;
trainer.Init(argc, argv);
T2TModel model;
model.InitModel(argc, argv);
if(strcmp(trainFN, "")){
T2TTrainer trainer;
trainer.Init(argc, argv);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, &model);
}
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
/* load the model if necessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, ""))
trainer.Test(testFN, outputFN, &model);
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
fclose(tmpFILE);
......
......@@ -41,6 +41,7 @@ XDevManager GDevs;
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
#ifdef USE_CUDA
......@@ -126,6 +127,7 @@ void XDevice::Init(int myDevID)
#endif
}
isInitialized = true;
}
/* clear it */
......@@ -152,11 +154,14 @@ void XDevice::Clear()
/* get cublas handle */
cublasHandle_t * XDevice::GetCublasHandle()
{
if (!isInitialized)
Init(devID);
if(!isHandleReady){
MUTEX_LOCK(cublasMutex);
int devIDBackup = 0;
ProtectCudaDev(devID, devIDBackup);
CheckNTErrors(cublasCreate(&cublasHandle) == cudaSuccess,
CheckNTErrors(cublasCreate(&cublasHandle) == CUBLAS_STATUS_SUCCESS,
"Cannot create the cublas handle.");
isHandleReady = true;
BacktoCudaDev(devID, devIDBackup);
......@@ -169,6 +174,9 @@ cublasHandle_t * XDevice::GetCublasHandle()
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
......@@ -279,33 +287,13 @@ void XDevManager::Init()
exit(1);
}
cudaDeviceProp prop[64];
for(int i = 0; i < GPUCount; i++){
GPUs[i].Init(i);
cudaGetDeviceProperties(&prop[i], i);
GPUs[i].devID = i;
//GPUs[i].Init(i);
}
#ifdef USA_CUDA_P2P
for(int i = 0; i < GPUCount; i++){
cudaSetDevice(i);
for(int j = 0; j < GPUCount; j++){
if(i == j)
continue;
int access;
cudaDeviceCanAccessPeer(&access, i, j);
bool hasUVA = (prop[i].unifiedAddressing && prop[j].unifiedAddressing);
fprintf(stderr, "device %d -> device %d access:%d UVA:%d\n", i, j, access, hasUVA ? 1 : 0);
if(access != 0){
CheckNTErrors((hasUVA == true), "at least one GPU does not support UVA.")
CheckNTErrors((cudaDeviceEnablePeerAccess(j, 0)==cudaSuccess), "cannot set cuda p2t mode!");
}
}
}
#endif
#endif
nGPU = GPUCount;
}
......@@ -351,6 +339,9 @@ into blocks
*/
int XDevManager::GetCudaThread(const int devID, const int n, int * gridSize, int * blockSize)
{
if (!GPUs[devID].isInitialized)
GPUs[devID].Init(devID);
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......@@ -402,6 +393,9 @@ into blocks
*/
int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int nLimit, int * gridSize, int * blockSize)
{
if (!GPUs[devID].isInitialized)
GPUs[devID].Init(devID);
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......
......@@ -67,6 +67,9 @@ public:
/* warp size of an (NVIDIA) GPU */
int GPUWarpSize;
/* indicates whether the device class has been initialized */
bool isInitialized;
/*
max grid size (or number of blocks) of an (NVIDIA) GPU
NOTE: the grid size is along three dimensions (x, y, z)
......
......@@ -147,6 +147,7 @@ extern bool useCUDA;
#define XPRINT4(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4);FFLUSH(FILEH);}}
#define XPRINT5(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5);FFLUSH(FILEH);}}
#define XPRINT6(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6);FFLUSH(FILEH);}}
#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
#define B2I(V) V==0?false:true
......
......@@ -53,28 +53,14 @@ typedef long long INT_64;
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
/* memory block */
struct XMemBlock
{
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
};
/*
mode of running a memory pool
- UNI_FREE: free all the memory space when the memory pool is no longer in use
- FREE_ON_THE_FLY: allocate and free memory pieces in the normal "malloc" and "free" manner
*/
enum MEMPOOL_MODE {UNI_FREE, FREE_ON_THE_FLY};
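/* hedged usage note: the mode is chosen when a pool is created, e.g. elsewhere
   in this commit:
       mem = new XMem(devID, isMemFreeOTF ? FREE_ON_THE_FLY : UNI_FREE,
                      (MTYPE)MILLION * 256, 1024, MILLION * 128); */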
struct MPieceNode;
/* header of a memory piece (FREE_ON_THE_FLY) */
struct MHeader
......@@ -96,6 +82,9 @@ struct MHeader
/* id of the memory block */
int blockID;
/* pointer to the index node */
MPieceNode * indexNode;
};
/* index of memory piece */
......@@ -112,6 +101,31 @@ struct MPieceNode
/* pointer to the head of a memory piece */
void * p;
/* pointer to the head of the memory that is returned to the user */
void * pReal;
/* header of the memory piece */
MHeader head;
};
/* memory block */
struct XMemBlock
{
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
/* first head of the block */
MHeader * head;
};
/*
......@@ -138,6 +152,9 @@ public:
/* mode of running the memory pool */
MEMPOOL_MODE mode;
/* signature */
MTYPE signature;
/* indicates whether the memory allocation is static */
bool isStatic;
......@@ -194,13 +211,16 @@ public:
public:
/* index of the free memory pieces */
MPieceNode * freeMemIndex;
MPieceNode * memIndex;
/* for double buffering */
MPieceNode * memIndex2;
/* maximum number of index nodes */
INT_64 indexNodeNum;
INT_64 nodeNum;
/* count of the used nodes */
INT_64 indexNodeNumUsed;
INT_64 nodeNumUsed;
/* minimal size allocation for each index entry */
MTYPE * minSizeIndex;
......@@ -235,6 +255,9 @@ public:
/* free a piece of memory */
void Free(int myDevID, void * mem);
/* get signature */
MTYPE GetSignature();
/* use string as the name of the memory pool */
void SetName(const char * myName);
......@@ -282,10 +305,10 @@ public:
void * AllocBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
/* release a piece of memory */
void Release(void * p);
void Release(void * p, MTYPE size, MTYPE code);
/* release a piece of memory */
void Release(int myDevID, void * p);
void Release(int myDevID, void * p, MTYPE size);
/* release a piece of memory in the buffer */
void ReleaseBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
......@@ -302,14 +325,20 @@ public:
/* find the index entry for allocation query */
int FindIndexEntry(MTYPE mySize);
/* remove an index node */
void RemoveIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* remove an index node for available memory pieces */
void RemoveFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node */
void AddIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for available memory pieces */
void AddFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* remove an index node for memory pieces in use */
void RemoveAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for memory pieces in use */
void AddAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* release a piece of memory as "free" */
void ReleaseStandard(int myDevID, void * p);
void ReleaseStandard(int myDevID, void * p, MTYPE size);
/* rebuild the index to merge small memory fragments and free blocks that are no longer in use */
void RebuildIndex();
......@@ -379,6 +408,9 @@ public:
extern XMem * GMem;
extern int testxmemid;
extern void * recordp;
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -29,10 +29,18 @@ const char * GetOPName(int type)
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if (type == MATH_CEIL)
return "M_CEIL";
else if (type == MATH_EXP)
return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_LOG)
return "M_LOG";
else if (type == MATH_SQRT)
return "M_SQRT";
else if (type == MATH_SQUARE)
return "M_SQUARE";
else if (type == MATH_SIN)
return "M_SIN";
else if (type == MATH_COS)
......@@ -113,7 +121,9 @@ const char * GetOPName(int type)
return "S_TOPK";
}
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
if (type == FUNC_DROPOUT)
return "F_DROPOUT";
else if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
......
......@@ -32,9 +32,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_BASE 0x00001000
#define MATH_ABSOLUTE MATH_BASE + 1
#define MATH_EXP MATH_ABSOLUTE + 1
#define MATH_LOG MATH_EXP + 1
#define MATH_SIN MATH_LOG + 1
#define MATH_CEIL MATH_ABSOLUTE + 1
#define MATH_EXP MATH_CEIL + 1
#define MATH_FLOOR MATH_EXP + 1
#define MATH_LOG MATH_FLOOR + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_SQUARE MATH_SQRT + 1
#define MATH_SIN MATH_SQUARE + 1
#define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1
......@@ -88,7 +92,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* activation functions */
#define FUNCTION_BASE DATA_BASE * 2
#define FUNC_HARDTANH FUNCTION_BASE + 1
#define FUNC_DROPOUT FUNCTION_BASE + 1
#define FUNC_HARDTANH FUNC_DROPOUT + 1
#define FUNC_IDENTITY FUNC_HARDTANH + 1
#define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1
#define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1
......
......@@ -162,6 +162,7 @@ XTensor::XTensor(const XTensor &reference)
devID = reference.devID;
mem = reference.mem;
data = reference.data;
signature = reference.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset reference.data
......@@ -221,7 +222,8 @@ XTensor::~XTensor()
void XTensor::Init()
{
id = -1;
mem = NULL;;
mem = NULL;
signature = 0;
data = NULL;
dataHost = NULL;
dataP = NULL;
......@@ -254,7 +256,7 @@ void XTensor::DestroyData()
else if(data != NULL && isInGlobalMem)
FreeData(this, mem);
else if(data != NULL)
mem->Release(data);
mem->Release(data, GetDataSizeInChar(), signature);
data = NULL;
if(dataHost != NULL)
......@@ -298,6 +300,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
newTensor->SetTMP();
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
XLink::Replace(this, newTensor);
XLink::ClearOutgoing(this);
......@@ -1135,19 +1138,21 @@ resize a tensor with a specified tensor size
bool XTensor::Resize(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
isInit = true;
/* free old mem */
if(data != NULL){
if (mem == NULL)
XMemFree(devID, data);
else
mem->Release(data);
mem->Release(data, GetDataSizeInChar(), signature);
}
signature = mem != NULL ? mem->GetSignature() : 0;
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
isInit = true;
bool filledData = true;
bool zeroData = false;
for(int i = 0; i < order; i++){
......@@ -1243,56 +1248,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
}
/*
resize a tensor with a specified tensor size (with no data filled)
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
>> myDataType - unit size (e.g., int, float, and double)
>> myDenseRatio - how often an element has non-zero value
<< return - succeeded or not
*/
bool XTensor::ResizeWithNoData(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
/* free old mem */
if(data != NULL && mem == NULL)
delete[] (char*)data;
bool filledData = true;
bool zeroData = false;
for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]);
dimSizeRDI[order - i - 1] = dimSize[i];
if(myDimSize[i] < 0)
filledData = false;
if(myDimSize[i] == 0)
zeroData = true;
unitNum *= dimSize[i];
}
data = NULL;
denseRatio = myDenseRatio;
isSparse = denseRatio < 1.0F ? true : false;
dataType = myDataType;
unitSize = GetUnitSize(dataType);
if(myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
else
isDefaultDType = true;
if(zeroData){
unitNum = 0;
return false;
}
return true;
}
/*
resize a tensor by another one
>> myTensor - tensor for reference
*/
......@@ -1377,9 +1332,10 @@ dump data to a file
>> file - where to dump the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(FILE * file, const char * label, const int n, const int verbose)
void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
{
if (verbose > verboseLevel)
return;
......@@ -1437,28 +1393,26 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int verbo
}
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
if (unitNum > 0) {
DTYPE f = *(DTYPE*)d;
fprintf(file, "%e", f);
}
int num = unitNum;
if (n > 0)
num = MIN(num, n);
for (int i = 1; i < num; i++) {
DTYPE * f = ((DTYPE*)d) + i;
fprintf(file, " %e", *f);
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
DTYPE f = ((DTYPE*)d)[i];
if(i == beg)
fprintf(file, "%e", f);
else
fprintf(file, " %e", f);
}
}
else {
ShowNTErrors("Cannot dump the tensor to the file in non-float values!");
ShowNTErrors("TODO!");
}
}
else {
int num = this->unitNumNonZero > 0 ? *(int*)d : 0;
if (n > 0)
num = MIN(num, n);
if (beg + n > 0)
num = MIN(num, beg + n);
fprintf(file, "%d ", num);
for (int i = 0; i < num; i++) {
for (int i = beg; i < num; i++) {
int key = GetKeyInSparse(i);
DTYPE value = GetInSparse(i);
fprintf(file, "[%d]%e ", key, value);
......@@ -1481,13 +1435,14 @@ dump data to a file
>> file - where to dump the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int verbose)
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, verbose);
a.Dump(file, label, n, beg, verbose);
}
/*
......@@ -1670,6 +1625,8 @@ void XTensor::AllocateData(XTensor * tensor, XMem * myMem, bool useBuf)
tensor->isInGlobalMem = true;
}
}
tensor->signature = 0;
}
/*
......
......@@ -51,7 +51,6 @@ struct XLink;
#define MIN_TENSOR_SPLIT_LIST_NUM 1024
#define MIN_TENSOR_CAT_NUM 8
/* computation flags */
#define UNSAFE_BUT_FAST_MEM
#define FAST_MATRIX
......@@ -66,6 +65,9 @@ public:
/* memory pool */
XMem * mem;
/* signature of the memory pool */
MTYPE signature;
/* data array to keep the elements */
void * data;
......@@ -327,11 +329,6 @@ public:
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
const float myDenseRatio = 1.0F);
/* resize a matrix with a specified matrix size (with no data filled) */
bool ResizeWithNoData(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
const float myDenseRatio = 1.0F);
/* resize a matrix by another one */
bool Resize(const XTensor * myTensor);
......@@ -339,11 +336,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* read data from a file */
void Read(FILE * file, const char * label = NULL);
......
......@@ -203,7 +203,7 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadInt(&c, alpha);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
......
......@@ -204,7 +204,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadInt(&c, alpha);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
......
......@@ -50,7 +50,6 @@ void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
CheckNTErrors((dimRDI == a->order - 1), "Incorrect reduction dimension!");
CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");
int stride = 1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XName.h"
#include "Unary.h"
......@@ -5,9 +26,18 @@
namespace nts{
DTYPE square(DTYPE x)
{
return x * x;
}
DTYPE round(DTYPE r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
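/* note: this rounds half away from zero, e.g. round(2.5) == 3.0 and
   round(-2.5) == -3.0 */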
#ifdef USE_CUDA
/* define three macros separately, specify the respective function names */
/* define three macros separately, specify the respective function names (GPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
......@@ -45,14 +75,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
......@@ -65,11 +116,8 @@ _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
/*_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#else
/* define three macros separately, specify the respective function names */
/* define three macros separately, specify the respective function names (CPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
......@@ -102,14 +150,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_Log, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "Unary.h"
#include "Unary.cuh"
namespace nts {
__device__
DTYPE CudaSquare(DTYPE x)
{
return x * x;
}
__device__
DTYPE CudaRound(DTYPE r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
......@@ -15,7 +49,7 @@ void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
b[i] = (DTYPE)origFunc(a[i]); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size) \
void Kernel##funcName(__half * a, __half * b, int size) \
{ \
return; \
} \
......@@ -37,12 +71,12 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName << <blocks, threads >> > \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName << <blocks, threads >> > \
((__half*)a->data, (__half*)b->data, a->unitNum); \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, a->unitNum); \
} \
else { \
ShowNTErrors("TODO!"); \
......@@ -52,11 +86,16 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
} \
SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Round, CudaRound)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, CudaSquare)
SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
//SIMPLE_UNARY_FUNCTION_GPU(Round, round)
}
\ No newline at end of file
......@@ -38,6 +38,15 @@ void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
void _CudaAbsolute(const XTensor * a, XTensor * b);
/* set each entry to its ceil value (CUDA Kernel) */
__global__
void KernelCeil(DTYPE * a, DTYPE * b, int size);
/* set each entry to its ceil value (CUDA Kernel) with float16 data type */
__global__
void KernelCeil(__half * a, __half * b, int size);
/* set each entry to its ceil value */
void _CudaCeil(const XTensor * a, XTensor * b);
/* set each entry to its exponent value (CUDA Kernel) */
__global__
void KernelExp(DTYPE * a, DTYPE * b, int size);
......@@ -47,6 +56,15 @@ void KernelExp(__half * a, __half * b, int size);
/* set each entry to its exponent value */
void _CudaExp(const XTensor * a, XTensor * b);
/* set each entry to its floor value (CUDA Kernel) */
__global__
void KernelFloor(DTYPE * a, DTYPE * b, int size);
/* set each entry to its floor value (CUDA Kernel) with float16 data type */
__global__
void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* set each entry to its logarithm value (CUDA Kernel) */
__global__
void KernelLog(DTYPE * a, DTYPE * b, int size);
......@@ -56,6 +74,34 @@ void KernelLog(__half * a, __half * b, int size);
/* set each entry to its logarithm value */
void _CudaLog(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
__global__
void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type */
__global__
void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
void _CudaRound(const XTensor * a, XTensor * b);
/* set each entry to its sqrt value (CUDA Kernel) */
__global__
void KernelSqrt(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sqrt value (CUDA Kernel) with float16 data type */
__global__
void KernelSqrt(__half * a, __half * b, int size);
/* set each entry to its sqrt value */
void _CudaSqrt(const XTensor * a, XTensor * b);
/* set each entry to its square value (CUDA Kernel) */
__global__
void KernelSquare(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square value (CUDA Kernel) with float16 data type */
__global__
void KernelSquare(__half * a, __half * b, int size);
/* set each entry to its square value */
void _CudaSquare(const XTensor * a, XTensor * b);
/* set each entry to its sine value (CUDA Kernel) */
__global__
void KernelSin(DTYPE * a, DTYPE * b, int size);
......@@ -83,15 +129,6 @@ void KernelTan(__half * a, __half * b, int size);
/* set each entry to its tangent value */
void _CudaTan(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
//__global__
//void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
//__global__
//void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
//void _CudaRound(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -28,95 +28,103 @@ namespace nts{
/* set every entry to its absolute value */
void _Absolute(const XTensor * a, XTensor * b);
/*
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing */
void _AbsoluteMe(XTensor * a);
/*
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Absolute(const XTensor & a);
/* set every entry to its ceil value */
void _Ceil(const XTensor * a, XTensor * b);
/* set every entry to its ceil value (do it on site)
keep the result in the input tensor a and return nothing */
void _CeilMe(XTensor * a);
/* set every entry to its ceil value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Ceil(const XTensor & a);
/* set every entry to its exponent value */
void _Exp(const XTensor * a, XTensor * b);
/*
set every entry to its exponent value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its exponent value (do it on site)
keep the result in the input tensor a and return nothing */
void _ExpMe(XTensor * a);
/*
set every entry to its exponent value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its exponent value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Exp(const XTensor & a);
/* set every entry to its floor value */
void _Floor(const XTensor * a, XTensor * b);
/* set every entry to its floor value (do it on site)
keep the result in the input tensor a and return nothing */
void _FloorMe(XTensor * a);
/* set every entry to its floor value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b);
/*
set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing */
void _LogMe(XTensor * a);
/*
set every entry to its logarithm value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its logarithm value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Log(const XTensor & a);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */
void _RoundMe(XTensor * a);
/* set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */
void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a);
/* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b);
/* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */
void _SquareMe(XTensor * a);
/* set every entry to its square value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a);
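/* hedged usage sketch of the three calling conventions declared in this header,
   with Square as the example (a and b are initialized tensors of the same shape):
       _Square(&a, &b);        // write the result into b
       _SquareMe(&a);          // overwrite a in place
       XTensor c = Square(a);  // build and return a new tensor for the result */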
/* set every entry to its sine value */
void _Sin(const XTensor * a, XTensor * b);
/*
set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing */
void _SinMe(XTensor * a);
/*
set every entry to its sine value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its sine value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sin(const XTensor & a);
/* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b);
/*
set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing */
void _CosMe(XTensor * a);
/*
set every entry to its cosine value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its cosine value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Cos(const XTensor & a);
/* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b);
/*
set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing */
void _TanMe(XTensor * a);
/*
set every entry to its tangent value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its tangent value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a);
/* set every entry to its round value */
//void _Round(const XTensor * a, XTensor * b);
/*
set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing
*/
//void _RoundMe(XTensor * a);
/*
set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
//XTensor Round(const XTensor & a);
}
#endif //end __UNARY_H__
\ No newline at end of file
......@@ -77,7 +77,7 @@ void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize,
int targetBlockID = targetBlocks[i / blockSize];
int targetOffset = i % blockSize;
*(target + blockSize * targetBlockID + targetOffset) = source[i];
target[blockSize * targetBlockID + targetOffset] = source[i];
}
/*
......@@ -98,16 +98,6 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target,
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if(blockSize % sizeof(double) == 0){
int bSize = blockSize / sizeof(double);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
//GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
//KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
// ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
}
else
if(blockSize % sizeof(float) == 0){
int bSize = blockSize / sizeof(float);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
......
......@@ -405,7 +405,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.x = (unsigned int)vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
......@@ -482,7 +482,7 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
if (tid < 32){
if (tid < blockDim.y / 32)
threadMax = data[tid];
else threadMax = 0;
else threadMax = FLOAT_MIN;
threadMax = shflDownReduceMax(threadMax);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
......
......@@ -480,8 +480,8 @@ void KernelReduceSumFast(__half * input, __half * output,
if data storage is discontinuous, use this way to reduce
*/
__global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum,
int strideNum, DTYPE * shift, DTYPE power, bool isExp)
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
int blockNum, DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -629,7 +629,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.x = (unsigned int)vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "../XName.h"
#include <math.h>
#include <time.h>
#include "Dropout.h"
#include "Dropout.cuh"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/ScaleAndShift.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
generate a random Bernoulli number
*/
DTYPE RandomBernoulli(DTYPE prob)
{
return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)1.0 : (DTYPE)0.0;
}
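/* note: this returns 1.0 with probability (1 - prob) and 0.0 with probability
   prob, i.e. it draws one entry of a keep/drop mask for dropout */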
/*
dropout function
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
>> x - input tensor
>> y - output tensor
>> seed - random seed for generating the dropout mask
>> prob - probability of setting an element to zero
*/
void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
{
CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor with the given dropout probability */
srand(seed);
int unitNum = x->unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
if(x->devID >=0 || y->devID >= 0){
_CudaDropout(x, y, maskTensor, scaleFactor);
DelTensorBuf(maskTensor);
delete[] maskArray;
return;
}
#endif
XTensor * inter = NewTensorBuf(x, x->devID, x->mem);
_Multiply(x, maskTensor, inter);
_ScaleAndShift(inter, y, scaleFactor, 0);
DelTensorBuf(inter);
DelTensorBuf(maskTensor);
delete[] maskArray;
}
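/* a minimal standalone sketch of the same inverted-dropout rule on a raw
   float array, independent of the XTensor API. DropoutArray is a hypothetical
   helper for illustration only (it is not part of the library) and relies on
   srand/rand from <stdlib.h>. */
void DropoutArray(const float * x, float * y, int n, float prob, unsigned int seed)
{
    srand(seed);

    /* inverted dropout: keep an element with probability (1 - prob), then
       rescale by 1/(1 - prob) so the expected value is unchanged */
    float scale = 1.0F / (1.0F - prob);
    for(int i = 0; i < n; i++){
        float keep = ((float)rand() / (float)RAND_MAX > prob) ? 1.0F : 0.0F;
        y[i] = x[i] * keep * scale;
    }
}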
/*
dropout function (return a XTensor structure)
make a new tensor to keep the result and return it
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
>> x - input tensor
>> prob - probability of setting an element to zero
<< return - the result of the dropout function
*/
XTensor Dropout(const XTensor &x, DTYPE prob)
{
XTensor y(&x);
y.SetTMP();
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor with the given dropout probability */
srand((unsigned int)time(NULL));
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor maskTensor(&x);
maskTensor.SetData(maskArray, unitNum);
XTensor inter;
inter = Multiply(x, maskTensor);
y = ScaleAndShift(inter, scaleFactor, 0);
delete[] maskArray;
///* tensor connection */
//XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
//XLink::AddParamToHead(&y, prob);
return y;
}
/*
backward computation of dropout function
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> seed - random seed used to regenerate the same dropout mask
>> prob - probability of setting an element to zero
*/
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
int unitNum = y->unitNum;
DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - prob);
/* regenerate the mask tensor with the same seed and probability */
srand(seed);
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
DelTensorBuf(maskTensor);
delete[] maskArray;
return;
}
#endif
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < unitNum; i++)
dedxp[i] = dedyp[i] * maskArray[i] * scaleFactor;
DelTensorBuf(maskTensor);
delete[] maskArray;
}
else
ShowNTErrors("TODO!");
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "Dropout.h"
#include "Dropout.cuh"
#include "Loss.cuh"
#include "../XDevice.h"
#ifdef USE_CUDA
// the CUDA stuff
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#endif
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
dropout function (Cuda kernel)
>> x - input data pointer
>> y - output data pointer
>> m - mask indicator to set zero
>> s - the scale factor
>> size - size of input/output
*/
__global__
void KernelDropoutCompute(DTYPE * x, DTYPE * y, DTYPE * m, DTYPE s, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
y[i] = x[i] * m[i] * s;
}
}
/*
dropout function (Cuda version)
>> x - input tensor
>> y - output tensor
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor
*/
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * mask, DTYPE scaleFactor)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "the dropout function does not support sparse matrices.");
CheckNTErrors(x->unitNum == y->unitNum, "we require two vectors with the same length.");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelDropoutCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, (DTYPE*)mask->data, scaleFactor, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
}
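GDevs.GetCudaThread picks a one-dimensional launch configuration for x->unitNum elements; conceptually (a hypothetical sketch, not the actual implementation of GetCudaThread) it amounts to:
/* one thread per tensor element, rounded up to whole blocks */
int threadsPerBlock = 256;                                         /* assumed value */
int blocks = (x->unitNum + threadsPerBlock - 1) / threadsPerBlock;
/* inside the kernel, i = blockDim.x * blockIdx.x + threadIdx.x, guarded by (i < size) */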
/*
backward computation of dropout function (Cuda kernel)
dE/dx = dE/dy * dy/dx
>> dedy - dE/dy
>> dedx - dE/dx
>> m - mask used to zero out dropped elements
>> s - the scale factor
>> size - size of input/output
*/
__global__
void KernelDropoutBackward(DTYPE * dedy, DTYPE * dedx,
DTYPE * m, DTYPE s, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
dedx[i] = dedy[i] * m[i] * s;
}
}
/*
backward computation of dropout function (Cuda version)
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> mask - mask tensor used to zero out dropped elements
>> scaleFactor - the scale factor
*/
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
const XTensor * mask, DTYPE scaleFactor)
{
int gridSize[3], blockSize[3];
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/dx = dE/dy * dy/dx */
KernelDropoutBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data, (DTYPE*)dedx->data,
(DTYPE*)mask->data, scaleFactor, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
}
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __DROPOUT_CUH__
#define __DROPOUT_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* dropout function (Cuda version) */
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * mask, DTYPE scaleFactor);
/* de/dx (Cuda version) */
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
const XTensor * mask, DTYPE scaleFactor);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __DROPOUT_H__
#define __DROPOUT_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* dropout function */
void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
/* de/dx */
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob = 0.5);
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_H__
\ No newline at end of file
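The header above exposes two entry points: _Dropout fills a caller-allocated output and takes an explicit seed, while Dropout creates and returns its result and uses the default generator. A short sketch (x, y2 and seed are assumed to be set up as in the tests later in this commit):
XTensor y1 = Dropout(x, 0.3F);   /* returns a new tensor; prob defaults to 0.5 */
_Dropout(&x, &y2, seed, 0.3F);   /* writes into y2; the seed makes the mask reproducible */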
......@@ -51,8 +51,7 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE),
"TODO!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI];
......
......@@ -58,8 +58,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
"TODO!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!");
CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");
......
......@@ -48,19 +48,19 @@ loss function to measure the "number" of errors
/* compute the loss */
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss (log version) */
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element */
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors */
void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#include "../XName.h"
#include <math.h>
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#include "Sigmoid.h"
#include "Sigmoid.cuh"
......
......@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* rectify function y = max(0, x) (Cuda version) */
/* sigmoid function y = 1/(1+exp(-x)) (Cuda version) */
void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
......
......@@ -45,20 +45,17 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){
if(i < leadDim)
dimSize[i] = -x->dimSize[i];
dimSize[i] = x->dimSize[i];
else if(i > leadDim)
dimSize[i - 1] = -x->dimSize[i];
dimSize[i - 1] = x->dimSize[i];
}
XMem * mem = x->mem;
XTensor * max = NULL;
XTensor * sum = NULL;
max = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
max->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, max->unitNum * max->unitSize) : XMemAlloc(max->devID, max->unitNum * max->unitSize);
sum->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, sum->unitNum * sum->unitSize) : XMemAlloc(sum->devID, sum->unitNum * sum->unitSize);
max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
_ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
......@@ -114,18 +111,9 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
}
}
if(mem != NULL){
mem->ReleaseBuf(mem->devID, max->unitNum * max->unitSize);
mem->ReleaseBuf(mem->devID, sum->unitNum * sum->unitSize);
}
else{
XMemFree(max->devID, max->data);
XMemFree(sum->devID, sum->data);
max->data = NULL;
sum->data = NULL;
}
delete max;
delete sum;
DelTensorBuf(sum);
DelTensorBuf(max);
delete[] dimSize;
}
else
......
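The _Softmax hunk above replaces the manual AllocBuf/ReleaseBuf bookkeeping with the NewTensorBuf/DelTensorBuf helpers and releases the buffers in reverse order of allocation. A sketch of the intended pattern, assuming the buffer pool behaves like a stack:
XTensor * max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
XTensor * sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
/* ... compute with max and sum ... */
DelTensorBuf(sum);   /* last allocated, first released */
DelTensorBuf(max);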
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "../XUtility.h"
#include "TDropout.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Dropout function.
*/
bool TestDropout1()
{
/* an input tensor of size (40, 50) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 40;
dimSize[1] = 50;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetDataRand(0, 1);
y->SetZeroAll();
/* call Dropout function */
float prob = 0.2F;
int seed = 20;
_Dropout(x, y, seed, prob);
yUser = Dropout(*x);
/* check result */
int zeroNum1 = 0;
int zeroNum2 = 0;
float * data1 = (float*)y->data;
float * data2 = (float*)yUser.data;
for (int i = 0; i < unitNum; i++){
DTYPE tmp1 = data1[i];
DTYPE tmp2 = data2[i];
if(tmp1 == 0.0F)
zeroNum1 += 1;
if(tmp2 == 0.0F)
zeroNum2 += 1;
}
printf("CPU Test:\n");
printf("In tensor y, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
printf("In tensor yUser, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetDataRand(0, 1);
yGPU->SetZeroAll();
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, prob);
yUserGPU = Dropout(*xGPU);
/* check result: copy the GPU results back to the host before reading them */
zeroNum1 = 0;
zeroNum2 = 0;
float * hostData1 = new float[unitNum];
float * hostData2 = new float[unitNum];
XMemCopy(hostData1, -1, yGPU->data, yGPU->devID, unitNum * sizeof(float));
XMemCopy(hostData2, -1, yUserGPU.data, yUserGPU.devID, unitNum * sizeof(float));
for (int i = 0; i < unitNum; i++){
if(hostData1[i] == 0.0F)
zeroNum1 += 1;
if(hostData2[i] == 0.0F)
zeroNum2 += 1;
}
delete[] hostData1;
delete[] hostData2;
printf("GPU Test:\n");
printf("In tensor yGPU, there are %d units.\n", unitNum);
printf("%d units are set to zero by the Dropout layer with probability %.2f.\n", zeroNum1, prob);
printf("In tensor yUserGPU, there are %d units.\n", unitNum);
printf("%d units are set to zero by the Dropout layer with the default probability %.2f.\n", zeroNum2, 0.5F);
/* destroy variables */
delete x;
delete y;
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: test Dropout function and backward computation.
*/
bool TestDropout2()
{
/* an input tensor of size (4, 5) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 4;
dimSize[1] = 5;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
/* initialize variables */
_SetDataFixedFloat(x, 1.0F);
y->SetZeroAll();
dedx->SetZeroAll();
_SetDataFixedFloat(dedy, 1.0F);
/* call Dropout function */
float prob = 0.5F;
int seed = 1;
_Dropout(x, y, seed, prob);
_DropoutBackward(y, x, dedy, dedx, seed, prob);
/* check result */
y->Dump(stderr, "y");
dedx->Dump(stderr, "dedx");
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
_SetDataFixedFloat(xGPU, 1.0F);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
_SetDataFixedFloat(dedyGPU, 1.0F);
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, prob);
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, seed, prob);
/* check result */
yGPU->Dump(stderr, "yGPU");
dedxGPU->Dump(stderr, "dedxGPU");
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Dropout Function */
bool TestDropout()
{
XPRINT(0, stdout, "[TEST DROPOUT] dropout function and its backward computation \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestDropout1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDropout2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __TEST_DROPOUT_H__
#define __TEST_DROPOUT_H__
#include "../function/Dropout.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Dropout Function */
bool TestDropout();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_DROPOUT_H__
......@@ -19,6 +19,7 @@
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/
#include<math.h>
#include "../core/math/ScaleAndShift.h"
#include "TLoss.h"
......@@ -62,7 +63,7 @@ bool TestLoss1()
error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -82,7 +83,7 @@ bool TestLoss1()
error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......@@ -140,7 +141,7 @@ bool TestLoss2()
error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -160,7 +161,7 @@ bool TestLoss2()
error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......@@ -226,7 +227,7 @@ bool TestLoss3()
error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -244,7 +245,7 @@ bool TestLoss3()
error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......
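The TLoss hunks above replace exact equality on floating-point results with a small tolerance; CPU and GPU reductions may sum in different orders, so bit-exact agreement cannot be expected. The pattern is simply:
/* never compare floating-point results with ==; allow a small tolerance */
cpuTest = (fabs(error - answer) < 1e-4);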
......@@ -33,52 +33,130 @@ bool TestXMemCase1()
int blockSize = 16;
int testNum = caseNum * 10;
for(int i = 0, scalar = 1; i < 3; i++){
XMem mem;
mem.Initialize(-1, FREE_ON_THE_FLY, blockSize * sizeof(int) * scalar * scalar, 1000, 0);
mem.SetIndex(10000, blockSize * sizeof(int) / 2);
srand(907);
int ** p = new int*[caseNum];
int * size = new int[caseNum];
for(int i = 0; i < caseNum; i++){
p[i] = NULL;
size[i] = rand() % (2*blockSize);
}
int devIDs[2];
int devNum = 1;
devIDs[0] = -1;
/*if (GDevs.nGPU > 0) {
devIDs[1] = 0;
devNum = 2;
devIDs[0] = 0;
devNum = 1;
}*/
int * buf = new int[blockSize * 10];
for (int id = 0; id < devNum; id++) {
int devID = devIDs[id];
for (int iter = 0, scalar = 1; iter < 3; iter++) {
XMem mem;
mem.Initialize(devID, FREE_ON_THE_FLY, blockSize * sizeof(int) * scalar * scalar, 1000, 0);
mem.SetIndex(10000, blockSize * sizeof(int) / 2);
srand(907);
int ** p = new int*[caseNum];
int * size = new int[caseNum];
for (int i = 0; i < caseNum; i++) {
p[i] = NULL;
size[i] = rand() % (2 * blockSize);
}
for(int i = 0; i < testNum * scalar; i++){
int j = rand() % caseNum;
for (int i = 0; i < testNum * scalar; i++) {
testxmemid++;
//fprintf(stderr, "%d %d\n", testxmemid, ok);
int j = rand() % caseNum;
if(p[j] == NULL){
p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
for(int k = 0; k < size[j]; k++)
p[j][k] = j;
}
else{
mem.ReleaseStandard(mem.devID, p[j]);
for(int k = 0; k < size[j]; k++)
p[j][k] = -1;
p[j] = NULL;
}
if (p[j] == NULL) {
p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
for (int k = 0; k < size[j]; k++)
buf[k] = j;
XMemCopy(p[j], devID, buf, -1, sizeof(int) * size[j]);
}
else {
mem.ReleaseStandard(mem.devID, p[j], size[j] * sizeof(int));
for (int k = 0; k < size[j]; k++)
buf[k] = -1;
XMemCopy(p[j], devID, buf, -1, sizeof(int) * size[j]);
p[j] = NULL;
}
for(int k = 0; k < caseNum; k++){
if(p[k] != NULL){
for(int o = 0; o < size[k]; o++){
if(p[k][o] != k){
ok = false;
for (int k = 0; k < caseNum; k++) {
if (p[k] != NULL) {
XMemCopy(buf, -1, p[k], devID, sizeof(int) * size[k]);
for (int o = 0; o < size[k]; o++) {
if (buf[o] != k) {
ok = false;
}
}
}
}
/*MPieceNode * entry = NULL;
MPieceNode * node = NULL;
entry = mem.memIndex + mem.indexEntryNum + mem.FindIndexEntry(112);
int cc = 0;
node = entry->next;
while(node != NULL){
fprintf(stderr, "%d ", cc++);
if(node->size == 0){
MPieceNode * next = node->next;
node = next;
}
else{
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
node = node->next;
}
}
fprintf(stderr, "\n");*/
/*int ccc = 0;
bool hhh = recordp != NULL ? false : true;
for(int i = 0; i < mem.indexEntryNum; i++){
MPieceNode * entry = mem.memIndex + mem.indexEntryNum + i;
MPieceNode * last = entry;
MPieceNode * node = entry->next;
ccc = 0;
while(node != NULL){
CheckNTErrors(node->pre == last, "XSomething is wrong!");
CheckNTErrors(last->next == node, "XSomething is wrong!");
last = node;
ccc++;
if(node->pReal == recordp){
hhh = true;
}
if(node->size == 0){
MPieceNode * next = node->next;
node = next;
}
else{
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
node = node->next;
}
}
}
if(!hhh){
int nnn = 0;
}*/
}
}
delete[] p;
delete[] size;
scalar *= 2;
delete[] p;
delete[] size;
scalar *= 2;
}
}
delete[] buf;
return ok;
}
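The rewritten TestXMemCase1 stages every read and write through a host buffer with XMemCopy, so the same alloc/release loop works whether mem lives on the CPU (devID = -1) or on a GPU. The core round trip, reduced to a sketch under those assumptions:
XMem mem;
mem.Initialize(-1, FREE_ON_THE_FLY, 1024 * sizeof(int), 1000, 0);
int * p = (int*)mem.AllocStandard(mem.devID, 64 * sizeof(int));
/* ... move data in and out with XMemCopy(p, mem.devID, hostBuf, -1, ...) ... */
mem.ReleaseStandard(mem.devID, p, 64 * sizeof(int));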
......@@ -113,4 +191,4 @@ bool TestXMem()
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -76,6 +76,7 @@ bool Test()
wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong;
wrong = !TestDropout() || wrong;
wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong;
wrong = !TestLogSoftmax() || wrong;
......
......@@ -69,6 +69,7 @@
#include "TUnsqueeze.h"
#include "TXMem.h"
#include "TDropout.h"
#include "THardTanH.h"
#include "TIdentity.h"
#include "TLogSoftmax.h"
......