Commit c22e2e31 by liyinqiao

Merge code with Yuhao branch and update the size of memory pool.

parent 823abb4f
......@@ -41,9 +41,6 @@ using namespace nts;
void SmallTest();
void TransposeTest();
void LittleTest();
void T2TTest();
void T2TTest2();
void PowerTest();
int main( int argc, const char ** argv )
......@@ -168,127 +165,5 @@ void TransposeTest()
delete[] data;
}
void LittleTest()
{
int a = 5000;
int b = 100000;
int c = a*b;
printf("%d\n", c);
exit(1);
}
void T2TTest()
{
XTensor * input;
XTensor * weight;
XTensor * output;
XTensor * gold;
XTensor * dedy;
XTensor * dedx;
XTensor * dedxTmp;
XTensor * dedw;
XTensor * padding;
DTYPE loss;
int * dimSize = new int[2];
dimSize[0] = 256;
dimSize[1] = 10001;
int * dimSize2 = new int[3];
dimSize2[0] = 2;
dimSize2[1] = 31;
dimSize2[2] = 256;
int * dimSize3 = new int[3];
dimSize3[0] = 2;
dimSize3[1] = 31;
dimSize3[2] = 10001;
int * dimSize4 = new int[2];
dimSize4[0] = 2;
dimSize4[1] = 31;
input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//weight = NewTensor(2, dimSize);
//dedw = NewTensor(2, dimSize);
//input = NewTensor(3, dimSize2);
//gold = NewTensor(3, dimSize3);
//output = NewTensor(3, dimSize3);
//dedy = NewTensor(3, dimSize3);
//dedx = NewTensor(3, dimSize3);
//dedxTmp = NewTensor(3, dimSize3);
//padding = NewTensor(2, dimSize4);
myRead(input, "x.txt", "x");
myRead(weight, "w.txt", "w");
myRead(gold, "gold.txt", "gold");
myRead(padding, "padding.txt", "padding");
XTensor inter;
inter = MMul(*input, *weight);
_Softmax(&inter, output, 2);
//_LogMe(output);
loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
printf("loss: %f\n", loss);
_CrossEntropyBackward(dedy, output, gold, NULL);
//_CrossEntropyBackward(dedy, output, gold, NULL, padding);
myDump(dedy, "dedy.txt", "dedy");
_SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
_Sub(output, gold, dedxTmp);
myDump(dedx, "dedx.txt", "dedx");
dedx->Dump(stderr, "dedx", 200);
dedxTmp->Dump(stderr, "dedxTmp", 200);
input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
_MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
myDump(dedw, "dedw.txt", "dedw");
}
void T2TTest2()
{
int dimSize[3];
dimSize[0] = 161;
dimSize[1] = 47;
dimSize[2] = 10001;
XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//myRead(probs, "probs.txt", " ");
_SetDataFixedFloat(probs, 1.0F);
probs->Reshape(1, probs->unitNum);
DTYPE sum = _ReduceSumAll(probs);
printf("%e\n", sum);
//XTensor tmp;
//tmp = IsNonZero(*probs);
//DTYPE nonZeroNum = ReduceSumAll(tmp);
//printf("%f\n", nonZeroNum);
//
//DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//printf("%e\n", gpu);
}
......@@ -39,7 +39,7 @@ where a is a tensor and b is a row vector
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -75,7 +75,7 @@ where a is a tensor and b is a colum vector
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......
......@@ -78,7 +78,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor summation of a tensor and a row vector
c = a + b * \beta
c = a + b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
......
......@@ -209,4 +209,4 @@ XTensor IndexToOnehot(const XTensor & index, int size, float labelSmoothingP)
return onehot;
}
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -153,4 +153,4 @@ void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -45,8 +45,6 @@ void _Select(const XTensor * a, XTensor * c, int* index, int dim)
int stride = 1;
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
printf("\n%d %d\n", a->order - dim - 1,stride);
int copyTimes = 1;
for (int i = 0; i < dim; i++)
{
......@@ -94,12 +92,46 @@ void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim)
}
/*
c = select(a)
>> a - input tensor
>> index - the selected index
>> dim - the dimension along with which we do the job
<< return - the result of the generated tensor with selected data
*/
/*XTensor Select(const XTensor &a, int* index, int dim)
XTensor Select(const XTensor &a, XTensor &index, int dim)
{
int order = a.order;
int * dimSize = new int[order];
}*/
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
for (int i = 0; i < a.order; i++) {
if (i == dim) {
dimSize[i] = index.dimSize[0];
}
else
dimSize[i] = a.dimSize[i];
}
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
/* call _SelectRange function */
_Select(&a, &c, &index, dim);
/* tensor connection */
if (a.enableGrad) {
XLink::MakeLink(&a, &index, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
}
/* destroy variables */
delete[] dimSize;
return c;
}
/*
generate a tensor with selected data in range[low,high] along the given dimension
......@@ -192,10 +224,12 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
_SelectRange(&a, &c, dim, low, high);
/* tensor connection */
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
}
/* destroy variables */
delete[] dimSize;
......
......@@ -36,7 +36,7 @@ void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim);
generate a tensor with selected data c = select(a) (returna a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Select(const XTensor &a, XTensor &indexCPU);
XTensor Select(const XTensor &a, XTensor &index, int dim);
/*
generate a tensor with selected data in range[low,high] along the given dimension
......
......@@ -78,7 +78,7 @@ void _funcName(const XTensor * a, XTensor * b, T num)
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((_IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
......@@ -113,7 +113,7 @@ void _funcName(const XTensor * a, XTensor * b, T num)
if (a->devID >= 0) { \
ShowNTErrors("No GPU devices support!") \
} \
CheckNTErrors((_IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
......@@ -170,8 +170,8 @@ XTensor funcName(const XTensor &a, T num)
_funcName(&a, &b, num); \
if(a.enableGrad){ \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
} \
XLink::AddParamToHead(&b, num); \
return b; \
} \
template XTensor funcName<int>(const XTensor&, int); \
......@@ -182,8 +182,8 @@ template XTensor funcName<double>(const XTensor&, double);
template<class T> \
void funcName(const XTensor &a, XTensor &b, T num) \
{ \
if (!b.isInit || !IsSameShaped(a, b)) { \
InitTensorV2(&b, &a); \
if (!b.isInit || !IsSameShaped(a, b)) { \
InitTensorV2(&b, &a); \
} \
_funcName(&a, &b, num); \
if (a.enableGrad) { \
......
......@@ -37,7 +37,7 @@ set each entry to its clip value (CUDA Kernel)
>> size - size of the data array
*/
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......
......@@ -33,28 +33,6 @@ gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
*/
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
{
int * tgtIndex = new int[indexSize];
for(int i = 0; i < indexSize; i++)
tgtIndex[i] = i;
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
delete[] tgtIndex;
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - index of the source sub-tensors
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
......@@ -143,7 +121,10 @@ XTensor Gather(XTensor &s, XTensor &index)
_Gather(&s, &t, &index);
/* tensor connection */
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
if (s.enableGrad)
{
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
}
if(index.order > 1) {
int * dims = new int[index.order + 1];
......
......@@ -75,7 +75,6 @@ gather indexed sub-tensors(cuda version)
>> stride - stride of a data block
>> strideNum - strideNum of a data block
>> blockNum - block size of data
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
......
......@@ -27,9 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors accoding to the dimension */
......
......@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -416,4 +416,4 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -203,8 +203,11 @@ XTensor funcName(const XTensor & input, int dim)
funcOp(&input, &output, dim); \
\
/* tensor connection */ \
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX); \
XLink::AddParamToHeadInt(&output, dim); \
if(input.enableGrad) \
{ \
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX); \
XLink::AddParamToHeadInt(&output, dim); \
} \
\
/* destroy variables */ \
delete[] dimSize; \
......
......@@ -742,7 +742,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
else if (stride != 1 && stride * blockNum > 4096) {
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
......
......@@ -20,7 +20,7 @@
*/
#include "VectorBuffer.h"
//#include "math.h"
#include "math.h"
namespace nts {
/* data size for each buffer */
int VectorBuffer::size()
......@@ -172,7 +172,6 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
for (int i = 0; i != a.size(); i++) {
this->values[i] = MIN(a[i], this->values[i]);
printf("runhere");
}
return *this;
}
......
......@@ -19,7 +19,6 @@
* $Created by: ZHANG Yuhao (email: zhangyuhao@stu.neu.edu.cn) 2019-07-23
*/
//#include <cstring>
#include "../../XGlobal.h"
namespace nts {
......
......@@ -828,7 +828,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
else if (k < 22) workerNum = 128;
else if (k < 44) workerNum = 64;
else workerNum = 32;
int cudaGrids[3];
int cudaBlocks[3];
......
......@@ -74,7 +74,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
int blockSize = 1;
int blockNum = 1;
for (int i = leadDim + 1; i < y->order; i++)
for (int i = leadDim + 1; i < x->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
......@@ -74,7 +74,7 @@ bool Test()
wrong = !TestSumDim() || wrong;
wrong = !TestTan() || wrong;
wrong = !TestTranspose() || wrong;
//wrong = !TestTopK() || wrong;
wrong = !TestTopK() || wrong;
wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论