Commit e5a709dc by xiaotong

add Transpose and buf fixes

parent e84e725e
......@@ -30,11 +30,16 @@
//#include <stdlib.h>
//#include <crtdbg.h>
void TransposeTest();
using namespace nts;
using namespace samplefnnlm;
int main( int argc, const char ** argv )
{
//TransposeTest();
//return 0;
if(argc > 1 && !strcmp(argv[1], "-test"))
1;//Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
......@@ -81,3 +86,61 @@ int main( int argc, const char ** argv )
return 0;
}
/*
timing test of _Split and _Merge
Runs each operation "loops" times on a (B, N, H) tensor and a
(K, B, N, H/K) tensor and prints two timings per operation to stderr:
  - the difference of GetClock() values taken on the host
  - the time between two CUDA events (milliseconds, as reported by
    cudaEventElapsedTime)
NOTE(review): the host clock is read BEFORE the event synchronization, so
if _Split/_Merge return before the device work is finished, the host
figures do not include the remaining device time - confirm that this is
the intended measurement.
*/
void TransposeTest()
{
    /* not referenced below; kept because constructing it may matter
       - NOTE(review): presumably a memory pool on device 0, confirm */
    XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
    //XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);

    XTensor x;
    XTensor y;

    int loops = 2000;

    int B = 3 * 2 * 4;
    int K = 8 * 1;
    int N = 50;
    int H = 512 * 4;

    /* the last argument is presumably the device id (GPU 0) - TODO confirm */
    InitTensor3D(&x, B, N, H, X_FLOAT, 0);
    InitTensor4D(&y, K, B, N, H/K, X_FLOAT, 0);

    cudaEvent_t ctime0;
    cudaEvent_t ctime1;
    cudaEvent_t ctime2;
    cudaEvent_t ctime3;

    float elapsedSplit = 0.0f;
    float elapsedMerge = 0.0f;

    cudaEventCreate(&ctime0);
    cudaEventCreate(&ctime1);
    cudaEventCreate(&ctime2);
    cudaEventCreate(&ctime3);

    /* split: x => y */
    cudaEventRecord(ctime0, 0);

    double time0 = GetClock();
    for(int i = 0; i < loops; i++)
        _Split(&x, &y, 2, K);
    double time1 = GetClock();

    cudaEventRecord(ctime1, 0);
    cudaEventSynchronize(ctime1);
    cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);

    /* merge: y => x */
    cudaEventRecord(ctime2, 0);

    double time2 = GetClock();
    for(int i = 0; i < loops; i++)
        _Merge(&y, &x, 3);
    double time3 = GetClock();

    cudaEventRecord(ctime3, 0);
    cudaEventSynchronize(ctime3);
    cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);

    fprintf(stderr, "split:%f merge:%f\n", time1 - time0, time3 - time2);
    fprintf(stderr, "split:%f merge:%f\n", elapsedSplit, elapsedMerge);

    /* release the events; they were previously leaked on every call */
    cudaEventDestroy(ctime0);
    cudaEventDestroy(ctime1);
    cudaEventDestroy(ctime2);
    cudaEventDestroy(ctime3);
}
......@@ -29,6 +29,7 @@
#include "XTensor.h"
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -37,6 +38,7 @@
using namespace nts;
void SmallTest();
void TransposeTest();
int main( int argc, const char ** argv )
{
......@@ -92,3 +94,35 @@ void SmallTest()
c.Dump(stderr, "c:");
d.Dump(stderr, "d:");
}
/*
small test of _Transpose
Builds a 2 x 3 x 4 x 5 tensor a filled with 0, 1, 2, ..., builds b with
the sizes of dimensions I and J exchanged, transposes a into b and dumps
b to stderr.
*/
void TransposeTest()
{
    XTensor a;
    XTensor b;

    /* the two dimensions to be swapped */
    int I = 2;
    int J = 3;

    InitTensor4D(&a, 2, 3, 4, 5);

    /* shape of b = shape of a with entries I and J exchanged */
    int * dims = new int[a.order];
    memcpy(dims, a.dimSize, sizeof(int) * a.order);
    dims[I] = a.dimSize[J];
    dims[J] = a.dimSize[I];

    /* use a.order (not a literal) so the order always matches the size of dims */
    InitTensor(&b, a.order, dims);

    a.SetZeroAll();
    b.SetZeroAll();

    /* a[k] = k in the order SetData stores the values */
    float * data = new float[a.unitNum];
    for(int i = 0; i < a.unitNum; i++)
        data[i] = (float)i;

    a.SetData(data, a.unitNum, 0);

    _Transpose(&a, &b, I, J);
    b.Dump(stderr, "b:");

    /* dims was previously never released */
    delete[] dims;
    delete[] data;
}
......@@ -405,13 +405,10 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
if(n <= 0 || m <= 0 || devID >= nGPU)
if(n <= 0 || m <= 0)
return 1;
if(devID < 0){
XPRINT(0, stderr, "WARNING! You are calling the grid and block size computation function on a CPU!");
return 0;
}
CheckNTErrors(devID >= 0 && devID < nGPU, "Invalid GPU device id!");
#ifdef USE_CUDA
......
......@@ -181,7 +181,10 @@ void XMem::Free(int myDevID, void * mem)
else{
#ifdef USE_CUDA
SetDevice(myDevID);
CheckNTErrors(cudaFree((char*)mem) == cudaSuccess, "Cannot free the memory.");
cudaError_t error = cudaFree((char*)mem);
if(error != cudaSuccess){
ShowNTErrors("Cannot free the memory.");
}
#else
ShowNTErrors("Please specify USE_CUDA for compiling this program.");
#endif
......
......@@ -208,6 +208,9 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
#ifdef USE_CUDA
void XMemCopyAsync(void * t, int devIDT, const void * s, int devIDS, size_t size, cudaStream_t stream, int streamDevID)
{
if(t == s)
return;
int devIDBackup = -1;
if(streamDevID >= 0 && (devIDT >= 0 || devIDS >= 0)){
CheckNTErrors((cudaGetDevice(&devIDBackup) == cudaSuccess), "Cannot get GPU device id!");
......@@ -220,17 +223,23 @@ void XMemCopyAsync(void * t, int devIDT, const void * s, int devIDS, size_t size
return;
}
else if(devIDT >= 0 && devIDS < 0){
CheckNTErrors((cudaMemcpyAsync(t, s, size, cudaMemcpyHostToDevice, stream) == cudaSuccess),
"cudaMemcpyAsync error (cudaMemcpyHostToDevice)");
cudaError_t error = cudaMemcpyAsync(t, s, size, cudaMemcpyHostToDevice, stream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpyAsync error (cudaMemcpyHostToDevice)");
}
}
else if(devIDT < 0 && devIDS >= 0){
CheckNTErrors((cudaMemcpyAsync(t, s, size, cudaMemcpyDeviceToHost, stream) == cudaSuccess),
"cudaMemcpyAsync error (cudaMemcpyDeviceToHost)");
cudaError_t error = cudaMemcpyAsync(t, s, size, cudaMemcpyDeviceToHost, stream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpyAsync error (cudaMemcpyDeviceToHost)");
}
}
else{
//if(devIDT == devIDS){
CheckNTErrors((cudaMemcpyAsync(t, s, size, cudaMemcpyDeviceToDevice, stream) == cudaSuccess),
"cudaMemcpyAsync error (cudaMemcpyDeviceToDevice)");
cudaError_t error = cudaMemcpyAsync(t, s, size, cudaMemcpyDeviceToDevice, stream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpyAsync error (cudaMemcpyDeviceToDevice)");
}
//}
/*else{
CheckNTErrors((cudaMemcpyPeerAsync(t, devIDT, s, devIDS, size, stream) == cudaSuccess),
......
......@@ -62,8 +62,12 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
"Unmatched tensor sizes!");
}
else if (i < leadingDimRDI){
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
"Unmatched tensor sizes!");
}
else if (i > leadingDimRDI) {
CheckNTErrors((s->dimSizeRDI[i - 1] == t->dimSizeRDI[i]),
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]),
"Unmatched tensor sizes!");
}
}
......@@ -126,7 +130,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
* It is extremely hot these days and I cannot sleep well. Fortunately we had
* good lunch of Steamed Cold Noodles. This made me feel much better!
*/
#include "Transpose.h"
#include "Merge.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor transposition of dimensions i and j
b = transposed(a)
*/
void _Transpose(const XTensor * a, XTensor * b, const int i, const int j)
{
CheckNTErrors(a && b, "Empty tensors");
CheckNTErrors(a->order == b->order, "Wrong tensor orders");
CheckNTErrors(a->unitNum == b->unitNum && a->unitSize == b->unitSize, "Wrong tensor sizes");
CheckNTErrors(a->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(a->order > j && j >= 0, "index of dimension is out of scope!");
for(int k = 0; k < a->order; k++){
if(k == i){
CheckNTErrors(a->dimSize[k] == b->dimSize[j], "Wrong dimension size in transposition");
}
else if(k == j){
CheckNTErrors(a->dimSize[k] == b->dimSize[i], "Wrong dimension size in transposition");
}
else{
CheckNTErrors(a->dimSize[k] == b->dimSize[k], "Wrong dimension size in transposition");
}
}
if(i == j){
XMemCopy(b->data, b->devID, a->data, a->devID, b->unitNum * b->unitSize);
}
else{
int I = MIN(i, j);
int J = MAX(i, j);
int * dims = new int[a->order + 1];
for(int k = 0; k <= J; k++)
dims[k] = a->dimSize[k];
dims[J + 1] = -1;
for(int k = J + 1; k < a->order; k++)
dims[k + 1] = a->dimSize[k];
/* reshape tensor a form (..., n_I, ..., n_J, ...) => (..., n_I, ..., n_J, 1, ...)*/
XTensor * aTMP = new XTensor(a->order + 1, dims, a->dataType, a->denseRatio, a->devID, a->mem);
aTMP->data = a->data;
for(int k = 0; k < I; k++)
dims[k] = a->dimSize[k];
for(int k = I + 1; k <= J; k++)
dims[k - 1] = a->dimSize[k];
dims[J] = a->dimSize[I];
for(int k = J + 1; k < a->order; k++)
dims[k] = a->dimSize[k];
/* reshape tensor b form (..., m_I, ..., m_J, ...) => (..., m_J, m_I, ...) */
b->Reshape(b->order, dims);
/* tensor (..., n_I, ..., n_J, 1, ...) => tensor (..., m_J, m_I, ...) */
_Merge(aTMP, b, J + 1, I);
memcpy(dims, a->dimSize, sizeof(int) * a->order);
dims[I] = a->dimSize[J];
dims[J] = a->dimSize[I];
/* reshape tensor b form (..., m_J, m_I, ...) => (..., m_J, ..., m_I, ...) => */
b->Reshape(b->order, dims);
aTMP->data = NULL;
delete[] dims;
delete aTMP;
}
}
}
......@@ -27,27 +27,25 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define transpose _Transpose_
/*
generate a transposed 1D/2D tensor
tensor transposition of dimensions i and j
b = transposed(a)
*/
void _Transpose(XTensor * a, XTensor * b);
void _Transpose(const XTensor * a, XTensor * b, const int i, const int j);
/*
transpose a 1D/2D tensor (do it on site).
tensor transposition of dimensions i and j (do this on site)
keep the result in the input tensor and return nothing.
a = transposed(a)
*/
void _TransposeMe(XTensor * a);
void _TransposeMe(XTensor * a, const int i, const int j);
/*
make a transposed 1D/2D tensor (return a XTensor structure).
tensor transposition of dimensions i and j (return a XTensor structure).
make a new tensor to keep the result and return it.
b = transposed(a)
*/
XTensor Transpose(XTensor &a);
XTensor Transpose(const XTensor &a, const int i, const int j);
} // namespace nts(NiuTrans.Tensor)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论