kill the unnecessary allocation on devices that are never used

197fac6d · xiaotong · 860980dd · 197fac6d · 197fac6d
Commit 197fac6d authored Sep 14, 2018 by xiaotong
--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
@@ -41,6 +41,7 @@ XDevManager GDevs;
 XDevice::XDevice()
 {
    stream = NULL;
+    isInitialized = false;
    Clear();
 #ifdef USE_CUDA
@@ -126,6 +127,7 @@ void XDevice::Init(int myDevID)
 #endif
    }
+    isInitialized = true;
 }
 /* clear it */
@@ -152,6 +154,9 @@ void XDevice::Clear()
 /* get cublas handle */
 cublasHandle_t * XDevice::GetCublasHandle()
 {
+    if (!isInitialized)
+        Init(devID);
    if(!isHandleReady){
        MUTEX_LOCK(cublasMutex);
        int devIDBackup = 0;
@@ -169,6 +174,9 @@ cublasHandle_t * XDevice::GetCublasHandle()
 /* get the stream of cuda */
 cudaStream_t * XDevice::GetCudaStream()
 {
+    if (!isInitialized)
+        Init(devID);
    CheckNTErrors(stream != NULL, "the stream is not initialized!");
    return &stream->stream;
@@ -279,33 +287,13 @@ void XDevManager::Init()
        exit(1);
    }
-    cudaDeviceProp prop[64];
    for(int i = 0; i < GPUCount; i++){
-        GPUs[i].Init(i);
+        GPUs[i].devID = i;
-        cudaGetDeviceProperties(&prop[i], i);
+        //GPUs[i].Init(i);
    }
-#ifdef USA_CUDA_P2P
-    for(int i = 0; i < GPUCount; i++){
-        cudaSetDevice(i);
-        for(int j = 0; j < GPUCount; j++){
-            if(i == j)
-                continue;
-            int access;
-            cudaDeviceCanAccessPeer(&access, i, j);
-            bool hasUVA = (prop[i].unifiedAddressing && prop[j].unifiedAddressing);
-            fprintf(stderr, "device %d -> device %d access:%d UVA:%d\n", i, j, access, hasUVA ? 1 : 0);
-            if(access != 0){
-                CheckNTErrors((hasUVA == true), "at least one GPU does not support UVA.")
-                CheckNTErrors((cudaDeviceEnablePeerAccess(j, 0)==cudaSuccess), "cannot set cuda p2t mode!");
-            }
-        }
-    }
-#endif
 #endif
    nGPU = GPUCount;
 }
@@ -351,6 +339,9 @@ into blocks
 */
 int XDevManager::GetCudaThread(const int devID, const int n, int * gridSize, int * blockSize)
 {
+    if (!GPUs[devID].isInitialized)
+        GPUs[devID].Init(devID);
    memset(gridSize, 0, sizeof(int) * 3);
    memset(blockSize, 0, sizeof(int) * 3);
@@ -402,6 +393,9 @@ into blocks
 */
 int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int nLimit, int * gridSize, int * blockSize)
 {
+    if (!GPUs[devID].isInitialized)
+        GPUs[devID].Init(devID);
    memset(gridSize, 0, sizeof(int) * 3);
    memset(blockSize, 0, sizeof(int) * 3);

--- a/source/tensor/XDevice.h
+++ b/source/tensor/XDevice.h
@@ -67,6 +67,9 @@ public:
    /* warp size of an (Navida) GPU */
    int GPUWarpSize;
+    /* indicates whether the device class has been initialized */
+    bool isInitialized;
    /* 
    max grid size (or number of blocks) of an (Navida) GPU 
    NOTE: the grid size is alone with three dimensions (x, y, z)