Commit dd7a67bc by xiaotong

remove XStream

parent 158ccc4a
......@@ -42,7 +42,6 @@ XDevManager GDevs;
/* constructor */
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
......@@ -141,8 +140,6 @@ void XDevice::Init(int myDevID)
}
else
sprintf(name2, "GPU-%d %s", devID, name);
stream = new XStream(0, devID);
#endif
}
......@@ -176,10 +173,6 @@ void XDevice::Clear()
curandDestroyGenerator(gen);
isGenReady = false;
}
if (stream != NULL) {
delete stream;
stream = NULL;
}
#endif
isInitialized = false;
}
......@@ -227,17 +220,6 @@ cublasHandle_t * XDevice::GetCublasHandle()
return &cublasHandle;
}
/* get the cuda stream */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
}
#endif // USE_CUDA
/* switch to a device */
......@@ -312,13 +294,6 @@ void XDevice::SetFastFlagsAllDevices()
#endif
}
/* delete the default stream for the device */
void XDevice::DelDeviceStream()
{
if(stream != NULL)
delete stream;
}
/* constructor */
XDevManager::XDevManager()
{
......@@ -391,14 +366,6 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle();
}
/* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
return GPUs[devID].GetCudaStream();
}
#endif
/*
......@@ -620,16 +587,5 @@ char * XDevManager::GetDevString(int devID)
}
}
/* delete the streams for all devices */
void XDevManager::DelDeviceStream()
{
for(int i = 0; i < GDevs.nCPU; i++) {
GDevs.CPUs[i].DelDeviceStream();
}
for(int i = 0; i < GDevs.nGPU; i++) {
GDevs.GPUs[i].DelDeviceStream();
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -25,7 +25,6 @@
#define __XDEVICE_H__
#include "XThread.h"
#include "XStream.h"
#ifdef USE_CUDA
......@@ -97,9 +96,6 @@ public:
/* specify whether Unified Virtual Address Space (UVA) is supported */
bool isUVASupported;
/* default stream for the device */
XStream * stream;
/* seed for random number generation */
int seed;
......@@ -140,9 +136,6 @@ public:
#ifdef USE_CUDA
/* get cublas handle */
cublasHandle_t * GetCublasHandle();
/* get the cuda stream */
cudaStream_t * GetCudaStream();
#endif
/* switch to a device */
......@@ -164,9 +157,6 @@ public:
/* reset cuda flag for more efficient cuda execution (all devices) */
static
void SetFastFlagsAllDevices();
/* delete the default stream for the device (call it before deleting the XDevice) */
void DelDeviceStream();
};
/*
......@@ -206,9 +196,6 @@ public:
#ifdef USE_CUDA
/* get the handle of GPU */
cublasHandle_t * GetCudaHandle(const int devID);
/* get the cuda stream */
cudaStream_t * GetCudaStream(const int devID);
#endif
/* get grid and block sizes that maximize potential */
......@@ -228,10 +215,6 @@ public:
/* get the device information in string */
char * GetDevString(int devID);
/* delete the streams for all devices */
static
void DelDeviceStream();
};
/* managing the devices */
......
......@@ -70,9 +70,6 @@ XQueue::XQueue(int mySize)
jobDequeuerArgs = new TensorList(1);
jobDequeuerBreak = false;
runningJobCount = 0;
jobStream = NULL;
jobStream1 = NULL;
jobStream2 = NULL;
MUTEX_INIT(enqueueMutex);
MUTEX_INIT(dequeueMutex);
......@@ -85,9 +82,6 @@ XQueue::~XQueue()
{
delete[] queue;
delete jobDequeuerArgs;
delete jobStream;
delete jobStream1;
delete jobStream2;
//if(isJobQueue)
// StopJobConsumer();
......@@ -160,19 +154,6 @@ void XQueue::WaitForEmptyJobQueue()
while(runningJobCount > 0){
XSleep(10);
}
if(jobStream != NULL){
CheckNTErrors((jobStream->IsFinished()), "Unfinished jobs remain!");
jobStream->Clear();
}
if(jobStream1 != NULL){
CheckNTErrors((jobStream1->IsFinished()), "Unfinished jobs remain!");
jobStream1->Clear();
}
if(jobStream2 != NULL){
CheckNTErrors((jobStream2->IsFinished()), "Unfinished jobs remain!");
jobStream2->Clear();
}
}
int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
......@@ -268,31 +249,4 @@ bool XQueue::GetJobBreak()
return jobDequeuerBreak;
}
/* get job stream */
XStream * XQueue::GetJobStream(int n)
{
if(n == 0)
return jobStream;
else if(n == 1)
return jobStream1;
else if(n == 2)
return jobStream2;
else{
ShowNTErrors("invalid stream id!");
}
return NULL;
}
/* make job streams */
void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
{
if(devID != INVALID_DEVICE_ID)
jobStream = new XStream(0, devID);
if(devID1 != INVALID_DEVICE_ID)
jobStream1 = new XStream(0, devID1);
if(devID2 != INVALID_DEVICE_ID)
jobStream2 = new XStream(0, devID2);
}
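For reference, the deleted queue API was driven roughly like this; a hypothetical usage sketch (device ids assumed, CUDA build assumed):

/* hypothetical usage of the removed job-stream API, for reference only */
XQueue q;
q.MakeJobStreams(0, 1);           /* streams on devices 0 and 1 */
XStream * js = q.GetJobStream(1); /* the stream bound to device 1 */
js->StreamSynchronize();          /* wait for its queued jobs */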
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -33,7 +33,6 @@
#include "XGlobal.h"
#include "XThread.h"
#include "XStream.h"
#include "XDevice.h"
#include "XList.h"
......@@ -110,11 +109,6 @@ private:
/* running job count */
int runningJobCount;
/* job streams (we think that three streams are enough :)) */
XStream * jobStream;
XStream * jobStream1;
XStream * jobStream2;
public:
/* constructor */
XQueue(int mySize = MAX_QUEUE_SIZE);
......@@ -149,12 +143,6 @@ public:
/* get the break flag */
bool GetJobBreak();
/* get job stream */
XStream * GetJobStream(int n = 0);
/* make job streams */
void MakeJobStreams(int devID = INVALID_DEVICE_ID, int devID1 = INVALID_DEVICE_ID, int devID2 = INVALID_DEVICE_ID);
};
} /* end of the nts (NiuTrans.Tensor) namespace */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., running jobs in different streams
* for asynchronous GPU execution.
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include "stdio.h"
#include "stdlib.h"
#include "XGlobal.h"
#include "XStream.h"
#include "XDevice.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by
overlapping jobs.
*/
/* constructor */
XStream::XStream(int priority, int myDevID, int myMaxEventNum)
{
devID = myDevID;
#ifdef USE_CUDA
if(myDevID >= 0){
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
events = new cudaEvent_t[myMaxEventNum];
XDevice::SetGPUDevice(backupDevID);
maxEventNum = myMaxEventNum;
usedEventNum = 0;
}
else{
maxEventNum = 0;
usedEventNum = 0;
}
#endif
Create(priority, devID);
}
/* destructor */
XStream::~XStream()
{
Destroy();
#ifdef USE_CUDA
delete[] events;
#endif
}
/* create the stream */
void XStream::Create(int priority, int myDevID)
{
if(myDevID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
//cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
CheckNTErrors((cudaStreamCreate(&stream) == cudaSuccess),
"cannot create the cuda stream!");
XDevice::SetGPUDevice(backupDevID);
#endif
devID = myDevID;
}
/* destroy the stream */
void XStream::Destroy()
{
if(devID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
cudaStreamDestroy(stream);
XDevice::SetGPUDevice(backupDevID);
Clear();
#endif
}
/* clear it */
void XStream::Clear()
{
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
for(int i = 0; i < usedEventNum; i++){
cudaEventDestroy(events[i]);
}
usedEventNum = 0;
XDevice::SetGPUDevice(backupDevID);
#endif
}
/* judge if all the jobs in the stream have finished */
bool XStream::IsFinished()
{
#ifdef USE_CUDA
/* cudaStreamQuery returns cudaSuccess once all queued work has completed */
return cudaStreamQuery(stream) == cudaSuccess;
#else
return true;
#endif
}
/* wait until all jobs in the stream have finished */
void XStream::StreamSynchronize()
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaStreamSynchronize(stream);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* synchronize the host thread with the device (legacy cudaThreadSynchronize) */
void XStream::ThreadSynchronize()
{
#ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif
}
/* wait until all jobs on the given device have finished */
void XStream::DeviceSynchronize(int devID)
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaDeviceSynchronize();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* make a dependency between two streams, i.e., the current stream must wait until the last job in another stream has finished */
void XStream::MakeDependency(XStream * precedingStream)
{
#ifdef USE_CUDA
cudaEvent_t * e = precedingStream->MakeEvent();
cudaEventRecord(*e, precedingStream->stream);
cudaStreamWaitEvent(stream, *e, 0);
#endif
}
/* get the stream */
#ifdef USE_CUDA
inline cudaStream_t * XStream::Get()
{
return &stream;
}
/* make an event */
inline cudaEvent_t * XStream::MakeEvent()
{
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
CheckNTErrors((usedEventNum < maxEventNum), "Too many events are required!");
cudaEvent_t * e = events + usedEventNum++;
cudaEventCreate(e);
XDevice::SetGPUDevice(backupDevID);
return e;
}
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
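For readers tracking this removal: the dependency mechanism that XStream::MakeDependency wrapped maps onto four CUDA runtime calls. A minimal sketch, assuming two valid streams s1 and s2 (hypothetical names, error checking omitted):

/* make s2 wait for all work queued in s1 so far; this mirrors what
   XStream::MakeDependency did with its event pool */
cudaEvent_t e;
cudaEventCreateWithFlags(&e, cudaEventDisableTiming); /* no timing needed */
cudaEventRecord(e, s1);        /* mark the current tail of s1 */
cudaStreamWaitEvent(s2, e, 0); /* s2 blocks until the event fires */
cudaEventDestroy(e);           /* cleanup is deferred until the event completes */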
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., running jobs in different streams
* for asynchronous GPU execution.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XSTREAM_H__
#define __XSTREAM_H__
/* the CUDA stuff */
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by
overlapping jobs.
*/
class XStream
{
public:
#ifdef USE_CUDA
/* the cuda stream */
cudaStream_t stream;
/* list of cuda events for synchronizing different streams */
cudaEvent_t * events;
/* max number of the events */
int maxEventNum;
/* number of used events */
int usedEventNum;
#else
/* virtual pointer */
void * stream;
#endif
/* device that holds the stream */
int devID;
public:
/* constructor */
XStream(int priority = 0, int devID = 0, int maxEventNum = MAX_CUDA_EVENT_NUM_IN_A_STREAM);
/* destructor */
~XStream();
/* create the stream */
void Create(int priority = 0, int devID = 0);
/* destroy the stream */
void Destroy();
/* clear it */
void Clear();
/* judge if all the jobs in the stream have finished */
bool IsFinished();
/* stream synchronize */
void StreamSynchronize();
/* thread synchronize */
static
void ThreadSynchronize();
/* device synchronize */
static
void DeviceSynchronize(int devID);
/* make a dependency between two streams, i.e., the current stream must wait until the last job in another stream has finished */
void MakeDependency(XStream * precedingStream);
#ifdef USE_CUDA
/* get the stream */
cudaStream_t * Get();
/* make an event */
cudaEvent_t * MakeEvent();
#endif
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
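For the record, the class deleted above was driven like this; a minimal usage sketch, assuming a CUDA build and a valid device 0:

/* hypothetical usage of the removed XStream class, for reference only */
XStream s(0 /* priority */, 0 /* devID */);
/* enqueue asynchronous work on *s.Get() via CUDA or cuBLAS calls, then: */
if(!s.IsFinished())
    s.StreamSynchronize(); /* block until the queued jobs complete */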
......@@ -772,10 +772,9 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
}
/*
a vector with all entries of 0
>> stream - stream for the job pipeline
a tensor with all entries of 0
*/
void XTensor::SetZeroAll(XStream* stream)
void XTensor::SetZeroAll()
{
if(data == NULL)
return;
......@@ -788,12 +787,7 @@ void XTensor::SetZeroAll(XStream* stream)
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
cudaMemset(data, 0, size);
cudaSetDevice(devIDBackup);
#endif
}
......@@ -807,13 +801,8 @@ void XTensor::SetZeroAll(XStream* stream)
#ifdef USE_CUDA
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
cudaSetDevice(devID);
cudaMemset(data, 0, unitNum * unitSize);
cudaSetDevice(devIDBackup);
#endif
}
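With the stream argument gone, SetZeroAll always issues a plain cudaMemset. A caller that still wants the zeroing to overlap with other work can manage its own stream; a hedged sketch, where x is a hypothetical XTensor*:

/* hypothetical caller-side replacement for SetZeroAll(stream) */
cudaSetDevice(x->devID); /* select the tensor's GPU */
cudaStream_t zs;
cudaStreamCreate(&zs);
cudaMemsetAsync(x->data, 0, x->unitNum * x->unitSize, zs);
/* ... enqueue work that depends on the zeroed buffer on zs ... */
cudaStreamSynchronize(zs);
cudaStreamDestroy(zs);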
......
......@@ -31,7 +31,6 @@
#include <math.h>
#include "XGlobal.h"
#include "XPRunner.h"
#include "XStream.h"
#include "XHeap.h"
#include "XList.h"
#include "XDataType.h"
......@@ -303,7 +302,7 @@ public:
MTYPE GetOffset3D(int d0, int d1, int d2) const;
/* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL);
void SetZeroAll();
/* set the tensor with a data array */
void SetData(const void * d, int num, int beg = 0);
......
......@@ -311,44 +311,6 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
#endif
}
void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
{
if (t == s)
return;
if (devIDT < 0 && devIDS < 0) {
for(int i = 0; i < n; i++)
memcpy((char*)t + tPitch * i, (char*)s + sPitch * i, mSize);
return;
}
#ifdef USE_CUDA
else{
CheckNTErrors(stream != NULL, "No stream found!");
cudaStream_t &cstream = stream->stream;
if (devIDT >= 0 && devIDS < 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
}
}
else if (devIDT < 0 && devIDS >= 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
}
else {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, cstream);
if (error != cudaSuccess) {
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
}
}
}
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
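Code that relied on the removed wrapper can call cudaMemcpy2DAsync directly; a minimal sketch for the device-to-device case, reusing the wrapper's parameter names and assuming a valid stream cs (the host/device cases differ only in the cudaMemcpyKind flag):

/* hypothetical direct replacement for one case of XMemCopy2DAsync */
cudaError_t err = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n,
                                    cudaMemcpyDeviceToDevice, cs);
CheckNTErrors(err == cudaSuccess, "cudaMemcpy2DAsync error (cudaMemcpyDeviceToDevice)");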
void * XMemAlloc(int devID, size_t size)
{
void * p = NULL;
......
......@@ -42,7 +42,6 @@ extern void XMemSet(void * p, int value, size_t size);
extern void XMemSet(int devID, void * p, int value, size_t size);
extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
extern void * XMemAlloc(int devID, size_t size);
extern void * XMemAllocOnDev(int devID, size_t size);
extern void XMemFree(int devID, void * p);
......
......@@ -42,12 +42,11 @@ where trans() return the transposed matrix if the flag is fired
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
>> stream - the stream for creating the job pipeline
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
......@@ -69,7 +68,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta);
return;
}
#endif
......
......@@ -119,11 +119,10 @@ where trans() return the transposed matrix if the flag is fired
>> c - where we put a*b
>> alpha - a coefficient
>> beta - another coefficient
>> stream - the stream for creating the job pipeline
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
XTensor * c, DTYPE alpha, DTYPE beta)
{
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
......@@ -152,10 +151,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
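After this change the GEMM runs on whatever stream the handle already carries, i.e., the default stream unless it was set elsewhere. If per-call streaming is ever needed again, it can be restored locally; a hedged sketch around the GEMM call:

/* hypothetical: bind the handle to a user stream for the GEMM only */
cudaStream_t gs;
cudaStreamCreate(&gs);
cublasSetStream(*handle, gs);
/* ... the cublas<t>gemm call goes here ... */
cublasSetStream(*handle, 0); /* restore the default stream */
cudaStreamSynchronize(gs);   /* wait before releasing the stream */
cudaStreamDestroy(gs);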
......
......@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#endif // USE_CUDA
......
......@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -32,9 +32,8 @@ copy s to t
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CopyValues(const XTensor * s, XTensor * t)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -55,7 +54,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
#ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0) {
_CudaCopyValues(s, t, stream);
_CudaCopyValues(s, t);
return;
}
#endif
......@@ -82,9 +81,8 @@ copy s to t
>> sLen - length of the segment
>> t - target
>> tBeg - beginning of the segment on the target side
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -108,13 +106,12 @@ void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t,
/*
copy s to t (rename _CopyValues)
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
>> s - source
>> t - target
*/
void CopyValues(const XTensor &s, XTensor &t, XStream * stream)
void CopyValues(const XTensor &s, XTensor &t)
{
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
}
/*
......@@ -122,16 +119,15 @@ copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copied tensor t
*/
XTensor CopyValues(const XTensor &s, XStream * stream)
XTensor CopyValues(const XTensor &s)
{
XTensor t(&s);
t.SetTMPFlag();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
/* tensor connection */
if (s.enableGrad) {
......
......@@ -32,10 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
copy all elements from a source tensor to a target tensor
>> s - source tensor
>> t - target tensor
>> stream - the stream for creating the job pipeline
*/
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CudaCopyValues(const XTensor * s, XTensor * t)
{
CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
......@@ -45,10 +44,7 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
/* dense -> dense */
if (!s->isSparse && !t->isSparse) {
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
}
/* dense -> sparse */
else if (!s->isSparse && t->isSparse &&
......@@ -72,11 +68,8 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
int num = s->unitNumNonZero;
int size = sizeof(int) + num * (s->unitSize + sizeof(int));
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, size);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, size, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, size);
t->unitNumNonZero = num;
}
else {
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CudaCopyValues(const XTensor * s, XTensor * t);
#endif // USE_CUDA
......
......@@ -27,19 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CopyValues(const XTensor * s, XTensor * t);
/* copy a segment of s to t */
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL);
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg);
/* copy s to t (rename _CopyValues) */
void CopyValues(const XTensor &s, XTensor &t, XStream * stream = NULL);
void CopyValues(const XTensor &s, XTensor &t);
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
XTensor CopyValues(const XTensor &s);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -96,25 +96,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[t->devID].stream;
for (int k = 0; k < splitNum; k++) {
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
else {
......@@ -321,27 +307,12 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[big->devID].stream;
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
/* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
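The synchronous loops above issue one blocking copy per split part. If the old overlap ever matters again for large splitNum, the removed STREAMED_MEMCPOPY path can be re-created with a bare cudaStream_t; a sketch using the first branch's variable names and assuming both tensors live on the same GPU:

/* hypothetical: queue all slice copies on one stream, synchronize once */
cudaStream_t cs;
cudaStreamCreate(&cs);
for (int k = 0; k < splitNum; k++)
    cudaMemcpy2DAsync((char*)t->data + k * tStep, tPitch,
                      (char*)s->data + k * sStep, sPitch,
                      mSize, n, cudaMemcpyDeviceToDevice, cs);
cudaStreamSynchronize(cs);
cudaStreamDestroy(cs);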
......