Commit 4a3a47f1 by hello

Merge with branch xiaotong.

parent 69589a2c
......@@ -162,6 +162,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
}
else{
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
}
......
......@@ -21,8 +21,8 @@
#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "submodel/LayerNorm.h"
#include "submodel/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace nmt
......
......@@ -21,8 +21,8 @@
#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "submodel/LayerNorm.h"
#include "submodel/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace nmt
......
......@@ -23,10 +23,10 @@
#define __ENCODER_H__
#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "submodel/FNN.h"
#include "submodel/Attention.h"
#include "submodel/Embedding.h"
#include "submodel/LayerNorm.h"
#include "../../network/XNet.h"
using namespace nts;
......
......@@ -24,10 +24,10 @@
#include "Encoder.h"
#include "Decoder.h"
#include "module/FNN.h"
#include "module/Output.h"
#include "submodel/FNN.h"
#include "submodel/Output.h"
#include "Utility.h"
#include "module/Attention.h"
#include "submodel/Attention.h"
namespace nmt
{
......
......@@ -98,6 +98,21 @@ public:
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* load the samples into the buffer (a list) */
bool LoadBatchToBuf(XList * buf);
/* load the samples into tensors from the buffer */
static
bool LoadBatch(XList * buf,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID,
int &wc, int &sc);
/* release the samples in a buffer */
static
void ClearSamples(XList * buf);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
......
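A minimal sketch of how the new buffer-based loading interface might be driven. Only the three declarations above come from the patch; the enclosing class name (DataSetBase), the dataset object, the tensors and the batch sizes below are illustrative assumptions:

    /* illustrative driver for the buffer-based loading interface (not part of the patch) */
    XList buf;
    dataset.LoadBatchToBuf(&buf);                 /* fill the buffer with samples */

    XTensor batchEnc, paddingEnc, batchDec, paddingDec, label;
    int wc = 0, sc = 0;                           /* word count and sentence count of the batch */

    /* build one batch of tensors from the buffered samples on device 0 */
    DataSetBase::LoadBatch(&buf, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
                           /*minSentBatch=*/8, /*batchSize=*/4096, /*devID=*/0, wc, sc);

    /* ... forward/backward with the batch ... */

    DataSetBase::ClearSamples(&buf);              /* release the buffered samples */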
......@@ -22,7 +22,7 @@
#include <iostream>
#include "Predictor.h"
#include "../module/NNUtil.h"
#include "../submodel/NNUtil.h"
using namespace nts;
......
......@@ -42,7 +42,6 @@ XDevManager GDevs;
/* constructor */
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
......@@ -141,8 +140,6 @@ void XDevice::Init(int myDevID)
}
else
sprintf(name2, "GPU-%d %s", devID, name);
stream = new XStream(0, devID);
#endif
}
......@@ -176,10 +173,6 @@ void XDevice::Clear()
curandDestroyGenerator(gen);
isGenReady = false;
}
if (stream != NULL) {
delete stream;
stream = NULL;
}
#endif
isInitialized = false;
}
......@@ -227,17 +220,6 @@ cublasHandle_t * XDevice::GetCublasHandle()
return &cublasHandle;
}
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
}
#endif // USE_CUDA
/* switch to a device */
......@@ -286,6 +268,28 @@ int XDevice::GetGPUDevice()
#endif
}
/*
switch to a device (CPU or GPU)
>> devID - device id
*/
void XDevice::SetDevice(int devID)
{
if(devID >= 0)
SetGPUDevice(devID);
}
/*
switch to a device (CPU or GPU) with a backup of the device id
>> devID - device id
>> backupDevID - backup of the device id
*/
void XDevice::SetDevice(int devID, int &backupDevID)
{
backupDevID = GetGPUDevice();
if (devID >= 0)
SetGPUDevice(devID);
}
/* reset cuda flag for more efficient cuda execution. It should be called after "SetGPUDevice" when
no GPU context has been established. */
void XDevice::SetFastFlags()
......@@ -312,13 +316,6 @@ void XDevice::SetFastFlagsAllDevices()
#endif
}
/* delete the default stream for the device */
void XDevice::DelDeviceStream()
{
if(stream != NULL)
delete stream;
}
/* constructor */
XDevManager::XDevManager()
{
......@@ -391,14 +388,6 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle();
}
/* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
return GPUs[devID].GetCudaStream();
}
#endif
/*
......@@ -620,16 +609,5 @@ char * XDevManager::GetDevString(int devID)
}
}
/* delete the streams for all devices */
void XDevManager::DelDeviceStream()
{
for(int i = 0; i < GDevs.nCPU; i++) {
GDevs.CPUs[i].DelDeviceStream();
}
for(int i = 0; i < GDevs.nGPU; i++) {
GDevs.GPUs[i].DelDeviceStream();
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -25,7 +25,6 @@
#define __XDEVICE_H__
#include "XThread.h"
#include "XStream.h"
#ifdef USE_CUDA
......@@ -97,9 +96,6 @@ public:
/* specify whether Unified Virtual Address Space (UVA) is supported */
bool isUVASupported;
/* default stream for the device */
XStream * stream;
/* seed for random number generation */
int seed;
......@@ -140,12 +136,9 @@ public:
#ifdef USE_CUDA
/* get cublas handle */
cublasHandle_t * GetCublasHandle();
/* get the stream of cuda */
cudaStream_t * GetCudaStream();
#endif
/* switch to a device */
/* switch to a GPU device */
static
void SetGPUDevice(int devID);
......@@ -153,10 +146,18 @@ public:
static
void SetGPUDeviceFast(int devID);
/* switch to a get current dev */
/* get current dev */
static
int GetGPUDevice();
/* switch to a device (CPU or GPU) */
static
void SetDevice(int devID);
/* switch to a device (CPU or GPU) with a backup of the device id */
static
void SetDevice(int devID, int &backupDevID);
/* reset cuda flag for more efficient cuda execution */
static
void SetFastFlags();
......@@ -164,9 +165,6 @@ public:
/* reset cuda flag for more efficient cuda execution (all devices) */
static
void SetFastFlagsAllDevices();
/* delete the default stream for the device (call it before deleting the XDevice) */
void DelDeviceStream();
};
/*
......@@ -206,9 +204,6 @@ public:
#ifdef USE_CUDA
/* get the handle of GPU */
cublasHandle_t * GetCudaHandle(const int devID);
/* get the stream of cuda */
cudaStream_t * GetCudaStream(const int devID);
#endif
/* get grid and block sizes that max potential */
......@@ -228,10 +223,6 @@ public:
/* get the device information in string */
char * GetDevString(int devID);
/* delete the streams for all devices */
static
void DelDeviceStream();
};
/* managing the devices */
......
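The new SetDevice pair gives a switch-and-restore idiom for temporarily running on a given device (both calls are no-ops for a negative, i.e. CPU, device id). A minimal sketch, with the actual work elided:

    int devIDBackup = -1;
    XDevice::SetDevice(devID, devIDBackup);   /* remember the current GPU and switch to devID (if >= 0) */

    /* ... run the job on devID ... */

    XDevice::SetDevice(devIDBackup);          /* switch back to the previous device */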
......@@ -26,8 +26,6 @@
#ifndef __XLINK_H__
#define __XLINK_H__
#include "XGlobal.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */
......
......@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round
*/
void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
{
if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n");
......@@ -195,13 +195,12 @@ void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepT
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */
volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */
XThread * thread = threads + availableThreads[i];
thread->argv = args;
thread->function = function;
thread->SetFunc(function, args);
MUTEX_LOCK(thread->workingMutex);
thread->working = 1;
......
......@@ -106,7 +106,7 @@ public:
void KillThreads();
/* run a set of jobs in parallel */
void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */
int GetJobNum(int size);
......
......@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode()
{
job = NULL;
args = new TensorList(1);
args = new XList(1);
}
/* de-constructor */
......@@ -67,12 +67,9 @@ XQueue::XQueue(int mySize)
head = 0;
tail = 0;
isJobQueue = false;
jobDequeuerArgs = new TensorList(1);
jobDequeuerArgs = new XList(1);
jobDequeuerBreak = false;
runningJobCount = 0;
jobStream = NULL;
jobStream1 = NULL;
jobStream2 = NULL;
MUTEX_INIT(enqueueMutex);
MUTEX_INIT(dequeueMutex);
......@@ -85,9 +82,6 @@ XQueue::~XQueue()
{
delete[] queue;
delete jobDequeuerArgs;
delete jobStream;
delete jobStream1;
delete jobStream2;
//if(isJobQueue)
// StopJobConsumer();
......@@ -160,19 +154,6 @@ void XQueue::WaitForEmptyJobQueue()
while(runningJobCount > 0){
XSleep(10);
}
if(jobStream != NULL){
CheckNTErrors((jobStream->IsFinished()), "None fineished jobs remain");
jobStream->Clear();
}
if(jobStream1 != NULL){
CheckNTErrors((jobStream1->IsFinished()), "None fineished jobs remain");
jobStream1->Clear();
}
if(jobStream2 != NULL){
CheckNTErrors((jobStream2->IsFinished()), "None fineished jobs remain");
jobStream2->Clear();
}
}
int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
......@@ -189,12 +170,11 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true;
jobDequeuerArgs->Clear();
// warning: this may cause unknown error
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
/* warning: this may cause unknown errors */
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (devids + jobDevID) : &cpuid);
jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs;
jobDequeuer.SetFunc((TFunction)DequeueJobs, jobDequeuerArgs);
jobDequeuer.Start();
jobDequeuer.LetItGo();
......@@ -213,7 +193,7 @@ void XQueue::StopJobConsumer()
}
/* add a job item to process */
void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
void XQueue::EnqueueJob(void * job, XList * jobArgs)
{
MUTEX_LOCK(jobQueueMutex);
runningJobCount++;
......@@ -227,17 +207,15 @@ void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
}
/* job item consumer */
void XQueue::DequeueJobs(TensorList * args)
void XQueue::DequeueJobs(XList * args)
{
CheckNTErrors((args->count == 2), "Illegal arguments!");
XQueue * q = (XQueue*)args->GetItem(0);
int devID = *(int*)args->GetItem(1);
int devIDBackup = XDevice::GetGPUDevice();
if(devID >= 0)
XDevice::SetGPUDevice(devID);
int devIDBackup = -1;
XDevice::SetDevice(devID, devIDBackup);
while(1){
JobQueueNode * node = (JobQueueNode*)q->Dequeue();
......@@ -258,8 +236,7 @@ void XQueue::DequeueJobs(TensorList * args)
}
if(devID >= 0)
XDevice::SetGPUDevice(devIDBackup);
XDevice::SetDevice(devIDBackup);
}
/* get the break flag */
......@@ -268,31 +245,10 @@ bool XQueue::GetJobBreak()
return jobDequeuerBreak;
}
/* get job stream */
XStream * XQueue::GetJobStream(int n)
{
if(n == 0)
return jobStream;
else if(n == 1)
return jobStream1;
else if(n == 2)
return jobStream2;
else{
ShowNTErrors("invalid stream id!");
}
return NULL;
}
/* make job streams */
void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
/* get the number of jobs */
int XQueue::GetJobNum()
{
if(devID != INVALID_DEVICE_ID)
jobStream = new XStream(0, devID);
if(devID1 != INVALID_DEVICE_ID)
jobStream1 = new XStream(0, devID1);
if(devID2 != INVALID_DEVICE_ID)
jobStream2 = new XStream(0, devID2);
return runningJobCount;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -33,7 +33,6 @@
#include "XGlobal.h"
#include "XThread.h"
#include "XStream.h"
#include "XDevice.h"
#include "XList.h"
......@@ -52,7 +51,7 @@ public:
void * job;
/* arguments of the job */
TensorList * args;
XList * args;
public:
/* constructor */
......@@ -102,7 +101,7 @@ private:
XThread jobDequeuer;
/* argument list of jobDequeuer */
TensorList * jobDequeuerArgs;
XList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */
bool jobDequeuerBreak;
......@@ -110,11 +109,6 @@ private:
/* running job count */
int runningJobCount;
/* job streams (we think that three streams is enough :)) */
XStream * jobStream;
XStream * jobStream1;
XStream * jobStream2;
public:
/* constuctor */
XQueue(int mySize = MAX_QUEUE_SIZE);
......@@ -135,26 +129,23 @@ public:
void WaitForEmptyJobQueue();
/* run the job consumer */
void RunJobConsumer(int jobDevID = 0);
void RunJobConsumer(int jobDevID = -1);
/* stop the job consumer */
void StopJobConsumer();
/* add a job item to process */
void EnqueueJob(void * job, TensorList * jobArgs);
void EnqueueJob(void * job, XList * jobArgs);
/* job item consumer */
static
void DequeueJobs(TensorList * args);
void DequeueJobs(XList * args);
/* get the break flag */
bool GetJobBreak();
/* get job stream */
XStream * GetJobStream(int n = 0);
/* make job streams */
void MakeJobStreams(int devID = INVALID_DEVICE_ID, int devID1 = INVALID_DEVICE_ID, int devID2 = INVALID_DEVICE_ID);
/* get the number of jobs */
int GetJobNum();
};
} /* end of the nts (NiuTrans.Tensor) namespace */
......
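With the job streams gone, XQueue is now a plain job queue driven by one consumer thread. A minimal sketch of pushing work through the XList-based interface; the job function and its argument are illustrative, and it assumes the consumer invokes each job as a TFunction on its argument list:

    /* a job must match TFunction, i.e. void (*)(volatile XList*) */
    void MyJob(volatile XList * args)
    {
        XTensor * x = (XTensor*)((XList*)args)->GetItem(0);
        /* ... work on x ... */
    }

    XQueue queue;
    queue.RunJobConsumer(-1);                    /* start the consumer thread on the CPU */

    XList * jobArgs = new XList(1);
    jobArgs->Add(&myTensor);                     /* myTensor is an illustrative argument */
    queue.EnqueueJob((void*)MyJob, jobArgs);

    queue.WaitForEmptyJobQueue();                /* block until the job has finished */
    queue.StopJobConsumer();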
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., run jobs in different stream for
* GPU Async capabilities.
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include "stdio.h"
#include "stdlib.h"
#include "XGlobal.h"
#include "XStream.h"
#include "XDevice.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiply streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
/* constructor */
XStream::XStream(int priority, int myDevID, int myMaxEventNum)
{
devID = myDevID;
#ifdef USE_CUDA
if(myDevID >= 0){
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
events = new cudaEvent_t[myMaxEventNum];
XDevice::SetGPUDevice(backupDevID);
maxEventNum = myMaxEventNum;
usedEventNum = 0;
}
else{
maxEventNum = 0;
usedEventNum = 0;
}
#endif
Create(priority, devID);
}
/* deconstructor */
XStream::~XStream()
{
Destroy();
#ifdef USE_CUDA
delete[] events;
#endif
}
/* create the stream */
void XStream::Create(int priority, int myDevID)
{
if(myDevID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
//cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
CheckNTErrors((cudaStreamCreate(&stream) == cudaSuccess),
"cannot create the cuda stream!");
XDevice::SetGPUDevice(backupDevID);
#endif
devID = myDevID;
}
/* destroy the stream */
void XStream::Destroy()
{
if(devID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
cudaStreamDestroy(stream);
XDevice::SetGPUDevice(backupDevID);
Clear();
#endif
}
/* clear it */
void XStream::Clear()
{
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
for(int i = 0; i < usedEventNum; i++){
cudaEventDestroy(events[i]);
}
usedEventNum = 0;
XDevice::SetGPUDevice(backupDevID);
#endif
}
/* judge if all the jobs in the stream have been finished */
bool XStream::IsFinished()
{
#ifdef USE_CUDA
if(cudaStreamQuery(stream) == cudaSuccess)
return true;
else
return false;
#else
return true;
#endif
}
void XStream::StreamSynchronize()
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaStreamSynchronize(stream);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
void XStream::ThreadSynchronize()
{
#ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif
}
void XStream::DeviceSynchronize(int devID)
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
cudaGetDevice(&devIDBackup);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaDeviceSynchronize();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* make a dependency of two streams. i.e., current stream must wait for the last job finished in another stream */
void XStream::MakeDependency(XStream * precedingStream)
{
#ifdef USE_CUDA
cudaEvent_t * e = precedingStream->MakeEvent();
cudaEventRecord(*e, precedingStream->stream);
cudaStreamWaitEvent(stream, *e, 0);
#endif
}
/* get the stream */
#ifdef USE_CUDA
inline cudaStream_t * XStream::Get()
{
return &stream;
}
/* make a event */
inline cudaEvent_t * XStream::MakeEvent()
{
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
CheckNTErrors((usedEventNum < maxEventNum), "Too many events are required!");
cudaEvent_t * e = events + usedEventNum++;
cudaEventCreate(e);
XDevice::SetGPUDevice(backupDevID);
return e;
}
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., run jobs in different stream for
* GPU Async capabilities.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XSTREAM_H__
#define __XSTREAM_H__
/* the CUDA stuff */
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asychronously do something else. Basically
we can use multiply streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
class XStream
{
public:
#ifdef USE_CUDA
/* the cuda stream */
cudaStream_t stream;
/* list of cuda events for synchronize different streams */
cudaEvent_t * events;
/* max number of the events */
int maxEventNum;
/* number of used events */
int usedEventNum;
#else
/* virtual pointer */
void * stream;
#endif
/* device that holds the stream */
int devID;
public:
/* constructor */
XStream(int priority = 0, int devID = 0, int maxEventNum = MAX_CUDA_EVENT_NUM_IN_A_STREAM);
/* deconstructor */
~XStream();
/* create the stream */
void Create(int priority = 0, int devID = 0);
/* destroy the stream */
void Destroy();
/* clear it */
void Clear();
/* judge if all the jobs in the stream have been finished */
bool IsFinished();
/* stream synchronize */
void StreamSynchronize();
/* thread synchronize */
static
void ThreadSynchronize();
/* device synchronize */
static
void DeviceSynchronize(int devID);
/* make a dependency of two streams. i.e., current stream must wait for the last job finished in another stream */
void MakeDependency(XStream * precedingStream);
#ifdef USE_CUDA
/* get the stream */
cudaStream_t * Get();
/* make a event */
cudaEvent_t * MakeEvent();
#endif
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
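XStream was a thin wrapper over a cudaStream_t plus a pool of cudaEvent_t used for cross-stream dependencies. With the class removed, code that still needs this pattern can call the CUDA runtime directly; a minimal sketch of the same dependency chain (assuming USE_CUDA):

    #ifdef USE_CUDA
    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);

    /* ... enqueue work on s1 ... */

    /* make s2 wait for everything queued on s1 so far (what MakeDependency did) */
    cudaEvent_t e;
    cudaEventCreate(&e);
    cudaEventRecord(e, s1);
    cudaStreamWaitEvent(s2, e, 0);

    /* ... enqueue dependent work on s2 ... */

    cudaStreamSynchronize(s2);   /* what StreamSynchronize did */
    cudaEventDestroy(e);
    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
    #endif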
......@@ -89,10 +89,6 @@ XTensor::XTensor()
Init();
id = MakeTensorID();
isDefaultDType = true;
isInGlobalMem = false;
isInit = false;
isTmp = false;
reserved = 0;
}
......@@ -277,6 +273,7 @@ void XTensor::Init()
isTmp = false;
isGrad = false;
isVar = false;
isGradFinished = false;
enableGrad = X_ENABLE_GRAD;
visitMark = 0;
grad = NULL;
......@@ -772,10 +769,9 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
}
/*
a vector with all entries of 0
>> stream - stream for the job pipeline
a tensor with all entries of 0
*/
void XTensor::SetZeroAll(XStream* stream)
void XTensor::SetZeroAll()
{
if(data == NULL)
return;
......@@ -788,12 +784,7 @@ void XTensor::SetZeroAll(XStream* stream)
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
cudaMemset(data, 0, size);
cudaSetDevice(devIDBackup);
#endif
}
......@@ -807,13 +798,8 @@ void XTensor::SetZeroAll(XStream* stream)
#ifdef USE_CUDA
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
cudaSetDevice(devID);
cudaMemset(data, 0, unitNum * unitSize);
cudaSetDevice(devIDBackup);
#endif
}
......
......@@ -31,7 +31,6 @@
#include <math.h>
#include "XGlobal.h"
#include "XPRunner.h"
#include "XStream.h"
#include "XHeap.h"
#include "XList.h"
#include "XDataType.h"
......@@ -157,6 +156,11 @@ public:
/* mark for traversing the gragh */
unsigned int visitMark;
/* indicates whether the gradient of the tensor has been computed (in the backward process)
Note that the indicator could be modified by XNet (in back propagation) and be accessed
in XTrainer (and related classes). */
bool isGradFinished;
/* gradient (for back-propagation) */
XTensor * grad;
......@@ -303,7 +307,7 @@ public:
MTYPE GetOffset3D(int d0, int d1, int d2) const;
/* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL);
void SetZeroAll();
/* set the tensor with an data array */
void SetData(const void * d, int num, int beg = 0);
......
......@@ -38,7 +38,7 @@ XThread::XThread()
#endif
MUTEX_INIT(gMutex);
function = NULL;
argv = NULL;
argv.Clear();
toBreak = false;
jobCount = 0;
working = 0;
......@@ -69,6 +69,18 @@ void * XThread::Wrapper(void * ptr)
return 0;
}
/*
initialize the thread with the function and its parameters
>> myFunc - the function to run
>> myArgv - arguments of the function
*/
void XThread::SetFunc(TFunction myFunc, XList * myArgv)
{
function = myFunc;
argv.Clear();
argv.AddList(myArgv);
}
/*
Tunning for this thread. It is very very native implementation.
......@@ -77,6 +89,10 @@ After that, we wait again if there is no new job.
*/
void XThread::Run()
{
if (function == NULL) {
ShowNTErrors("You are running a thread with no function specified!");
}
#ifdef _WIN32
//COND_RESET(gCond);
#endif
......@@ -104,7 +120,7 @@ void XThread::Run()
}
/* do what you want to do*/
function(argv);
function(&argv);
#ifdef USE_PTHREAD
jobCount--;
......
......@@ -85,7 +85,7 @@ namespace nts{
#endif
typedef void (*TFunction) (volatile TensorList*);
typedef void (*TFunction) (volatile XList*);
/*
This is a class that wraps the standard implementation of threading
......@@ -128,12 +128,10 @@ public:
public:
/* function to run */
volatile
TFunction function;
/* arguments (for the function to run) */
volatile
TensorList * argv;
XList argv;
/* a flag to break */
volatile
......@@ -154,6 +152,9 @@ public:
/* a wrapper for the start-routine parameter in pthread_create */
static void * Wrapper(void * ptr);
/* initialize the thread with the function and its parameters */
void SetFunc(TFunction myFunc, XList * myArgv);
/*
Core of the thread. It is very very native impelementation.
We loop and wait for a singnal to activate the job processing.
......
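SetFunc replaces the old pattern of writing function and argv into the thread directly; the thread now keeps its own copy of the argument list. A minimal sketch, with the worker function and its argument purely illustrative (Start and LetItGo are used as in RunJobConsumer above):

    /* the worker must match TFunction, i.e. void (*)(volatile XList*) */
    void Worker(volatile XList * args)
    {
        int * value = (int*)((XList*)args)->GetItem(0);
        /* ... use *value ... */
    }

    XThread thread;
    XList args(1);
    int value = 42;                              /* illustrative argument */
    args.Add(&value);

    thread.SetFunc(Worker, &args);               /* the thread copies args internally */
    thread.Start();                              /* spawn the thread */
    thread.LetItGo();                            /* signal it to run the job once */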
......@@ -311,44 +311,6 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
#endif
}
void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
{
if (t == s)
return;
if (devIDT < 0 && devIDS < 0) {
for(int i = 0; i < n; i++)
memcpy((char*)t + tPitch * i, (char*)s + sPitch * i, mSize);
return;
}
#ifdef USE_CUDA
else{
CheckNTErrors(stream != NULL, "No stream found!");
cudaStream_t &cstream = stream->stream;
if (devIDT >= 0 && devIDS < 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
}
}
else if (devIDT < 0 && devIDS >= 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
}
else {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, cstream);
if (error != cudaSuccess) {
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
}
}
}
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
void * XMemAlloc(int devID, size_t size)
{
void * p = NULL;
......
......@@ -42,7 +42,6 @@ extern void XMemSet(void * p, int value, size_t size);
extern void XMemSet(int devID, void * p, int value, size_t size);
extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
extern void * XMemAlloc(int devID, size_t size);
extern void * XMemAllocOnDev(int devID, size_t size);
extern void XMemFree(int devID, void * p);
......
......@@ -42,12 +42,11 @@ where trans() return the transposed matrix if the flag is fired
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
>> stream - the string for creating the job pipeline
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
......@@ -69,7 +68,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta);
return;
}
#endif
......
......@@ -119,11 +119,10 @@ where trans() return the transposed matrix if the flag is fired
>> c - where we put a*b
>> alpha - a coefficient
>> beta - another coefficient
>> stream - the string for creating the job pipeline
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
XTensor * c, DTYPE alpha, DTYPE beta)
{
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
......@@ -152,10 +151,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
......
......@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#endif // USE_CUDA
......
......@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -32,9 +32,8 @@ copy s to t
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CopyValues(const XTensor * s, XTensor * t)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -55,7 +54,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
#ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0) {
_CudaCopyValues(s, t, stream);
_CudaCopyValues(s, t);
return;
}
#endif
......@@ -82,9 +81,8 @@ copy s to t
>> sLen - length of the segment
>> t - target
>> tBeg - beginning of the segment on the target side
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -108,13 +106,12 @@ void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t,
/*
copy s to t (rename _CopyValues)
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
>> s - source
>> t - target
*/
void CopyValues(const XTensor &s, XTensor &t, XStream * stream)
void CopyValues(const XTensor &s, XTensor &t)
{
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
}
/*
......@@ -122,16 +119,15 @@ copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copyed tensor t
*/
XTensor CopyValues(const XTensor &s, XStream * stream)
XTensor CopyValues(const XTensor &s)
{
XTensor t(&s);
t.SetTMPFlag();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
/* tensor connection */
if (s.enableGrad) {
......
......@@ -32,10 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
copy a range of elements from a source vector to a target vector
>> s - source matrix
>> t - target matrix
>> stream - the stream for creating the job pipeline
<< return - succeed or not
*/
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CudaCopyValues(const XTensor * s, XTensor * t)
{
CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
......@@ -45,10 +44,7 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
/* dense -> dense */
if (!s->isSparse && !t->isSparse) {
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
}
/* dense -> sparse */
else if (!s->isSparse && t->isSparse &&
......@@ -72,11 +68,8 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
int num = s->unitNumNonZero;
int size = sizeof(int) + num * (s->unitSize + sizeof(int));
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, size);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, size, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, size);
t->unitNumNonZero = num;
}
else {
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CudaCopyValues(const XTensor * s, XTensor * t);
#endif // USE_CUDA
......
......@@ -27,19 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CopyValues(const XTensor * s, XTensor * t);
/* copy a segment of s to t */
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL);
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg);
/* copy s to t (rename _CopyValues) */
void CopyValues(const XTensor &s, XTensor &t, XStream * stream = NULL);
void CopyValues(const XTensor &s, XTensor &t);
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
XTensor CopyValues(const XTensor &s);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -96,25 +96,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[t->devID].stream;
for (int k = 0; k < splitNum; k++) {
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
else {
......@@ -321,27 +307,12 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[big->devID].stream;
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
/* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
......
......@@ -51,7 +51,7 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
CheckNTErrors(jobNum != 0, "TODO!");
/* argument list of the jobs */
TensorList * jobArgList = new TensorList(argNum);
XList * jobArgList = new XList(argNum);
va_list ap;
va_start(ap, argNum);
......@@ -62,8 +62,8 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
va_end(ap);
/* prepare the neccesary argument list for parallel processing */
TensorList * jobs = new TensorList(jobNum);
TensorList * args = new TensorList(jobNum);
XList * jobs = new XList(jobNum);
XList * args = new XList(jobNum);
int * indexList = new int[jobNum * 4 * 4];
......@@ -78,7 +78,7 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
*/
for (int i = 0; i < jobNum; i++) {
IntList* indexArgs = new IntList(4);
TensorList * blockArgs = new TensorList(argNum);
XList * blockArgs = new XList(argNum);
int * blockIndex = indexList + i * 4;
indexArgs->Add(blockIndex[0]);
......@@ -89,10 +89,10 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
for (int j = 0; j < argNum; j++)
blockArgs->Add(jobArgList->GetItem(j));
args->Add((XTensor*)indexArgs);
args->Add((XTensor*)blockArgs);
args->Add((void*)indexArgs);
args->Add((void*)blockArgs);
jobs->Add((XTensor*)job);
jobs->Add((void*)job);
}
args->count = jobNum * 2;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-23
*
*/
#include "XTrainer.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XTrainer::XTrainer()
{
}
/* de-constructor */
XTrainer::~XTrainer()
{
}
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class organizes the training process of neural models, e.g., NMT and LM models.
* Distributed training is supported.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-23
* I started coding this in 2021, a year after I last typed C code.
* But I was a GOOD TeX writer in 2020 :)
*/
#ifndef __XTRAINER_H__
#define __XTRAINER_H__
#include "../network/XNet.h"
#include "../tensor/XQueue.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
Training of neural networks with gradient methods. Here we suppose that we
are training NLP models. The routine could be:
1). initialize all we need
2). data preparation
3). loop until convergence
a). read a batch of samples from the input file
b). reset the worker
c). forward computation with the input
d). backward computation with respect to the loss
e). collect the gradients (necessary when several workers are available)
f). update the model (on the server end)
g). distribute the new model to each worker
Here a worker processes one batch of samples at a time and works independently
of the other workers. The server is the organizer: it distributes jobs to the
workers and maintains the model. (A compilable sketch of this loop follows the header below.)
*/
class XTrainer
{
private:
public:
/* constructor */
XTrainer();
/* de-constructor */
~XTrainer();
};
}
#endif // __XTRAINER_H__
\ No newline at end of file
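The routine described in the XTrainer header above maps onto a loop like the following. This is only a compilable sketch of the intended flow: XTrainer has no members yet in this commit, so every stub type and call below is a placeholder, not part of the library.

    /* illustrative stubs spelling out steps a)-g) from the header comment */
    #include <vector>

    struct BatchStub  { };
    struct WorkerStub {
        void Reset() { }
        void ForwardAndBackward(const BatchStub &) { }            /* c) forward, d) backward */
    };
    struct ServerStub {
        void CollectGradients(std::vector<WorkerStub> &) { }      /* e) collect the gradients */
        void UpdateModel() { }                                    /* f) update the model */
        void BroadcastModel(std::vector<WorkerStub> &) { }        /* g) distribute the new model */
    };
    struct DataStub { bool ReadBatch(BatchStub &) { return false; } };

    void TrainLoopSketch(DataStub &data, ServerStub &server, std::vector<WorkerStub> &workers)
    {
        BatchStub batch;
        while (data.ReadBatch(batch)) {          /* a) read a batch from the input file */
            for (WorkerStub &w : workers) {
                w.Reset();                       /* b) reset the worker */
                w.ForwardAndBackward(batch);     /* c) + d) */
            }
            server.CollectGradients(workers);    /* e) */
            server.UpdateModel();                /* f) on the server end */
            server.BroadcastModel(workers);      /* g) to each worker */
        }
    }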