/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University. 
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * 
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-5-25
 *
 */

#include <string.h>
#include <stdlib.h>
#include <time.h>
#include "XGlobal.h"
#include "XUtility.h"
#include "XMem.h"

/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
    
//int testxmemid = 0;
//void * recordp = NULL;

/*
for managing the memories
*/
XMemManager GMems;

XMem * GMem;

/* constructor */
XMem::XMem()
{
    memset(this, 0, sizeof(XMem));
    devID = -1;
    mode = UNI_FREE;
    curBlockPin = -1;
    indexOffset = -1;
    name = new char[64];
    strcpy(name, "xmem");
    signature = 0;
    mergeFreeOTF = true;
    isInitialized = false;
}

/* 
constructor 
>> myDevID - device id 
             -1:  CPU memory
             >=0: GPU device ID
>> myMode - mode of running the memory pool
            UNI_FREE: free all the space at the end of using the memory pool
            FREE_ON_THE_FLY: normal "malloc" and "free" mode
>> myBlockSize - size of a memory block
>> myBlockNum  - number of memory blocks
>> myBufSize - size of buffer
*/
XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum, MTYPE myBufSize)
{
    memset(this, 0, sizeof(XMem));
    curBlockPin = -1;
    indexOffset = -1;
    name = new char[64];
    strcpy(name, "xmem");
    signature = 0;
    mergeFreeOTF = true;
    Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize);
}

/* deconstructor */
XMem::~XMem()
{
#ifdef USE_CUDA
    int devIDBackup = -1;
    cudaGetDevice(&devIDBackup);
    SetDevice(devID);

    if(devID >= 0 && cublasHandle != NULL)
        cublasDestroy(cublasHandle);
    curandDestroyGenerator(randGen);

    SetDevice(devIDBackup);
#endif
    Free();
    delete[] name;
    delete[] memIndex;
    delete[] memIndex2;
    delete[] minSizeIndex;
}

/* 
initialize it 
>> myDevID - device id 
             -1:  CPU memory
             >=0: GPU device ID
>> myMode - mode of running the memory pool
            UNI_FREE: free all the space at the end of using the memory pool
            FREE_ON_THE_FLY: normal "malloc" and "free" mode
>> myBlockSize - size of a memory block
>> myBlockNum  - number of memory blocks
>> myBufSize - size of buffer
*/
void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum, MTYPE myBufSize)
{
    Free();

    CheckNTErrors((myBlockSize > 0 && myBlockNum > 0), "Illegal member block settings!");

    devID = myDevID;
    mode = myMode;
    maxBlockSize = myBlockSize;
    blockNum = myBlockNum;

    blocks = new XMemBlock[blockNum];
    for(int i = 0; i < blockNum; i++){
        blocks[i].mem = NULL;
        blocks[i].size = maxBlockSize;
        blocks[i].sizeDesired = maxBlockSize;
        blocks[i].used = 0;
    }

    curBlock = blocks;
    curBlockID = 0;
    finalBlockID = 0;

    if(myDevID < 0){
        buf = new char[(unsigned int)myBufSize];
    }
    else{
#ifdef USE_CUDA
        int devIDBackup = -1;
        cudaGetDevice(&devIDBackup);
        SetDevice(myDevID);

        CheckNTErrors(cudaMalloc((void **)&buf, myBufSize) == cudaSuccess, "Cannot allocate the memory.");
        CheckNTErrors(cudaMemset(buf, 0, myBufSize) == cudaSuccess, "Cannot update the memory.");
        CheckNTErrors(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_DEFAULT) == CURAND_STATUS_SUCCESS, "Cannot make the cuda random number generator!");
        CheckNTErrors(curandSetPseudoRandomGeneratorSeed(randGen, (unsigned)time(NULL)) == CURAND_STATUS_SUCCESS, "Cannot generate the seed!");

        SetDevice(devIDBackup);

        /* create the cublas handle */
        SetComputationMode(true);
#else
        ShowNTErrors("Please specify USE_CUDA for compiling this program.");
#endif
    }

    bufSize = myBufSize;

#ifdef SMALL_DEVICE
    if (myMode == FREE_ON_THE_FLY)
        SetIndex(50000);
#else
    if (myMode == FREE_ON_THE_FLY)
        SetIndex(MILLION);
#endif

    signature++;
    isInitialized = true;
}

/* free memory */
void XMem::Free()
{
    for (int i = 0; i < blockNum; i++) {
        if (blocks != NULL)
            Free(devID, blocks[i].mem);
    }
    delete[] blocks;
    blocks = NULL;

    Free(devID, buf);
    buf = NULL;
    bufSize = 0;
    bufUsed = 0;

    devID = -1;
}

/* 
free a piece of memory 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> mem - address of the memory block to release
*/
void XMem::Free(int myDevID, void * mem)
{
    if(mem == NULL)
        return;

    /* on CPUs */
    if(myDevID < 0){
        delete[] (char*)mem;
    }
    /* on GPUs */
    else{
#ifdef USE_CUDA
        int devIDBackup = -1;
        cudaGetDevice(&devIDBackup);
        SetDevice(myDevID);

        cudaError_t error = cudaFree((char*)mem);
        if(error != cudaSuccess){
            ShowNTErrors("Cannot free the memory.");
        }

        SetDevice(devIDBackup);
#else
        ShowNTErrors("Please specify USE_CUDA for compiling this program.");
#endif
    }
}

/*
get the signature
<< return - the signature
*/
MTYPE XMem::GetSignature()
{
    return signature;
}

/* 
set the name of the memory pool 
>> myName - name of the memory pool
*/
void XMem::SetName(const char * myName)
{
    delete[] name;
    name = new char[(int)strlen(myName) + 1];
    strcpy(name, myName);
}

/* 
switch to the device where we intend to work 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
*/
void XMem::SetDevice(int myDevID)
{
    if(myDevID < 0)
        return;

#ifdef USE_CUDA
    cudaError_t error = cudaSetDevice(myDevID);

    if (error != cudaSuccess){
        fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n", myDevID, error, cudaGetErrorString(error));
        exit(1);
    }

#else
    ShowNTErrors("Please specify USE_CUDA for compiling this program.");
#endif
}

/* 
switch to the device (with fast cuda execution mode) we intend to work on
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
*/
void XMem::SetDeviceFast(int myDevID)
{
    SetDevice(myDevID);
#ifdef USE_CUDA
    cudaError_t error = cudaSetDeviceFlags(cudaDeviceScheduleSpin|cudaDeviceLmemResizeToMax);
    if (error != cudaSuccess){
        fprintf(stderr, "Error! Calling cudaSetDeviceFlags(%d) fails(%d:%s)\n", myDevID, error, cudaGetErrorString(error));
        exit(1);
    }
#endif
}

/* 
run in the static mode
>> myIsStatic - specify if the memory allocation is static
*/
void XMem::SetStaticMode(bool myIsStatic)
{
    isStatic = myIsStatic;
}

/* 
specify if the memory pool is used for tensor computation (rather
than storage 
>> myIsForComputation - specify if the memory pool is used in computation (if
                        so we need to create some handles for calling the BLAS interfaces)
*/
void XMem::SetComputationMode(bool myIsForComputation)
{
#ifdef USE_CUDA
    int devIDBackup = -1;
    cudaGetDevice(&devIDBackup);
    SetDevice(devID);

    if(!myIsForComputation && devID >= 0 && cublasHandle != NULL)
        cublasDestroy(cublasHandle);
    if(myIsForComputation)
        CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
                      "Cannot create the cublas handle.");

    SetDevice(devIDBackup);
#endif
}

/*
initialize the index
>> indexSize - size of the index
>> minSizeFirst - minimal size allocation for the first entry
>> minSizeNum - number of minimal-size index nodes
*/
void XMem::SetIndex(INT_64 indexSize, MTYPE minSizeFirst, int minSizeNum)
{
    delete[] memIndex;
    delete[] memIndex2;
    delete[] minSizeIndex;

    nodeNum = indexSize;
    nodeNumUsed = minSizeNum * 2;
    indexEntryNum = minSizeNum;
    
    memIndex = new MPieceNode[nodeNum];
    memset(memIndex, 0, sizeof(MPieceNode) * nodeNum);
    
    memIndex2 = new MPieceNode[nodeNum];
    memset(memIndex2, 0, sizeof(MPieceNode) * nodeNum);

    minSizeIndex = new MTYPE[indexEntryNum];
    memset(minSizeIndex, 0, sizeof(MTYPE) * indexEntryNum);

    minSizeIndex[0] = minSizeFirst;
    for(int i = 1; i < indexEntryNum; i++)
        minSizeIndex[i] = minSizeIndex[i - 1] * 2;

    indexOffset = GetMSB(minSizeFirst);
}

/* get device id */
int XMem::GetDevID()
{
    return devID;
}

/* set desired memory block size */
void XMem::SetDesiredSize(int myDevID, int blockID, MTYPE mySize)
{
    CheckNTErrors((blockID >= 0 && blockID < blockNum), "Illegal block id!");
    CheckNTErrors((mySize > 0), "Illegal block size!");
    CheckNTErrors((blocks[blockID].mem == NULL), "Cannot reset a memory block that is being used!");

    blocks[blockID].sizeDesired = mySize;
    blocks[blockID].size = mySize;
}

/* 
require a piece of memory 
>> mySize - size of the require memory
*/
void * XMem::Alloc(MTYPE mySize)
{
    return Alloc(devID, mySize);
}

/* 
require a piece of memory 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> mySize - size of the require memory
*/
void * XMem::Alloc(int myDevID, MTYPE mySize)
{
    if(mode == FREE_ON_THE_FLY)
        return AllocStandard(myDevID, mySize);
    else if(isStatic)
        return AllocStatic(myDevID, mySize);
    else
        return AllocDynamic(myDevID, mySize);
}

/* 
require a piece of memory in a dynamic manner 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> mySize - size of the require memory
*/
void * XMem::AllocDynamic(int myDevID, MTYPE mySize)
{
    int ID;
    XMemBlock * b = NULL;
    bool firstHit = false;

    for (ID = curBlockID; ID < blockNum; ID++) {
        b = blocks + ID;
        if (!firstHit && b->size > b->used) {
            firstHit = true;
            curBlockID = ID;
            curBlock = blocks + curBlockID;
        }
        if (b->size >= b->used + mySize)
            break;
    }

    CheckNTErrors((curBlockID < blockNum), "No enough memory blocks.");
    CheckNTErrors((ID < blockNum), "Cannot find a available memory block. Please use a larger memory pool.");
    CheckNTErrors((b->size - b->used >= mySize), "Cannot allocate the memory. Please use a larger memory block!");

    if (ID > finalBlockID)
        finalBlockID = ID;

    char * mem = NULL;
    char * required = NULL;
    int backOffset = 0;

    /* allocate the memory */
    if (b->mem == NULL && b->used == 0) {
        /* on CPUs */
        if (myDevID < 0) {
            mem = new char[(unsigned int)b->size + 2 * CUDA_PITCH];
            memset(mem, 0, (unsigned int)b->size + 2 * CUDA_PITCH);
        }
        /* on GPUs */
        else {
#ifdef USE_CUDA
            int devIDBackup = -1;
            cudaGetDevice(&devIDBackup);
            SetDevice(myDevID);
            cudaError_t e = cudaMalloc((void **)&mem, b->size + 2 * CUDA_PITCH);
            if (e != cudaSuccess) {
                ShowNTErrors("Cannot allocate the memory.");
            }
            CheckNTErrors(cudaMemset(mem, 0, b->size + 2 * CUDA_PITCH) == cudaSuccess, "Cannot update the memory.");
            SetDevice(devIDBackup);
#else
            ShowNTErrors("Please specify USE_CUDA for compiling this program.");

#endif
        }

        b->mem = mem;
    }

#ifdef USE_CUDA
    if (myDevID >= 0) {
        long long address = (long long)((char*)b->mem + b->used);
        int offset = address % CUDA_PITCH;
        backOffset = offset > 0 ? CUDA_PITCH - offset : 0;
    }
#endif

    required = (char*)b->mem + b->used + backOffset;
    b->used += mySize + backOffset;

#ifdef USE_CUDA
    if (myDevID >= 0) {
        CheckNTErrors(((long long)required % CUDA_PITCH == 0), "The GPU memory is not aligned.");
    }
#endif

    CheckNTErrors((b->size + 2 * CUDA_PITCH >= b->used), "Something is wrong with the memory block.");

    return required;
}

/* 
required a piece of memory with fixed size (if possible) 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> mySize - size of the require memory
*/
void * XMem::AllocStatic(int myDevID, MTYPE mySize)
{
    for(int ID = curBlockID; ID < blockNum; ID++){
        XMemBlock * b = blocks + ID;
        if(b->mem == NULL){
            CheckNTErrors((mySize > 0), "Illegal required memory block size!");
            CheckNTErrors((b->mem == NULL), "Incorrect memory allocation!");
            b->size = mySize;
            return AllocDynamic(myDevID, mySize);
        }
        else if(b->mem != NULL && b->size > b->used + mySize)
            return AllocDynamic(myDevID, mySize);
    }

    ShowNTErrors("Cannot find a valid memory block!");

    return NULL;
}

/* 
require a piece of memory that is not in the memory pool 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> mySize - size of the require memory
*/
void * XMem::AllocGlobal(int myDevID, MTYPE mySize)
{
    return XMemAllocOnDev(myDevID, (unsigned int)mySize);
}

/* get the available size of the memory that can be used */
MTYPE XMem::GetAvailableSize(int myDevID)
{
    return curBlock->size - curBlock->used;
}

/* 
require a piece of memory in the buffer
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> mySize - size of the require memory
>> pitch - pitch for aligned memory 
<< return - the head pointer of the required memory
*/
void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch)
{
    MTYPE backOffset = 0;

    if(pitch > 1){
        MTYPE address = (MTYPE)((char*)buf + bufUsed);
        int offset  = address % pitch;
        backOffset = offset > 0 ? pitch - offset : 0;
    }

    if((bufSize - bufUsed < mySize)){
        XPRINT1(0, stderr, "Cannot allocate the memory (%s). Please specify a larger buffer in XMem!", name);
        exit(1);
    }

    char * required = (char*)buf + bufUsed + backOffset;
    bufUsed += mySize + backOffset;

    CheckNTErrors((bufSize >= bufUsed), "Something is wrong with the memory block.");

    return required;
}

/* 
release a piece of memory 
>> p - pointer to the memory piece we intend to release
>> size - size of the memory piece to release
>> code - code the memory 
*/
void XMem::Release(void * p, MTYPE size, MTYPE code)
{
    if(code == signature)
        Release(devID, p, size);
}

/* 
release a piece of memory 
>> myDevID - device id
>> p - pointer to the memory piece we intend to release
>> size - size of the memory piece to release
*/
void XMem::Release(int myDevID, void * p, MTYPE size)
{
    if(mode == FREE_ON_THE_FLY)
        ReleaseStandard(myDevID, p, size);
}

/* 
release a piece of memory in the buffer
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> mySize - size of the require memory
>> pitch - pitch for aligned memory 
*/
void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch)
{
    CheckNTErrors((bufUsed >= mySize), 
                  "Cannot allocate the memory. Please specify a larger buffer in XMem!");

    MTYPE backOffset = 0;

    if(pitch > 1){
        MTYPE address = (MTYPE)((char*)buf + (bufUsed - mySize));
        backOffset  = address % pitch;
    }

    bufUsed -= (mySize + backOffset);
}

/* 
free a piece of memory that is not in the memory pool 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> p - the pointer to the address of the memory we intend to free
*/
void XMem::ReleaseGlobal(int myDevID, void * p)
{
    XMemFreeOnDev(myDevID, p);
}

/* 
allocate a piece of memory as "malloc" 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> mySize - size of the require memory
>> myIsRebuiltIndex - indicates whether the index has been just rebuilt
<< return - index
*/
void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
{
    CheckNTErrors(memIndex != NULL, "The index of the memory pool is not initialized!");

    if(mySize <= minSizeIndex[0])
        mySize = minSizeIndex[0];

    int index = FindIndexEntry(mySize);
    MPieceNode * entry = NULL;
    MPieceNode * node = NULL;
    MPieceNode * hit = NULL;
    void * result = NULL;

    /* search for the memory piece avialable for the allocation */
    for(int i = index; i <= indexEntryNum; i++){
        if(i == indexEntryNum){
            entry = memIndex + index;
            CheckNTErrors(mySize >= minSizeIndex[index], "Wrong index!");
        }
        else
            entry = memIndex + i;
        
        node = entry->next;
        while(node != NULL){
            if(node->size == 0){
                MPieceNode * next = node->next;
                RemoveIndexNode(node, entry);
                node = next;
            }
            else{
                if(node->size >= mySize){
                    hit = node;
                    break;
                }
                node = node->next;
            }
        }

        if(hit != NULL)
            break;
    }
    
    /* if a free memory piece is found, we allocate the memory on it. */
    if(hit != NULL){
        MHeader * head = &hit->head;
        char * beg = (char*)GetPitchedAddress((char*)hit->p, MY_PITCH);
        char * end = (char*)beg + mySize;
        MTYPE needed = end - (char*)hit->p;
        MTYPE remaining = head->size - needed;
        
        if(remaining >= minSizeIndex[0]){

            /* make a new index node */
            MPieceNode * newNode = memIndex + nodeNumUsed++;
            newNode->head.indexNode = newNode;
            newNode->p = end;
            newNode->pReal = NULL;
            newNode->size = (char*)end + remaining -
                            (char*)GetPitchedAddress((char*)end, MY_PITCH);
            
            AddFreeIndexNode(newNode);
            
            /* connections for headers */
            MHeader &cur = hit->head;
            MHeader &next = newNode->head;
            next.pre = &cur;
            next.next = cur.next;
            cur.next = &next;
            cur.size = needed;
            
            if(next.next != NULL)
                next.next->pre = &next;
            
            next.state = 1;
            next.size = remaining;
            next.blockID = cur.blockID;
        }
        
        hit->size = mySize;
        hit->head.state = 2;
        hit->pReal = beg;
        blocks[hit->head.blockID].used += head->size;
        
        RemoveIndexNode(hit);
        AddAllocIndexNode(hit);
        
        result = beg;
    }
    else{
        /* if no free memory piece is available, we rebuild the index and merge small fragments
           to make bigger free memory pieces. */
        if(!myIsRebuiltIndex){
            RebuildIndex();
            result = AllocStandard(myDevID, mySize, true);
        }
        /* if there is still no available memory piece, we have to obtain a new block of memory. */
        else{
            int bi;
            for(bi = 0; bi < blockNum; bi++){
                XMemBlock * block = blocks + bi;
                if (block->mem != NULL && (block->head != NULL || block->size < mySize + 2 * MY_PITCH))
                    continue;
                
                if (block->mem == NULL) {
                    block->size = MAX(block->sizeDesired, mySize + 2 * MY_PITCH);
                    if (myDevID < 0) {
                        block->mem = new char[block->size];
                        memset(block->mem, 0, block->size);
                    }
                    else {
#ifdef USE_CUDA
                        int devIDBackup = -1;
                        cudaGetDevice(&devIDBackup);
                        SetDevice(myDevID);
                        cudaError_t e = cudaMalloc((void **)&block->mem, block->size);
                        if (e != cudaSuccess) {
                            ShowNTErrors("Cannot allocate the memory.");
                        }
                        CheckNTErrors(cudaMemset(block->mem, 0, block->size) == cudaSuccess, "Cannot update the memory.");
                        SetDevice(devIDBackup);
#else
                        ShowNTErrors("Please specify USE_CUDA for compiling this program.");
#endif
                    }
                }

                curBlockID = MAX(curBlockID, bi);
                    
                /* make a new index node */
                MPieceNode * newNode = memIndex + nodeNumUsed++;
                newNode->head.indexNode = newNode;
                newNode->p = block->mem;
                newNode->pReal = NULL;
                //newNode->size = (char*)block->mem + block->size -
                //                (char*)GetPitchedAddress(block->mem, MY_PITCH);
                newNode->size = mySize;
                    
                AddFreeIndexNode(newNode);
                    
                MHeader &header = newNode->head;
                header.state = 1;
                header.size = block->size;
                header.pre = NULL;
                header.next = NULL;
                header.blockID = bi;
                    
                block->head = &header;
                block->used = 0;
                    
                result = AllocStandard(myDevID, mySize, myIsRebuiltIndex);
                break;
            }
            CheckNTErrors(bi < blockNum, "No enough memory is available!");
        }
    }

    /* if all index nodes are used, we rebuild the index to release the nodes that are free */
    if(nodeNumUsed == nodeNum){
        RebuildIndex();
        CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
    }

    /*if(testxmemid == 30){
        recordp = result;
    }

    if(curBlockID >= 25){
        MHeader * head = blocks[25].head;
        while(head != NULL){
            fprintf(stderr, "head: %ld %ld\n", head->indexNode->pReal, head->indexNode->size);
            head = head->next;
        }
    }

    if(testxmemid == 32){
        int nnn = 0;
    }

    if(recordp != NULL){
        MTYPE size = mySize;
        if(size <= minSizeIndex[0])
            size = minSizeIndex[0];
    
        MPieceNode * entry = NULL;
        MPieceNode * node = NULL;
        MPieceNode * hit = NULL;
        MPieceNode * last = NULL;
    
        entry = memIndex + indexEntryNum + FindIndexEntry(size);
    
        last = entry;
        node = entry->next;
    
        while(node != NULL){
            CheckNTErrors(node->pre == last, "Something is wrong!");
            CheckNTErrors(last->next == node, "Something is wrong!");
            CheckNTErrors(node->head.state == 2, "Something is wrong!");
            last = node;
        
            if(node->size == 0){
                MPieceNode * next = node->next;
                RemoveFreeIndexNode(node, entry);
                node = next;
                ShowNTErrors("Something is wrong!");
            }
            else{
                CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
                if(node->pReal == recordp){
                    hit = node;
                    break;
                }
                node = node->next;
            }
        }

        if(hit == NULL){
            int nnn = 0;
        }
    }*/

    return result;
}

/* 
find the highest set bit (or most significant set bit) in an integer-64 
>> mySize - required size
<< return - the position of MSB
*/
int XMem::GetMSB(MTYPE mySize)
{
    MTYPE value = mySize;

    int result = 0;
    if(value){
        if(0xFFFFFFFF00000000&value){value>>=(1<<5); result|=(1<<5);}
        if(0x00000000FFFF0000&value){value>>=(1<<4); result|=(1<<4);}
        if(0x000000000000FF00&value){value>>=(1<<3); result|=(1<<3);}
        if(0x00000000000000F0&value){value>>=(1<<2); result|=(1<<2);}
        if(0x000000000000000C&value){value>>=(1<<1); result|=(1<<1);}
        if(0x0000000000000002&value){result|=(1<<0);}
    }
    else
        result = -1;

    return result;
}

/* 
find the index entry for allocation query 
>> mySize - required size
<< return - index
*/
int XMem::FindIndexEntry(MTYPE mySize)
{
    CheckNTErrors(minSizeIndex != NULL && indexOffset >= 0, 
                 "The index of the memory pool is not initialized!");

    if(mySize <= minSizeIndex[0])
        mySize = minSizeIndex[0];

    int index = GetMSB(mySize) - indexOffset;

    if(index >= indexEntryNum)
        index = indexEntryNum - 1;

    return index;
}

/* 
remove an index node
>> node - node to remove
>> - the entry of the list that keeps the node
*/
void XMem::RemoveIndexNode(MPieceNode * node, MPieceNode * entry)
{
    MPieceNode * pre = node->pre;
    MPieceNode * next = node->next;
    

    CheckNTErrors(pre != NULL, "cannot free the entry node!");

    pre->next = next;
    if(next != NULL)
        next->pre = pre;
    
    node->pre = NULL;
    node->next = NULL;
}

/* 
add an index node for available memory pieces
>> node - node to add
>> entry - the entry of the list to append the node
*/
void XMem::AddFreeIndexNode(MPieceNode * node, MPieceNode * entry)
{
    MPieceNode * entryForMe = entry != NULL ? entry :
                              memIndex + FindIndexEntry(node->size);

    /*MPieceNode * backup = entryForMe->next;

    while(backup != NULL && backup->head.size < node->head.size){
        backup = backup->next;
        entryForMe = entryForMe->next;
    }

    entryForMe->next = node;
    node->pre = entryForMe;
    node->next = backup;
    if(backup != NULL)
        backup->pre = node;*/
    
    MPieceNode * backup = entryForMe->next;
    entryForMe->next = node;
    node->pre = entryForMe;
    node->next = backup;
    if(backup != NULL)
        backup->pre = node;

    CheckNTErrors(node != node->next, "Something wrong with the index node!");
    CheckNTErrors(node != node->pre,  "Something wrong with the index node!");
}
    
/*
remove an index node for memory pieces in use
>> node - node to remove
>> - the entry of the list that keeps the node
*/
void XMem::RemoveAllocIndexNode(MPieceNode * node, MPieceNode * entry)
{
    RemoveIndexNode(node, entry);
}

/*
add an index node for memory pieces in use
>> node - node to add
>> entry - the entry of the list to append the node
*/
void XMem::AddAllocIndexNode(MPieceNode * node, MPieceNode * entry)
{
    MPieceNode * entryForMe = entry != NULL ? entry :
                              memIndex + indexEntryNum + FindIndexEntry(node->size);
    
    MPieceNode * backup = entryForMe->next;
    entryForMe->next = node;
    node->pre = entryForMe;
    node->next = backup;
    if(backup != NULL)
        backup->pre = node;
    
    CheckNTErrors(node != node->next, "Something wrong with the index node!");
    CheckNTErrors(node != node->pre,  "Something wrong with the index node!");
}

/* 
release a piece of memory as "free" 
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
>> p - the pointer to the address of the memory we intend to free
>> size - size of the memory piece to release
*/
void XMem::ReleaseStandard(int myDevID, void * p, MTYPE size)
{
    if(p == NULL)
        return;
    
    if(size <= minSizeIndex[0])
        size = minSizeIndex[0];
    
    MPieceNode * entry = NULL;
    MPieceNode * node = NULL;
    MPieceNode * hit = NULL;
    MPieceNode * last = NULL;
    
    entry = memIndex + indexEntryNum + FindIndexEntry(size);
    
    last = entry;
    node = entry->next;
    
    while(node != NULL){
        CheckNTErrors(node->pre == last, "Something is wrong!");
        CheckNTErrors(last->next == node, "Something is wrong!");
        CheckNTErrors(node->head.state == 2, "Something is wrong!");
        last = node;
        
        if(node->size == 0){
            MPieceNode * next = node->next;
            RemoveIndexNode(node, entry);
            node = next;
            ShowNTErrors("Something is wrong!");
        }
        else{
            CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
            if(node->pReal == p){
                hit = node;
                break;
            }
            node = node->next;
        }
    }
    
    CheckNTErrors(hit != NULL, "No header is found!");
    
    hit->head.state = 1;
    
    RemoveAllocIndexNode(hit);

    MTYPE usedSize = (char*)hit->p + hit->head.size - (char*)GetPitchedAddress((char*)hit->p, MY_PITCH);
    blocks[hit->head.blockID].used -= usedSize;

    if(mergeFreeOTF){
        MHeader * head = &hit->head;
        MHeader * pre = head->pre;
        MHeader * next = head->next;
        bool mergeLeft = false;
        bool mergeRight = false;

        CheckNTErrors(head != pre, "wrong list of memory headers");
        CheckNTErrors(head != next, "wrong list of memory headers");

        if(pre != NULL && pre->state == 1 && pre->blockID == head->blockID){
            mergeLeft = true;
            head->pre = pre->pre;
            if(head->pre != NULL)
                head->pre->next = head;
            hit->p = pre->indexNode->p;
            hit->head.size += pre->size;
            RemoveAllocIndexNode(pre->indexNode);

            if(pre == blocks[head->blockID].head)
                blocks[head->blockID].head = head;
        }

        if(next != NULL && next->state == 1 && next->blockID == head->blockID){
            mergeRight = true;
            head->next = next->next;
            if(head->next != NULL)
                head->next->pre = head;
            hit->head.size += next->size;
            RemoveAllocIndexNode(next->indexNode);
        }

        if(!mergeLeft && !mergeRight){
            hit->size = usedSize;
        }
        else{
            hit->size = (char*)hit->p + hit->head.size - (char*)GetPitchedAddress((char*)hit->p, MY_PITCH);
        }
    }
    else{
        hit->size = usedSize;
    }

    AddFreeIndexNode(hit);
}

/* rebuild index to merge small fragments of memory and free the block with no use */
void XMem::RebuildIndex()
{
    int nodeNumUsed2 = indexEntryNum * 2;
    memset(memIndex2, 0, sizeof(MPieceNode) * indexEntryNum * 2);

    for(int bi = 0; bi <= curBlockID; bi++){
        XMemBlock * block = blocks + bi;
        if(block->mem == NULL || block->head == NULL)
            continue;

        MHeader * head = block->head;
        CheckNTErrors(head->size <= block->size, "Illegal memory block!");
        
        block->head = NULL;
        block->used = 0;

        /* if the block is not used, we delete it */
        if(head->state == 1 && head->size == block->size){
            if(devID < 0){
                delete[] (char*)block->mem;
            }
            else{
#ifdef USE_CUDA
                int devIDBackup = -1;
                cudaGetDevice(&devIDBackup);
                SetDevice(devID);
                CheckNTErrors(cudaFree((char*)block->mem) == cudaSuccess, "Cannot free the memory.");
                SetDevice(devIDBackup);
#else
                ShowNTErrors("Please specify USE_CUDA for compiling this program.");
#endif 
            }

            block->size = 0;
            block->mem = NULL;
        }
        else{
            /* if the block is in use, we build the index */
            int pieceCount = 0;
            MTYPE size = 0;
            MHeader * newLast = NULL;
            while(head != NULL){
                MHeader * next = head->next;
                if(head->state == 1){
                    while(next != NULL && next->state == 1){
                        head->size += next->size;
                        next = next->next;
                    }
                    head->next = next;
                }
                
                MPieceNode * node = head->indexNode;
                void * p = node->p;
                
                /* make a new index node */
                MPieceNode * newNode = memIndex2 + nodeNumUsed2++;
                newNode->p = p;
                
                if(head->state == 1){
                    newNode->size = (char*)p + head->size -
                                    (head->state == 1 ? (char*)GetPitchedAddress((char*)p, MY_PITCH) : (char*)head->indexNode->pReal);
                }
                else
                    newNode->size = node->size;
                
                newNode->pre = NULL;
                newNode->next = NULL;
                
                CheckNTErrors(newNode->size > 0, "Illegal index node!");
                
                MHeader * newHeader = &newNode->head;
                
                newHeader->indexNode = newNode;
                newHeader->pre = newLast;
                newHeader->next = NULL;
                newHeader->blockID = bi;
                newHeader->size = head->size;
                newHeader->state = head->state;
                
                if(newLast != NULL)
                    newLast->next = newHeader;
                
                if(head->state == 1){
                    newNode->pReal = NULL;
                    MPieceNode * entry = memIndex2 + FindIndexEntry(newNode->size);
                    AddFreeIndexNode(newNode, entry);
                }
                else{
                    newNode->pReal = head->indexNode->pReal;
                    MPieceNode * entry = memIndex2 + indexEntryNum + FindIndexEntry(newNode->size);
                    AddAllocIndexNode(newNode, entry);
                    block->used += head->size;
                }
                
                if(newLast == NULL)
                    block->head = newHeader;
                
                pieceCount++;
                size += head->size;
                CheckNTErrors(size <= block->size, "Illegal block size!");
                
                newLast = newHeader;
                head = next;
            }
        }
    }
    
    MPieceNode * backup = memIndex2;
    memIndex2 = memIndex;
    memIndex = backup;    
    nodeNumUsed = nodeNumUsed2;
}

/* 
reset the memory pool  
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
*/
void XMem::Reset(int myDevID)
{
    for(int i = 0; i <= curBlockID; i++){
        if(devID >= 0){
#ifdef USE_CUDA
            CheckNTErrors(cudaFree(blocks[i].mem) == cudaSuccess, "Cannot free the memory.");
#else
            ShowNTErrors("We need cuda code here!");
#endif
        }
        else
            delete[] (char*)blocks[i].mem;

        blocks[i].mem = NULL;
        blocks[i].used = 0;
        blocks[i].size = blocks[i].sizeDesired;
    }

    curBlockID = 0;
    curBlock = blocks;
    curBlock->used = 0;
    finalBlockID = 0;
    bufUsed = 0;
}

/* 
get pitch for aligned memory
>> baseAddress - where the allocated memory starts
>> mySize - size of the require memory
<< return - the actual size required for aligned memory
*/
MTYPE XMem::GetPitch(int myDevID, MTYPE baseAddress, MTYPE mySize)
{
    long long address = baseAddress + mySize;
    int offset  = address % CUDA_PITCH;
    int backOffset = offset > 0 ? CUDA_PITCH - offset : 0;
    return mySize + backOffset;
}

/* 
get pitched address for aligned memory 
>> address - the starting address
>> pitch - as it is
<< return - pitched address
*/
void * XMem::GetPitchedAddress(void * address, MTYPE pitch)
{
    MTYPE p = (MTYPE)address;
    MTYPE offset  = p % pitch;
    MTYPE backOffset = offset > 0 ? pitch - offset : 0;
    return (char*)address + backOffset;
}

/* get current address (for use) */
void * XMem::GetAddress()
{
    if(curBlock->mem == NULL)
        Alloc(devID, 0);

    return curBlock->mem;
}

/* clear it */
void XMem::Clear()
{
    if (mode == UNI_FREE) {
        for (int i = 0; i < blockNum; i++)
            blocks[i].used = 0;
        curBlock = blocks;
        curBlockID = 0;
    }
    else if (mode == FREE_ON_THE_FLY) {
        nodeNumUsed = indexEntryNum * 2;
        memset(memIndex, 0, sizeof(MPieceNode) * indexEntryNum * 2);
        for (int i = 0; i <= curBlockID; i++) {
            blocks[i].head = NULL;
            blocks[i].used = 0;
            if (i > 0) {
                blocks[i].size = blocks[i].sizeDesired;
                Free(devID, blocks[i].mem);
                blocks[i].mem = NULL;
            }
        }
        curBlock = blocks;
        curBlockID = 0;
    }
    else {
        ShowNTErrors("Something is wrong!");
    }

    signature++;
}

/* clear the buffer */
void XMem::ClearBuf()
{
    bufUsed = 0;
}

/* clear the memory pool and the buffer */
void XMem::ClearAll()
{
    Clear();
    ClearBuf();
}

/* 
set a variable to the input value
>> tgt - where we put the value
>> src - where the value is from
>> size - data size, e.g., for a float, it is sizeof(float)
>> tgtMem - the memory pool used by the target variable
>> srcMem - the memory pool used by the source variable
>>
*/
void XMem::Copy(void * tgt, void * src, int size, XMem * tgtMem, XMem * srcMem)
{
    if(srcMem == NULL || srcMem->devID < 0){
        if(tgtMem == NULL || tgtMem->devID < 0)  // host (CPU memory)  -> host (CPU memory)
            memcpy(tgt, src, size);
#ifdef USE_CUDA
        else                                     // device (GPU memory) -> host (CPU memory)
            cudaMemcpyFromSymbol(tgt, src, size);
#endif
    }
#ifdef USE_CUDA
    else{
        if(tgtMem == NULL || tgtMem->devID < 0)  // host (CPU memory)  -> device (GPU memory)
            cudaMemcpyToSymbol(tgt, src, size);
        else                                     // device (GPU memory) -> device (GPU memory)
            cudaMemcpy(tgt, src, size, cudaMemcpyDeviceToDevice);
    }
#endif
}

/* 
set a float-typed variable to the input value 
>> tgt - where we put the value
>> src - where the value is from
>> tgtMem - the memory pool used by the target variable
>> srcMem - the memory pool used by the source variable
*/
void XMem::CopyFloat(float * tgt, float * src, XMem * tgtMem, XMem * srcMem)
{
    XMem::Copy(tgt, src, sizeof(float), tgtMem, srcMem);
}

/* 
set a variable to 0 
>> tgt - where the variable is placed
>> size - data size
>> tgtMem - the memory pool used by the variable
*/
void XMem::SetZero(void * tgt, MTYPE size, XMem * tgtMem)
{
    if(tgtMem == 0 || tgtMem->devID < 0)
        memset(tgt, 0, (unsigned int)size);
#ifdef USE_CUDA
    else
        cudaMemset(tgt, 0, size);
#endif
}

/* record the pin point */
void XMem::SetPin()
{
    CheckNTErrors((finalBlockID == curBlockID), "Cannot set pin for the memory pool. Please used a larger size of the first block!");

    curBlockPin = curBlockID;
    curUsedPin = curBlock->used;
}

/* go back to the pin point */
void XMem::BackToPin()
{
    if(curBlockPin < 0)
        return;

    for(int i = curBlockPin + 1; i <= finalBlockID; i++){

        if(devID >= 0){
#ifdef USE_CUDA
            CheckNTErrors(cudaFree(blocks[i].mem) == cudaSuccess, "Cannot free the memory.");
#else
            ShowNTErrors("We need cuda code here!");
#endif
        }
        else
            delete[] (char*)blocks[i].mem;

        blocks[i].mem = NULL;
        blocks[i].used = 0;
        blocks[i].size = blocks[i].sizeDesired;
    }

    curBlockID = curBlockPin;
    curBlock = blocks + curBlockID;
    curBlock->used = curUsedPin;
    finalBlockID = curBlockID;
}

/* record the pin point for buffer */
void XMem::SetPinBuf()
{
    bufUsedPin = bufUsed;
}

/* go back to the pin point */
void XMem::BackToPinBuf()
{
    bufUsed = bufUsedPin;
}

/* transform a size into a number (in million) */
MTYPE XMem::GetMemSize(const char * size)
{
    char * s = new char[strlen(size) + 1];
    strcpy(s, size);

    ToLowercase(s);

    int len = (int)strlen(s);

    bool ok = false;
    float num = 0;

    if (s[len-1] == 'b') {
        if (s[len-2] == 'k') {
            s[len-2] = 0;
            num = (float)atof(s);
            num /= 1024.0F;
            ok = true;
        }
        else if (s[len-2] == 'm') {
            s[len-2] = 0;
            num = (float)atof(s);
            ok = true;
        }
        else if (s[len-2] == 'g') {
            s[len-2] = 0;
            num = (float)atof(s);
            num *= 1024.0F;
            ok = true;
        }
        else if (s[len-2] >= '0' && s[len-2] <= '9') {
            s[len-1] = 0;
            num = (float)atof(s);
            ok = true;
        }
        else
            ShowNTErrors("Cannot transform the size into a number (in million)!");
    }
    else if (s[len-1] >= '0' && s[len-1] <= '9') {
        num = (float)atof(s);
        ok = true;
    }
    else
        ShowNTErrors("Cannot transform the size into a number (in million)!");

    delete[] s;

    if(ok)
        return (MTYPE)num;
    else
        return 0;
}

/* transform a size into a number (in Bytes) */
MTYPE XMem::GetMemSizeInBytes(const char * size)
{
    char * s = new char[strlen(size) + 1];
    strcpy(s, size);

    ToLowercase(s);

    int len = (int)strlen(s);

    bool ok = false;
    float num = 0;

    if (s[len-1] == 'b') {
        if (s[len-2] == 'k' || s[len-2] == 'm' || s[len-2] == 'g') {
            num = (float)GetMemSize(size) * 1024 * 1024;
            ok = true;
        }
        else if (s[len-2] >= '0' && s[len-2] <= '9') {
            s[len-1] = 0;
            num = (float)atof(s);
            ok = true;
        }
        else
            ShowNTErrors("Cannot transform the size into a number (in Bytes)!");
    }
    else if (s[len-1] >= '0' && s[len-1] <= '9') {
        num = (float)atof(s);
        ok = true;
    }
    else
        ShowNTErrors("Cannot transform the size into a number (in Bytes)!");

    delete[] s;

    if(ok)
        return (MTYPE)num;
    else
        return 0;
}

/* create a new cublas handle */
void XMem::CreateBLASHandle()
{
#ifdef USE_CUDA
    if(cublasHandle != NULL){
        CheckNTErrors(cublasDestroy(cublasHandle) == CUBLAS_STATUS_SUCCESS, 
                      "Cannot destroy the cublas handle.");
    }

    CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
                  "Cannot create the cublas handle.");
#endif
}

/* show profile of the memory pool */
void XMem::ShowMemUsage(FILE * file)
{
    MTYPE blockUsed = 0;
    MTYPE blockTotal = 0;

    for(int i = 0; i < blockNum; i++){
        if(blocks[i].mem != NULL){
            blockUsed  += blocks[i].used;
            blockTotal += blocks[i].size;
        }
    }

    MTYPE bufTotal = bufSize;
    MTYPE bufUsedTotal = bufUsed;

    fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n",
           (DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal);
    fprintf(file, "buffer mem:%.1fMB used:%.1fMB usage:%.3f\n",
            (DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsedTotal / 1024 / 1024, (DTYPE)bufUsed / bufTotal);

}

#ifdef USE_CUDA

/* get the handle of cublas */
cublasHandle_t * XMem::GetCublasHandle()
{
    return &cublasHandle;
}

#endif

/* constructor */
XMemManager::XMemManager()
{
    Initialize();
}

/* de-constructor */
XMemManager::~XMemManager()
{
}

/* get memory size */
MTYPE XMemManager::GetAvailableMemory()
{
    unsigned long freeMem = 0;
#if __APPLE__
    int mib[2] = {CTL_HW, HW_MEMSIZE};
    unsigned int namelen = sizeof(mib) / sizeof(mib[0]);
    unsigned long long size;
    size_t len = sizeof(size);
    if (sysctl(mib, namelen, &size, &len, NULL, 0) < 0){
        ShowNTErrors("Cannot get memory size on Mac!");
    }
    else{
        return size;
    }
#elif _WIN32
    MEMORYSTATUSEX memoryStatus;
    memoryStatus.dwLength = sizeof(memoryStatus);
    if (GlobalMemoryStatusEx(&memoryStatus)){
        freeMem = (unsigned long)memoryStatus.ullAvailPhys;
    }
#else
    long pages = sysconf(_SC_AVPHYS_PAGES);
    long page_size = sysconf(_SC_PAGE_SIZE);
    freeMem = pages * page_size;
#endif
    return (MTYPE)freeMem;
}

/* get GPU memory size */
MTYPE XMemManager::GetAvailableGPUMemory(int devID)
{
    size_t freeMem = 0;
    
#ifdef USE_CUDA
    size_t totalMem = 0;
    cudaSetDevice(devID);
    if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess){
        XPRINT(0, stderr, "cannot get GPU memory information.");
        exit(1);
    }
#endif
    return (MTYPE)freeMem;
}

/* get buffer size */
void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
{
    *myBufSize = 0;
    if (freeMem >= MILLION * 128ULL){
        *myBufSize = MILLION * 32ULL;
        if (freeMem >= MILLION * 256ULL){
            *myBufSize = MILLION * 64ULL;
            if (freeMem >= MILLION * 512ULL){
                *myBufSize = MILLION * 128ULL;
                if (freeMem >= MILLION * 1024ULL) {
                    *myBufSize = MILLION * 128ULL;
                    if (freeMem >= MILLION * 2048ULL)
                        *myBufSize = MILLION * 128ULL;
                }
            }
        }
    }
} 

/* initialize it and set the global memory information */
void XMemManager::Initialize()
{
    srand((unsigned int)time(NULL));

    Free();
    
    /* CPUs (we actually do not care about how many CPUs are using) */
    nCPUMem = 1;

    /* GPUs */
    nGPUMem = 0;

#ifdef USE_CUDA
    if (cudaGetDeviceCount(&nGPUMem) != cudaSuccess) {
        XPRINT(0, stderr, "cannot get GPU information.");
        exit(1);
    }
#endif

}

/* free it */
void XMemManager::Free()
{
    for (int i = 0; i < MAX_CPU_MEM_NUM; i++)
        CPUMems[i].Free();
    for (int i = 0; i < MAX_GPU_MEM_NUM; i++)
        GPUMems[i].Free();
}

/* get global memory pool */
XMem * XMemManager::GetMem(const int devID)
{
    XMem * mem = NULL;
    if (devID < 0){
        if(!CPUMems[0].isInitialized){
            MTYPE freeMem = GetAvailableMemory();
            MTYPE myBufSize = 0;
            GetBufferSize(freeMem, &myBufSize);
            CPUMems[0].Initialize(-1, FREE_ON_THE_FLY, 
                                  MIN_BLOCK_SIZE_FOR_MEMPOOL, 
                                  MIN_BLOCK_NUM_FOR_MEMPOOL, 
                                  myBufSize);
        }
        mem = CPUMems;
    }
    else{
        if (devID < nGPUMem){
            if(!GPUMems[devID].isInitialized){
                MTYPE freeMem = GetAvailableGPUMemory(devID);
                MTYPE myBufSize = 0;
                GetBufferSize(freeMem, &myBufSize);
                GPUMems[devID].Initialize(devID, FREE_ON_THE_FLY, 
                                          MIN_BLOCK_SIZE_FOR_MEMPOOL, 
                                          MIN_BLOCK_NUM_FOR_MEMPOOL, 
                                          myBufSize);
            }
            mem = GPUMems + devID;
        }
        else{
            XPRINT1(0, stderr, "Please check your device id (%d)!", devID);
            ShowNTErrors("Cannot get the memory!");
        }
    }
    
    return mem;
}

/* get global memory size */
int XMemManager::GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize)
{
    XMem * mem = GetMem(devID);
    int result = 0;
    if (mem != NULL){
        *myBlockSize = mem->maxBlockSize;
        *myBlockNum = mem->blockNum;
        *myBufSize = mem->bufSize;
        result = 1;
    }
    return result;
}

/* show memory information */
void XMemManager::ShowMemInfo()
{
    XPRINT(1, stderr, "Memory Information:\n");
    MTYPE myBlockSize, myBufSize;
    int myBlockNum;
    for(int i = 0; i < nCPUMem; i++){
        GetMemSize(-1, &myBlockSize, &myBlockNum, &myBufSize);
        XPRINT3(1, stderr, " - id:-1 CPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", myBlockSize, myBlockNum, myBufSize);
    }

    for(int i = 0; i < nGPUMem; i++){
        GetMemSize(i, &myBlockSize, &myBlockNum, &myBufSize);
        XPRINT4(1, stderr, " - id:%2d GPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", i, myBlockSize, myBlockNum, myBufSize);
    }
}

} /* end of the nts (NiuTrans.Tensor) namespace */