Commit 72c9551b by huchi

refactor XList using template

parent 47c31021
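The commit replaces the untyped XList with a templated list; the CharList, IntList, and tensor-typed XList names used throughout the diff below are its typed instantiations. The template itself is not shown on this page, so the following is only a rough sketch of the idea; the member layout and growth policy are assumptions, not the actual XListV2 in the commit.

/* a minimal sketch of a typed list, NOT the actual implementation in the commit */
template <typename T>
class XListV2 {
public:
    T * items;      /* the underlying array */
    int count;      /* number of used slots */
    int maxNum;     /* capacity */

    explicit XListV2(int size = 8) : count(0), maxNum(size > 0 ? size : 8) {
        items = new T[maxNum];
    }
    ~XListV2() { delete[] items; }

    void Add(const T & item) {
        if (count == maxNum) {              /* grow on demand */
            maxNum *= 2;
            T * bigger = new T[maxNum];
            for (int i = 0; i < count; i++)
                bigger[i] = items[i];
            delete[] items;
            items = bigger;
        }
        items[count++] = item;
    }
    T Get(int i) const { return items[i]; }
    T GetItem(int i) const { return items[i]; }
    void Remove(int i) {                    /* shift the tail one slot left */
        for (int j = i; j + 1 < count; j++)
            items[j] = items[j + 1];
        count--;
    }
    void Clear() { count = 0; }
};

/* plausible typed instantiations matching the names used in the diff below (assumed) */
class XTensor;                              /* provided by the tensor library */
typedef XListV2<XTensor*> XList;
typedef XListV2<char*>    CharList;
typedef XListV2<int*>     IntList;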
#include "../../source/tensor/data/DataSet.h"
#include <fstream>
#include <iostream>
#include <string>
#include "../tensor/core/arithmetic/MatrixMul.h"
using namespace nts;
void TestDataManager() {
DataSet dataSet("src.txt", 2, 100);
XTensor src, tgt;
enum FIELD {
srcField = 0,
tgtField = 1,
};
const int indices[] = { 0, 1 };
dataSet.LoadBatch(src, indices, sizeof(indices) / sizeof(*indices), srcField);
dataSet.LoadBatch(tgt, indices, sizeof(indices) / sizeof(*indices), tgtField);
//
// tgt.Dump(stderr);
// src.Dump(stderr);
//XListV2<int> list(10);
//int* a = new int[10]{1,2,3,4,5,6,7,8,9};
//list.Add(a);
//auto x = list.Get(0);
//cout << x[0] << endl;
//list.Remove(0);
//auto y = list.Get(0);
//cout << x[0] << endl;
//delete[] a;
XList list(10);
XTensor a,b,c;
InitTensor2D(&a, 2, 2);
InitTensor2D(&b, 2, 2);
InitTensor2D(&c, 2, 2);
float arr[] = { 1., 2., 3., 4. };
a.SetData(arr, 4);
b.SetData(arr, 4);
//c.SetZeroAll();
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);
c.Dump(stderr);
CharList str(10);
char* s = new char(10);
for (int i = 0; i < 9; ++i) {
s[i] = i + 'a';
}
s[9] = 0;
str.Add(s);
cout << str.Get(0);
}
int main()
{
TestDataManager();
return 0;
}
\ No newline at end of file
@@ -478,7 +478,7 @@ split a string
 >> items - splitting result
 << return - how many items are there
 */
-int SplitALine(char * inputString, const char * seperator, XList * items)
+int SplitALine(char * inputString, const char * seperator, CharList * items)
 {
     items->Clear();
@@ -532,7 +532,7 @@ get device ids for the given device information
 */
 int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
 {
-    XList * terms = new XList(1);
+    CharList * terms = new CharList(1);
     SplitALine(devInfo, " ", terms);

     for(int i = 0; i < terms->count; i++){
...
@@ -301,8 +301,8 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
         return;

     XList list(2);
-    list.Add(t1);
-    list.Add(t2);
+    list.Add((XTensor*)t1);
+    list.Add((XTensor*)t2);

     MakeLink(&list, h, id);
 }
@@ -321,9 +321,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
         return;

     XList list(3);
-    list.Add(t1);
-    list.Add(t2);
-    list.Add(t3);
+    list.Add((XTensor*)t1);
+    list.Add((XTensor*)t2);
+    list.Add((XTensor*)t3);

     MakeLink(&list, h, id);
 }
...
@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
     isJobQueue = true;

     jobDequeuerArgs->Clear();
-    jobDequeuerArgs->Add(this);
-    jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
+
+    // warning: this may cause unknown error
+    jobDequeuerArgs->Add((XTensor*)this);
+    jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);

     jobDequeuer.function = (TFunction)DequeueJobs;
     jobDequeuer.argv = jobDequeuerArgs;
...
@@ -40,15 +40,21 @@ argument7: matrix c (c=a*b*\alpha + c*beta)
 */
 void _MatrixMul2DMultiTheading(XList * args)
 {
-    int x1 = *(int*)args->GetItem(0);
-    int y1 = *(int*)args->GetItem(1);
-    int x2 = *(int*)args->GetItem(2);
-    int y2 = *(int*)args->GetItem(3);
-    XTensor * a = (XTensor*)args->GetItem(4);
-    XTensor * b = (XTensor*)args->GetItem(5);
-    XTensor * c = (XTensor*)args->GetItem(6);
-    DTYPE alpha = *(DTYPE*)args->GetItem(7);
-    DTYPE beta = *(DTYPE*)args->GetItem(8);
+    CheckNTErrors(args->count == 2, "invalid argument number!");
+    IntList * indexArgs = (IntList*)args->GetItem(0);
+    XList * matrixArgs = (XList*)args->GetItem(1);
+    CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
+    CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
+
+    XTensor * a = matrixArgs->GetItem(0);
+    XTensor * b = matrixArgs->GetItem(1);
+    XTensor * c = matrixArgs->GetItem(2);
+    DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
+    DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
+    int x1 = *(indexArgs->GetItem(0));
+    int y1 = *(indexArgs->GetItem(1));
+    int x2 = *(indexArgs->GetItem(2));
+    int y2 = *(indexArgs->GetItem(3));

 #ifdef FAST_MATRIX
     int am = a->dimSize[1];
...
@@ -139,11 +139,11 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
     /* tensor connections */
     XList list(5);
-    list.Add(&input);
-    list.Add(&mean);
-    list.Add(&var);
-    list.Add(&a);
-    list.Add(&b);
+    list.Add((XTensor*)&input);
+    list.Add((XTensor*)&mean);
+    list.Add((XTensor*)&var);
+    list.Add((XTensor*)&a);
+    list.Add((XTensor*)&b);
     XLink::MakeLink(&list, &output, MATH_NORMALIZE);
     XLink::AddParamToHeadInt(&output, dim);
     XLink::AddParamToHead(&output, epsilon);
...
@@ -228,9 +228,9 @@ XTensor CopyIndexed(const XTensor & s, int dim,
     _CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);

     XList list(3);
-    list.Add(&s);
-    list.Add(&srcIndex);
-    list.Add(&tgtIndex);
+    list.Add((XTensor*)&s);
+    list.Add((XTensor*)&srcIndex);
+    list.Add((XTensor*)&tgtIndex);

     /* tensor connection */
     XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
...
@@ -148,8 +148,8 @@ concatenate two tensors along a given dimension
 void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim)
 {
     XList smalls(2);
-    smalls.Add(smallA);
-    smalls.Add(smallB);
+    smalls.Add((XTensor*)smallA);
+    smalls.Add((XTensor*)smallB);

     _Concatenate(&smalls, big, dim);
 }
@@ -169,8 +169,8 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
     CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");

     XList smalls(2);
-    smalls.Add(&smallA);
-    smalls.Add(&smallB);
+    smalls.Add((XTensor*)&smallA);
+    smalls.Add((XTensor*)&smallB);

     bool uniform = true;
     for (int i = 1; i < smalls.count; i++) {
...
@@ -85,12 +85,12 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
         }
     }
     else {
-        XList * sourceArrays = new XList(smalls->count);
+        CharList * sourceArrays = new CharList(smalls->count);
         int * blockSizes = new int[smalls->count];
         for (int i = 0; i < smalls->count; i++) {
             XTensor * tensor = (XTensor*)smalls->GetItem(i);
             blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
-            sourceArrays->Add(tensor->data);
+            sourceArrays->Add((char*)tensor->data);
         }

         _MergeBlockLists(sourceArrays, blockSizes, blockNum, big->data, big->mem);
...
@@ -376,8 +376,8 @@ XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
     big.SetTMPFlag();

     XList smalls(2);
-    smalls.Add(&smallA);
-    smalls.Add(&smallB);
+    smalls.Add((XTensor*)&smallA);
+    smalls.Add((XTensor*)&smallB);

     /* call _Merge function */
     _Merge(&smalls, &big, whereToMerge);
...
@@ -34,7 +34,7 @@ merge data by blocks
 >> target - target data array
 >> myMem - memory pool
 */
-void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
+void _MergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
 {
     if (myMem != NULL && myMem->devID >= 0) {
 #ifdef USE_CUDA
...
@@ -71,7 +71,7 @@ merge data by blocks (cuda version)
 >> target - target data array
 >> myMem - the memory pool
 */
-void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
+void _CudaMergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
 {
     CheckNTErrors((myMem != NULL), "No memory pool!");
     CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
...
@@ -33,7 +33,7 @@ __global__
 void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);

 /* merge data by blocks (cuda version) */
-void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
+void _CudaMergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);

 #endif // USE_CUDA
...
@@ -27,7 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* merge data by blocks */
-void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
+void _MergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);

 } // namespace nts(NiuTrans.Tensor)
...
@@ -78,7 +78,7 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
 #endif
     }
     else {
-        XList * sourceArrays = new XList(blockNumB);
+        CharList * sourceArrays = new CharList(blockNumB);
         int * blockSizes = new int[blockNumB];

         for (int i = 0; i < blockNumA; i++) {
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <stdarg.h>
#include <math.h>
#include "XMatrixSegment.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel
>> parallelRunner - parallel runner
>> job - the function to run
>> opNum - number of operations
>> rowNum - number of rows
>> colNum - number of columns
>> argNum - number of arguments of the jobs
>> ... - arguments of the jobs
*/
void RunParallel2D(XPRunner * parallelRunner, void * job,
int opNum, int rowNum, int colNum, int argNum, ...)
{
if (rowNum == 0 || colNum == 0)
return;
int jobNum = 1;
if (parallelRunner != NULL && (parallelRunner->method == PRUNNER_SINGLE || parallelRunner->method == PRUNNER_MULTIPLE)) {
if (opNum >= parallelRunner->minimumOPNum * parallelRunner->threadNum)
jobNum = parallelRunner->GetJobNum(rowNum * colNum);
}
CheckNTErrors(jobNum != 0, "TODO!");
/* argument list of the jobs */
XList * jobArgList = new XList(argNum);
va_list ap;
va_start(ap, argNum);
for (int i = 0; i < argNum; i++) {
XTensor* p = va_arg(ap, XTensor*);
jobArgList->Add(p);
}
va_end(ap);
/* prepare the necessary argument list for parallel processing */
XList * jobs = new XList(jobNum);
XList * args = new XList(jobNum);
int * indexList = new int[jobNum * 4 * 4];
/* segment the matrix into blocks */
int nblock = SegmentTensor2D(rowNum, colNum, jobNum, indexList);
/*
assign jobs
argument rules:
1. block information
2. other arguments
*/
for (int i = 0; i < jobNum; i++) {
XList * blockArgs = new XList(argNum + 4);
int * blockIndex = indexList + i * 4;
blockArgs->Add((XTensor*)blockIndex);
blockArgs->Add((XTensor*)(blockIndex + 1));   /* parenthesize: advance by ints, then cast */
blockArgs->Add((XTensor*)(blockIndex + 2));
blockArgs->Add((XTensor*)(blockIndex + 3));
for (int j = 0; j < argNum; j++)
blockArgs->Add(jobArgList->GetItem(j));
args->Add((XTensor*)blockArgs);
jobs->Add((XTensor*)job);
}
args->count = nblock;
jobs->count = nblock;
/* single job */
if (jobNum == 1)
((TFunction)job)((XList*)args->GetItem(0));
/* multiple jobs */
else
parallelRunner->Run(jobs, args);
/* free the memory */
delete[] indexList;
for (int i = 0; i < args->count; i++) {
XList * blockArgs = (XList*)args->GetItem(i);
delete blockArgs;
}
delete args;
delete jobs;
delete jobArgList;
}
/*
segment a block into sub-blocks
>> rowNum - number of rows
>> colNum - number of columns
>> blockNum - number of sub-blocks
>> blockIndex - upper-left and bottom-right corners of each sub-block
<< return - the number of resulting sub-blocks
*/
int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex)
{
int total = rowNum * colNum;
int rowSize = (int)ceil(sqrt((float)total / blockNum));
int colSize = rowSize;
/* a narrow matrix */
if (rowSize > colNum * 0.9) {
rowSize = colNum;
colSize = (int)ceil((float)rowNum / blockNum);
}
/* a narrow matrix */
if (colSize > rowNum * 0.9) {
colSize = rowNum;
rowSize = (int)ceil((float)colNum / blockNum);
}
if (blockNum == 1) {
colSize = rowNum;
rowSize = colNum;
}
CheckNTErrors((colSize <= rowNum && rowSize <= colNum),
"Too large block!");
int x1, y1, x2, y2;
int xMax = rowNum - 1;
int yMax = colNum - 1;
int nblock = 0, nitem = 0;
int * indexList = blockIndex;
int xSegNum = int((float)rowNum / colSize);
int ySegNum = int((float)colNum / rowSize);
int marginBlockNum = blockNum - xSegNum * ySegNum;
/*
To maximize the number of resulting sub-block, we have to
make use of the margin block
*/
if (blockNum > 1 && marginBlockNum > 0) {
int margin = 0;
int step = 0;
if (rowNum < colNum) {
margin = int(((float)marginBlockNum / blockNum) * colNum);
step = (int)ceil((float)rowNum / marginBlockNum);
x1 = 0;
y1 = yMax - margin + 1;
x2 = step - 1;
y2 = yMax;
while (x2 <= xMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (x2 == xMax)
break;
x1 = x2 + 1;
x2 = x1 + step - 1;
if (x2 > xMax)
x2 = xMax;
}
yMax -= margin;
}
else {
margin = int(((float)marginBlockNum / blockNum) * rowNum);
step = (int)ceil((float)colNum / marginBlockNum);
x1 = xMax - margin + 1;
y1 = 0;
x2 = xMax;
y2 = step - 1;
while (y2 <= yMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (y2 == yMax)
break;
y1 = y2 + 1;
y2 = y1 + step - 1;
if (y2 > yMax)
y2 = yMax;
}
xMax -= margin;
}
colSize = (int)ceil((float)(xMax + 1) / xSegNum);
rowSize = (int)ceil((float)(yMax + 1) / ySegNum);
}
x1 = 0;
y1 = 0; // upper-left corner
x2 = colSize - 1;
y2 = rowSize - 1; // bottom-right corner
/* the main body of the matrix (after removing the margin block) */
while (x1 <= xMax) {
y1 = 0;
x2 = x1 + colSize - 1;
y2 = y1 + rowSize - 1;
if (x2 > xMax) {
x2 = xMax;
}
while (y2 <= yMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (y2 == yMax)
break;
y1 = y2 + 1;
y2 = y1 + rowSize - 1;
if (y2 > yMax)
y2 = yMax;
CheckNTErrors((nblock <= blockNum),
"Fail to segment the matrix!");
}
x1 = x2 + 1;
}
CheckNTErrors(nitem == rowNum * colNum,
"Fail to segment the matrix!");
return nblock;
}
/*
segment a block into sub-blocks (each block consists of a number of rows)
>> rowNum - number of rows
>> colNum - number of columns
>> blockNum - number of sub-blocks
>> blockIndex - upper-left and bottom-right corners of each sub-block
<< return - the number of resulting sub-blocks
*/
int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex)
{
if (rowNum < blockNum) {
blockIndex[0] = 0;
blockIndex[1] = 0;
blockIndex[2] = rowNum - 1;
blockIndex[3] = colNum - 1;
return 1;
}
int segSize = (int)ceil((float)rowNum / blockNum);
int x1 = 0;
int x2 = x1 + segSize - 1;
int y1 = 0;
int y2 = colNum - 1;
int last = rowNum - 1;
int nblock = 0;
while (x1 <= last) {
x2 = x1 + segSize - 1;
if (x2 > last) {
x2 = last;
}
int * blockInfo = blockIndex + 4 * nblock;
blockInfo[0] = x1;
blockInfo[1] = y1;
blockInfo[2] = x2;
blockInfo[3] = y2;
nblock++;
if (x2 == last)
break;
x1 += segSize;
}
return nblock;
}
} // namespace nts(NiuTrans.Tensor)
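For reference, a small standalone sketch of how SegmentTensor2D might be called (illustrative only; it assumes the declaration lives in XMatrixSegment.h, as the include above suggests, and the 8 x 6 matrix size and block count are made up):

#include <cstdio>
#include "XMatrixSegment.h"
using namespace nts;

int main()
{
    const int blockNum = 4;
    int blockIndex[blockNum * 4 * 4];   /* 4 ints per block, oversized like RunParallel2D above */

    /* split an 8 x 6 matrix into at most 4 blocks */
    int nblock = SegmentTensor2D(8, 6, blockNum, blockIndex);

    for (int i = 0; i < nblock; i++) {
        int * b = blockIndex + i * 4;
        printf("block %d: upper-left (%d, %d), bottom-right (%d, %d)\n",
               i, b[0], b[1], b[2], b[3]);
    }
    return 0;
}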
@@ -51,12 +51,12 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
     CheckNTErrors(jobNum != 0, "TODO!");

     /* argument list of the jobs */
-    XList * jobArgList = new XList(4);
+    XList * jobArgList = new XList(argNum);

     va_list ap;
     va_start(ap, argNum);
     for (int i = 0; i < argNum; i++) {
-        void * p = va_arg(ap, void*);
+        XTensor* p = va_arg(ap, XTensor*);
         jobArgList->Add(p);
     }
     va_end(ap);
@@ -77,27 +77,30 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
     2. other arguments
     */
     for (int i = 0; i < jobNum; i++) {
-        XList * blockArgs = new XList(argNum + 4);
+        IntList* indexArgs = new IntList(4);
+        XList * blockArgs = new XList(argNum);
         int * blockIndex = indexList + i * 4;

-        blockArgs->Add(blockIndex);
-        blockArgs->Add(blockIndex + 1);
-        blockArgs->Add(blockIndex + 2);
-        blockArgs->Add(blockIndex + 3);
+        indexArgs->Add(blockIndex);
+        indexArgs->Add(blockIndex + 1);
+        indexArgs->Add(blockIndex + 2);
+        indexArgs->Add(blockIndex + 3);

         for (int j = 0; j < argNum; j++)
             blockArgs->Add(jobArgList->GetItem(j));

-        args->Add(blockArgs);
-        jobs->Add((void*)job);
+        args->Add((XTensor*)indexArgs);
+        args->Add((XTensor*)blockArgs);
+        jobs->Add((XTensor*)job);
     }

-    args->count = nblock;
     jobs->count = nblock;

     /* single job */
     if (jobNum == 1)
-        ((TFunction)job)((XList*)args->GetItem(0));
+        ((TFunction)job)(args);
     /* multiple jobs */
     else
         parallelRunner->Run(jobs, args);
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-05
*/
#include "DataSet.h"
#include "StringUtil.h"
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
using namespace nts;
const int PAD = 0;
/*
load data from the file to the buffer
the data format:
each line: field1 ||| field2 ||| ... ||| fieldn
i.e. fields are separated by `|||` and tokens are separated by space
this will sort the data in ascending order if `sortBuffer` is True
*/
void DataSet::LoadDataToBuffer()
{
string line;
buffer.clear();
bufferUsed = 0;
const string tokenDelimiter = " ";
const string fieldsDelimiter = "|||";
int counter = bufferSize;
while (getline(*fp, line) && counter) {
auto fields = Split<string>(line, fieldsDelimiter);
for (const auto& field : fields) {
auto elements = Split<int>(field, tokenDelimiter);
buffer.emplace_back(elements);
}
counter--;
}
if (fp->eof()) {
fp->seekg(fp->beg);
// LOG("start a new epoch");
}
//if (sortBuffer) {
// auto myCompare = [](const auto& a, const auto& b) {
// return a.first.size() < b.first.size();
// };
// sort(buffer.begin(), buffer.end(), myCompare);
//}
}
/*
select a field and generate a mini-batch by indices
>>> t - a tensor to store the batch
>>> indices - the indices of data
>>> batchSize - the number of indices
>>> field - indicates which field is selected
this will combine selected elements into a padded batch
*/
void DataSet::LoadBatch(XTensor& t, const int* indices, size_t batchSize, size_t field)
{
if (bufferUsed == bufferSize || bufferUsed == 0) {
LoadDataToBuffer();
}
/* get the maximum length in a mini-batch */
size_t maxLength = 0;
for (size_t i = 0; i < batchSize; ++i) {
int idx = *(indices + i);
maxLength = max(maxLength, buffer[idx * fieldNum + field].size());
}
int* data = new int[maxLength * batchSize];
memset(data, 0, maxLength * batchSize * sizeof(int));
size_t cur = 0;
for (size_t i = 0; i < batchSize; ++i) {
size_t next = cur + maxLength;
int idx = *(indices + i);
for (int v : buffer[idx * fieldNum + field]) {
data[cur++] = v;
}
/* pad zeros */
while (cur < next) {
data[cur++] = 0;
}
}
InitTensor2D(&t, batchSize, maxLength, X_INT);
t.SetData(data, maxLength * batchSize);
bufferUsed += batchSize;
delete[] data;
}
/*
the constructor of DataSet
>>> fname - path of the data file
>>> paraFieldNum - the number of different fields
>>> paraBufferSize - size of each field in the buffer
the real size of buffer is `bufferSize * fieldNum`
*/
DataSet::DataSet(const char* fname, size_t paraFieldNum, size_t paraBufferSize)
{
fp = new ifstream(fname);
fieldNum = paraFieldNum;
bufferSize = paraBufferSize;
bufferUsed = 0;
buffer.reserve(bufferSize * fieldNum);
CheckNTErrors(fp, "unable to open the file: %s", fname);
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#include <vector>
using namespace std;
using BatchType = vector<vector<int>>;
using BucketType = vector<vector<int>>;
using BufferType = vector<vector<int>>;
namespace nts { // namespace nts(NiuTrans.Tensor)
/* A `DataSet` is associated with a file which contains variable length data.*/
class DataSet {
private:
/* the data buffer */
BufferType buffer;
/* the pointer to file stream */
ifstream* fp;
/* number of different fields */
size_t fieldNum;
/* size of the data buffer */
size_t bufferSize;
/* size of used data in buffer */
size_t bufferUsed;
/* load data from a file to the buffer */
void LoadDataToBuffer();
public:
/* sort data in the buffer */
virtual void Sort(){};
/* modify data in the buffer */
virtual void Process(){};
/* group data to buckets */
virtual BucketType Bucketing() { return BucketType(); };
/* generate a mini-batch */
void LoadBatch(XTensor& t, const int* indices, size_t batchSize, size_t field);
/* constructor */
explicit DataSet(const char* fname, size_t paraFieldNum, size_t paraBufferSize);
};
} // namespace nts(NiuTrans.Tensor)
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-02
*/
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <vector>
#include <cstdio>
#include <fstream>
#include <unordered_map>
using namespace std;
using namespace nts;
/*
* An `Embedding` is associated with an embeddings table (embedNum * embedDim).
* The table can be loaded from a file or constructed from an instance.
*/
class Embedding {
public:
XTensor embeddingTable;
public:
/* save the embeddings table to a file */
void Dump(const char* fname);
/* looks up ids in a list of embedding tensors */
XTensor EmbeddingLookup(const XTensor& ids);
/* load an embeddings table from a file */
explicit Embedding(const char* fname);
/* construct table from an XTensor instance */
explicit Embedding(XTensor& table);
/* construct the table from an array */
explicit Embedding(const float* p, size_t embedNum, size_t embedDim);
};
#endif // __EMBEDDING_H__
\ No newline at end of file
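A rough usage sketch of this interface (illustrative only; the file name, the id values, the 1-D shape of `ids`, and the use of the library's InitTensor1D helper are assumptions, not taken from the commit):

#include "Embedding.h"
using namespace nts;

int main()
{
    Embedding emb("embedding.txt");              /* load an (embedNum x embedDim) table */

    XTensor ids;
    InitTensor1D(&ids, 3, X_INT);                /* three token ids to look up */
    int idValues[] = { 0, 1, 2 };
    ids.SetData(idValues, 3);

    XTensor vectors = emb.EmbeddingLookup(ids);  /* one embedding row per id */
    vectors.Dump(stderr);
    return 0;
}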
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "StringUtil.h"
namespace nts {
/* split string by delimiter, this will return indices of all sub-strings */
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter)
{
vector<pair<int, int>> fields;
if (delimiter.length() == 0) {
fields.emplace_back(0, s.length());
return fields;
}
int pos = 0;
int start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
fields.emplace_back(start, pos);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
fields.emplace_back(start, s.length());
}
return fields;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef __STRING_UTIL_H__
#define __STRING_UTIL_H__
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>
using namespace std;
namespace nts {
/* Splits a string based on the given delimiter string. Each pair in the
* returned vector has the start and past-the-end positions for each of the
* parts of the original string. Empty fields are not represented in the output.
*/
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter);
/* Splits the given string and converts each part to the given T. */
template <typename T>
vector<T> Split(const string& s, const string& delimiter);
template <>
inline vector<string> Split(const string& s, const string& delimiter)
{
vector<string> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(s.substr(p.first, p.second - p.first));
}
return fields;
}
template <>
inline vector<int> Split(const string& s, const string& delimiter)
{
vector<int> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<int64_t> Split(const string& s, const string& delimiter)
{
vector<int64_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtoll(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<float> Split(const string& s, const string& delimiter)
{
vector<float> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtod(s.data() + p.first, nullptr));
}
return fields;
}
template <>
inline vector<uint8_t> Split(const string& s, const string& delimiter)
{
vector<uint8_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<bool> Split(const string& s, const string& delimiter)
{
vector<bool> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(
static_cast<bool>(strtol(s.data() + p.first, nullptr, 10)));
}
return fields;
}
} // namespace nts
#endif // __STRING_UTIL_H__
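A quick standalone usage sketch of the helpers above (illustrative only; the literal strings are made up):

#include "StringUtil.h"
#include <iostream>
using namespace nts;

int main()
{
    /* fields are separated by "|||", tokens by a single space */
    vector<string> fields = Split<string>("1 2 3 ||| 4 5", "|||");   /* {"1 2 3 ", " 4 5"} */
    vector<int> tokens = Split<int>(fields[0], " ");                 /* {1, 2, 3} */

    for (int v : tokens)
        std::cout << v << " ";
    std::cout << std::endl;
    return 0;
}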
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "Vocabulary.h"
#include "StringUtil.h"
#include <algorithm>
#include <fstream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
using namespace nts;
/* initialize the mappings with some special tokens */
void nts::Vocabulary::Init()
{
const std::map<std::string, int> init{
{ "<UNK>", 0 },
{ "<PAD>", 1 },
{ "<SOS>", 2 },
{ "<EOS>", 3 },
};
for (const auto& v : init) {
token2ID[v.first] = v.second;
id2Token[v.second] = v.first;
}
}
/*
load vocabulary from disk
format(each line): token<\n>
*/
nts::Vocabulary::Vocabulary(const char* vocabPath)
{
Init();
std::ifstream file(vocabPath);
string line;
int index = int(token2ID.size());
while (getline(file, line)) {
size_t pos = line.find("\n");
string token = line.substr(0, pos);
token2ID[token] = index;
id2Token[index++] = token;
}
currentSize = int(token2ID.size());
}
/*
build vocabulary from the source file
>> srcPath - the path of source file
>> minFreq - the user specified minimum count for a single token
>> maxSize - the user specified maxmum size of a vocabulary
this will sort tokens by frequency and map the most uncommon tokens to <UNK>
*/
nts::Vocabulary::Vocabulary(const char* srcPath, int minFreq, int maxSize)
{
Init();
using Dict = vector<pair<string, int>>;
/* count word and their frequency */
std::ifstream file(srcPath);
map<string, int> dict;
string token;
while (file >> token) {
dict[token] += 1;
}
/* sort tokens by frequency */
Dict dictFlatten;
copy(dict.begin(), dict.end(), back_inserter<Dict>(dictFlatten));
auto myCompare = [](const auto& a, const auto& b) {
return a.second > b.second;   /* descending, so the most frequent tokens come first */
};
sort(dictFlatten.begin(), dictFlatten.end(), myCompare);
/* skip tokens whose frequency is outside the top-maxSize or less than minFreq */
size_t index = token2ID.size();
size_t validSize = (size_t)maxSize < dict.size() ? (size_t)maxSize : dict.size();   /* keep at most maxSize tokens */
for (const auto& tokenID : dictFlatten) {
if (validSize && tokenID.second > minFreq &&
token2ID.find(tokenID.first) == token2ID.end()) {
id2Token[int(index)] = tokenID.first;
token2ID[tokenID.first] = int(index);
index++;
validSize--;
}
}
currentSize = int(token2ID.size());
}
/* insert a token to the vocabulary */
void nts::Vocabulary::Insert(string token)
{
if (token2ID.find(token) == token2ID.end()) {   /* only insert tokens that are not present yet */
token2ID[token] = currentSize;
id2Token[currentSize++] = token;
}
}
/* save the vocabulary to a file */
void nts::Vocabulary::Dump(const char* vocabPath)
{
std::ofstream file(vocabPath);
for (const auto& tokenID : token2ID)
file << tokenID.first << "\t" << tokenID.second << "\n";
}
/*
maps tokens to integers
>>> tokens - a list of tokens
<<< indices - a list of corresponding indices
note that this will map OOV tokens to <UNK>
*/
vector<int> nts::Vocabulary::Token2ID(vector<string> tokens)
{
vector<int> indices;
indices.reserve(tokens.size());
for (auto str : tokens) {
if (token2ID.find(str) == token2ID.end())
indices.emplace_back(token2ID["<UNK>"]);
else
indices.emplace_back(token2ID[str]);
}
return indices;
}
/*
maps integers to tokens
>>> indices - a list of indices
<<< tokens - a list of corresponding tokens
note that this will throw an error if the id is not found
*/
vector<string> nts::Vocabulary::ID2Token(vector<int> ids)
{
vector<string> tokens;
tokens.reserve(ids.size());
for (auto id : ids) {
CheckNTErrors(id2Token.find(id) != id2Token.end(), "id not found!");
tokens.emplace_back(id2Token[id]);
}
return tokens;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __VOCABULARY_H__
#define __VOCABULARY_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#define LOG(...) \
do { \
fprintf(stderr, "[INFO] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
#define ERR(...) \
do { \
fprintf(stderr, "[ERROR] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
* A Vocabulary maps strings to integers, allowing for strings to be mapped to an out-of-vocabulary token.
* Vocabularies are fit to a particular dataset, which we use to decide which tokens are in-vocabulary.
* For convenience, a vocabulary can be built from a source file or be loaded from a vocabulary file.
*/
class Vocabulary {
private:
/* current size of the vocabulary */
int currentSize = 0;
/* initialize mappings with some special tokens */
void Init();
public:
/* a dict maps tokens to indices */
unordered_map<string, int> token2ID;
/* a dict maps indices to tokens */
unordered_map<int, string> id2Token;
/* save the vocabulary to a file */
void Dump(const char* vocabPath);
/* append token to the vocabulary */
void Insert(string token);
/* maps integers to tokens */
vector<string> ID2Token(vector<int> ids);
/* maps tokens to integers */
vector<int> Token2ID(vector<string> tokens);
/* constructor */
/* load the vocabulary from a file */
explicit Vocabulary(const char* vocabPath);
/* build the vocabulary from a source file */
explicit Vocabulary(const char* srcPath, int minFreq, int maxSize);
};
} // namespace nts(NiuTrans.Tensor)
#endif // __VOCABULARY_H__
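A short usage sketch of this class (the vocabulary file name and tokens are hypothetical):

#include "Vocabulary.h"
using namespace nts;

int main()
{
    Vocabulary vocab("vocab.txt");                     /* one token per line */

    /* OOV tokens are mapped to <UNK> */
    vector<int> ids = vocab.Token2ID({ "I", "You", "someRareToken" });
    vector<string> tokens = vocab.ID2Token(ids);

    for (const auto& t : tokens)
        LOG("%s", t.c_str());
    return 0;
}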
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef _DATA_READER_H_
#define _DATA_READER_H_
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#define LOG(...) \
do { \
fprintf(stderr, "[INFO] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
#define ERR(...) \
do { \
fprintf(stderr, "[ERROR] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
* A Vocabulary maps strings to integers, allowing for strings to be mapped to an out-of-vocabulary token.
* Vocabularies are fit to a particular dataset, which we use to decide which tokens are in-vocabulary.
* For convenience, a vocabulary can be built from a source file or be loaded from a vocabulary file.
*/
class Vocabulary {
private:
/* current size of the vocabulary */
int currentSize = 0;
/* initialize mappings with some special tokens */
void Init();
public:
/* a dict maps tokens to indices */
unordered_map<string, int> token2ID;
/* a dict maps indices to tokens */
unordered_map<int, string> id2Token;
Vocabulary() = delete;
Vocabulary(Vocabulary&&) = delete;
/* load the vocabulary from a file */
explicit Vocabulary(const char* vocabPath);
/* build the vocabulary from a source file */
explicit Vocabulary(const char* srcPath, int minFreq, int maxSize);
/* append token to the vocabulary */
void Insert(string token);
/* save the vocabulary to a file */
void Dump(const char* vocabPath);
/* maps tokens to integers */
vector<int> Token2ID(vector<string> tokens);
/* maps integers to tokens */
vector<string> ID2Token(vector<int> ids);
};
/*
* A DataSet maintains a sequences buffer.
* An instance is tied to a specified file and buffer size.
*/
class DataSet {
public:
/* number of buffered sequences */
int bufferSize;
/* the pointer to file stream */
ifstream* fp;
/* source-side vocabulary */
Vocabulary* srcVocab;
/* target-side vocabulary */
Vocabulary* tgtVocab;
/* sequences buffer */
vector<pair<vector<int>, vector<int>>> dataBuf;
DataSet(const DataSet&) = delete;
DataSet(DataSet&&) = delete;
/* constructor */
explicit DataSet(const char* fname, const char* srcVocabPath, const char* tgtVocabPath, int paraBufferSize, bool isShuffled)
{
bufferSize = paraBufferSize;
srcVocab = new Vocabulary(srcVocabPath);
tgtVocab = new Vocabulary(tgtVocabPath);
string trainFN = fname;
#ifndef WIN32
if (isShuffled) {
Shuffle(trainFN);
trainFN += ".random";
}
#endif
fp = new ifstream();
fp->open(trainFN);
CheckNTErrors(fp, "unable to open %s", fname);
}
/* destructor */
~DataSet()
{
fp->close();
}
/* loading data to the buffer */
void LoadDataToBuf();
};
/*
* A DataManager splits buffer to buckets and generates batches.
* It must be associated with a dataset.
*/
class DataManager {
private:
/* size of a mini-batch */
int batchSize;
/* the maximum length of a sequence */
int maxSeqLen;
/* the minimum length of a sequence */
int minSeqLen;
/* index of current bucket */
int bucketIter = 0;
/* the associated data set */
DataSet* dataSet;
/* bucket positions in the buffered sequences */
vector<vector<int>> buckets;
DataManager() = delete;
DataManager(DataManager&&) = delete;
DataManager(const DataManager&) = delete;
/* splits sequences to buckets */
void BucketSeqByLen(int batchSize);
public:
/* constructor */
explicit DataManager(const char* fname,
const char* srcVocab, const char* tgtVocab,
int paraBufferSize, bool isShuffled,
int paraMaxSeqLen, int paraMinSeqLen, int paraBatchSize)
{
batchSize = paraBatchSize;
maxSeqLen = paraMaxSeqLen;
minSeqLen = paraMinSeqLen;
dataSet = new DataSet(fname, srcVocab, tgtVocab, paraBufferSize, isShuffled);
}
/* destructor */
~DataManager()
{
delete dataSet;
}
/* loads a mini-batch */
void LoadBatch(XTensor* batchEnc,
XTensor* paddingEnc, XTensor* batchDec,
XTensor* paddingDec, XTensor* label,
int devID, XMem* mem);
};
/* shuffle a file and pipe it to an output file */
void Shuffle(const char* srcFile);
} // namespace nts(NiuTrans.Tensor)
#endif // _DATA_READER_H_
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "T2TDataReader.h"
#include "StringUtil.h"
#include <algorithm>
#include <cstdio>
#include <fstream>
void nts::T2TDataSet::Process()
{
}
/* split buffer to buckets */
void nts::T2TDataManager::BucketSeqByLen(int batchSize)
{
t2tDataSet->LoadDataToBuf();
auto myCompare = [](const auto& a, const auto& b) {
return a.first.size() < b.first.size();   /* sort by source-side length */
};
sort(dataSet->dataBuf.begin(), dataSet->dataBuf.end(), myCompare);
int realBatchSize = 0;
for (int i = 0; i < dataSet->dataBuf.size(); ++i) {
/* the first allocation */
if (!bucketBoundary.size()) {
bucketBoundary.emplace_back(1);
}
/* append the current sentence to bucket if the total size is suitable */
int seqLen = dataSet->dataBuf[i].first.size();
int seqNum = bucketBoundary.back().size();
if ((seqNum + 1) * seqLen < batchSize && seqLen <= maxSeqLen && seqLen >= minSeqLen) {
bucketBoundary.back().push_back(i);
}
/* allocate a new bucket */
else {
bucketBoundary.emplace_back(1);
}
}
/* drop the last bucket if its size is too small */
if (bucketBoundary.back().size() * dataSet->dataBuf.back().first.size() < batchSize / 2)
bucketBoundary.pop_back();
}
/*
load a batch of sequences (for MT)
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
void nts::T2TDataManager::LoadBatchMT(XTensor* batchEnc,
XTensor* paddingEnc, XTensor* batchDec,
XTensor* paddingDec, XTensor* label,
int devID, XMem* mem, bool isTraining)
{
if (bucketBoundary.empty() || bucketIter == int(bucketBoundary.size()) - 1) {
BucketSeqByLen(batchSize);
bucketIter = 0;
}
auto bucket = bucketBoundary[bucketIter++];
int sentenceCount = bucket.size();
int maxEnc = dataSet->dataBuf[bucket.back()].first.size();
int maxDec = dataSet->dataBuf[bucket.back()].second.size();
InitTensor2D(batchEnc, sentenceCount, maxEnc, X_INT, devID, mem);
InitTensor2D(paddingEnc, sentenceCount, maxEnc, X_FLOAT, devID, mem);
InitTensor2D(batchDec, sentenceCount, maxDec, X_INT, devID, mem);
InitTensor2D(paddingDec, sentenceCount, maxDec, X_FLOAT, devID, mem);
InitTensor2D(label, sentenceCount, maxDec, X_INT, devID, mem);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
batchDec->SetZeroAll();
paddingDec->SetZeroAll();
label->SetZeroAll();
int* labelValues = new int[batchDec->unitNum];
int* batchEncValues = new int[batchEnc->unitNum];
int* batchDecValues = new int[batchDec->unitNum];
MTYPE* paddingEncOffsets = new MTYPE[batchEnc->unitNum];
MTYPE* paddingDecOffsets = new MTYPE[batchDec->unitNum];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
memset(paddingEncOffsets, 0, sizeof(MTYPE) * batchEnc->unitNum);
memset(paddingDecOffsets, 0, sizeof(MTYPE) * batchDec->unitNum);
memset(labelValues, 0, sizeof(int) * batchDec->unitNum);
int srcIter = 0;
int tgtIter = 0;
for (const auto& index : bucket) {
int srcOffset = srcIter + dataSet->dataBuf[bucket.back()].first.size();
int tgtOffset = tgtIter + dataSet->dataBuf[bucket.back()].second.size();
for (const auto& src : dataSet->dataBuf[index].first)
batchEncValues[srcIter++] = src;
for (const auto& src : dataSet->dataBuf[index].second)
batchDecValues[tgtIter++] = src;
while (srcIter < srcOffset)
paddingEncOffsets[srcIter++] = 1.0F;
while (tgtIter < tgtOffset)
paddingDecOffsets[tgtIter++] = 1.0F;
}
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingEnc->SetData(paddingEncOffsets, batchEnc->unitNum);
paddingDec->SetData(paddingDecOffsets, batchDec->unitNum);
label->SetData(labelValues, label->unitNum);
delete[] batchEncValues;
delete[] batchDecValues;
delete[] labelValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef _T2T_DATA_READER_H_
#define _T2T_DATA_READER_H_
#include "../XTensor.h"
#include "DataReader.h"
#define LOG(...) \
do { \
fprintf(stderr, "[INFO] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
#define ERR(...) \
do { \
fprintf(stderr, "[ERROR] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
class T2TDataSet : public DataSet {
public:
/* source-side vocabulary */
Vocabulary* srcVocab;
/* target-side vocabulary */
Vocabulary* tgtVocab;
/* sequences buffer */
vector<pair<vector<int>, vector<int>>> t2tDataBuf;
/* constructor */
explicit T2TDataSet(const char* fname, int paraBufferSize,
const char* srcVocabPath, const char* tgtVocabPath, bool isShuffled)
: DataSet(fname, paraBufferSize, isShuffled)
{
srcVocab = new Vocabulary(srcVocabPath);
tgtVocab = new Vocabulary(tgtVocabPath);
}
/* destructor */
~T2TDataSet()
{
delete srcVocab;
delete tgtVocab;
}
/* load sentences into the buffer */
void LoadDataToBuf() override;
/* post processing of sequences after loading */
void Process() override;
};
class T2TDataManager : public DataManager {
public:
/* the associated data set */
T2TDataSet* t2tDataSet;
/* constructor */
explicit T2TDataManager(bool isShuffled, const char* fname,
int paraBatchSize, int paraMaxSeqLen,
int paraMinSeqLen, int paraBufferSize,
const char* srcVocabPath, const char* tgtVocabPath)
: DataManager(fname, paraBufferSize, isShuffled, paraMaxSeqLen, paraMinSeqLen, paraBatchSize)
{
maxSeqLen = paraMaxSeqLen;
minSeqLen = paraMinSeqLen;
t2tDataSet = new T2TDataSet(fname, paraBufferSize, srcVocabPath, tgtVocabPath, isShuffled);
}
/* load a mini-batch for machine translation */
void LoadBatchMT(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* label, int devID, XMem* mem,
bool isTraining);
};
} // namespace nts(NiuTrans.Tensor)
#endif // _T2T_DATA_READER_H_
我 你 他 <EOS> ||| <SOS> I You He <EOS>
我 你 <EOS> ||| <SOS> I You <EOS>
\ No newline at end of file
I
You
He
\ No newline at end of file