Commit da1d7ca8 by xiaotong

tester

parent 1faabe78
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include "T2TBatchLoader.h"
#include "T2TUtility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
namespace transformer
{
/* constructor */
T2TBatchLoader::T2TBatchLoader()
{
seqLen = NULL;
seqLen2 = NULL;
nseqBuf = 0;
nextSeq = -1;
nextBatch = -1;
buf = NULL;
buf2 = NULL;
bufBatch = NULL;
bufSize = 0;
bufBatchSize = 0;
seqOffset = NULL;
}
/* destructor */
T2TBatchLoader::~T2TBatchLoader()
{
delete[] buf;
delete[] buf2;
delete[] bufBatch;
delete[] seqLen;
delete[] seqLen2;
delete[] seqOffset;
}
/*
initialization
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TBatchLoader::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false);
LoadParamInt(argc, argv, "bucketsize", &bucketSize, 0);
buf = new int[bufSize];
buf2 = new int[bufSize];
bufBatch = new BatchNode[bufSize];
seqLen = new int[bufSize];
seqLen2 = new int[bufSize];
seqOffset = new int[bufSize];
}
char line[MAX_SEQUENCE_LENGTH];
struct SampleNode
{
int id;
int offset;
int * p;
int size;
int value;
int key;
};
int CompareSampleNode(const void * a, const void * b)
{
return ((SampleNode*)b)->value - ((SampleNode*)a)->value;
}
int CompareSampleNodeV2(const void * a, const void * b)
{
return ((SampleNode*)b)->key - ((SampleNode*)a)->key;
}
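/* Editor's note: both comparators sort in descending order under qsort().
   CompareSampleNode puts the samples with the largest "value" (the longest
   sequences) first, e.g. values {3, 9, 5} come out as {9, 5, 3}, while
   CompareSampleNodeV2 orders by the random "key", which amounts to shuffling
   the range it is applied to. */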
/*
load data into the buffer
>> file - where to load data
>> isSorted - indicates whether the samples should be sorted by length
>> step - the number of sequences we go over when moving to the next sample
*/
int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step)
{
int lineCount = 0;
int seqCount = 0;
int wordCount = 0;
while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){
int len = (int)strlen(line);
while(len > 0 && (line[len - 1] == '\r' || line[len - 1] == '\n')){
line[len - 1] = 0;
len--;
}
len = (int)strlen(line);
if(len == 0)
continue;
/* how many characters are in a word */
int wSize = 0;
/* how many words are in the sentence */
int wNum = 0;
int wNumLocal = 0;
int i = 0;
for(i = 0; i < len; i++){
/* load word (id) separated by space or tab */
if((line[i] == ' ' || line[i] == '\t') && wSize > 0){
line[i] = 0;
if(wSize == 3 && line[i - 1] == '|' && line[i - 2] == '|' && line[i - 3] == '|'){
seqLen[seqCount] = wNumLocal;
seqOffset[seqCount] = wordCount + wNum - wNumLocal;
seqCount++;
wNumLocal = 0;
}
else{
buf[wordCount + wNum++] = atoi(line + i - wSize);
wNumLocal++;
}
wSize = 0;
}
else
wSize++;
}
if(wSize > 0){
buf[wordCount + wNum++] = atoi(line + i - wSize);
wNumLocal++;
}
seqLen[seqCount] = wNumLocal;
seqOffset[seqCount] = wordCount + wNum - wNumLocal;
seqCount++;
wordCount += wNum;
lineCount++;
if(wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
break;
}
nseqBuf = seqCount;
nextSeq = 0;
/* sort the sequences by length */
if (isSorted) {
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
SampleNode * nodes = new SampleNode[seqCount];
int count = 0;
int offset = 0;
for (int i = 0; i < seqCount; i += step) {
SampleNode &node = nodes[count];
node.id = count;
node.offset = i;
node.p = buf + offset;
node.size = 0;
int max = 0;
for (int j = 0; j < step; j++) {
node.size += seqLen[i + j];
max = MAX(max, seqLen[i + j]);
}
node.value = max;
node.key = rand();
count++;
offset += node.size;
}
qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);
/* distribute samples into buckets. In each bucket, sequences have
a similar length */
if (bucketSize > 0) {
int low = 0;
int high = low + bucketSize;
int n = count - 1;
int m = n;
int num = 0;
while (num < count) {
for (m = n; m >= 0; m--) {
if (nodes[m].value > high)
break;
}
qsort(nodes + m + 1, n - m, sizeof(SampleNode), CompareSampleNodeV2);
num += (n - m);
n = m;
low += bucketSize;
high = low + bucketSize;
}
}
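/* Worked example (editor's note): with bucketSize = 10 and sorted
maximum lengths {27, 25, 18, 12, 9, 3}, the first pass collects the
nodes with value <= 10 ({9, 3}) and re-sorts them by the random key;
the next pass collects those with value <= 20 ({18, 12}), and so on.
Each bucket thus holds sequences of similar length while the order
within a bucket is random. */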
count = 0;
offset = 0;
for(int i = 0; i < seqCount; i += step){
SampleNode &node = nodes[count];
memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for(int j = 0; j < step; j++){
seqLen2[i + j] = seqLen[node.offset + j];
seqOffset[i + j] = offset + (j > 0 ? seqLen[node.offset + j - 1] : 0);
}
count += 1;
offset += node.size;
}
int * tmp = buf;
buf = buf2;
buf2 = tmp;
tmp = seqLen;
seqLen = seqLen2;
seqLen2 = tmp;
delete[] nodes;
}
return lineCount;
}
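/* Worked example (editor's note): for the input line "12 7 9 ||| 3 5 2",
   LoadBuf records two sequences:
       buf       = {12, 7, 9, 3, 5, 2}
       seqLen    = {3, 3}
       seqOffset = {0, 3}
   The "|||" token closes the source-side sequence and the end of the line
   closes the target-side one. With step = 2 the two sides stay adjacent,
   so sorting by length moves a sentence pair as a unit. */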
/* clear the data buffer */
void T2TBatchLoader::ClearBuf()
{
nseqBuf = 0;
nextSeq = -1;
}
/*
load a batch of sequences
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (one-hot distributions over the vocabulary)
>> label - labels, i.e. the expected word ids at each position
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> ws - number of source-side words in the batch (for return)
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining)
{
if(isLM){
return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, sBatch, wBatch,
isSorted, wCount, devID, mem, isTraining);
}
else{
return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, vsDec, sBatch, wBatch,
isSorted, ws, wCount, devID, mem, isTraining);
}
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (one-hot distributions over the vocabulary)
>> label - labels, i.e. the expected word ids at each position
>> seqs - keep the sequences in an array
>> vs - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 1);
int seq = MAX(nextSeq, 0);
int wc = 0;
int wn = 0;
int sc = 0;
int max = 0;
while(seq + sc < nseqBuf){
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
CheckNTErrors(len > 0, "Empty sequence!");
wn = len;
wc += wn;
sc += 1;
if(max < wn)
max = wn;
int tc = isBigBatch ? wc : max * sc;
if(sc >= sBatch && tc >= wBatch)
break;
}
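/* Editor's note: "tc" above is the effective cost of the batch so far.
With isBigBatch it is the true token count (wc); otherwise it is the
size of the padded matrix, max * sc. For two sequences of lengths 5
and 9, that is 9 * 2 = 18 padded cells rather than 14 tokens. */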
wCount = 0;
nextSeq = seq + sc;
if(sc <= 0)
return 0;
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor2D(batchEnc, sc, max, X_INT, devID, mem);
InitTensor2D(label, sc, max, X_INT, devID, mem);
InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
batchEnc->SetZeroAll();
label->SetZeroAll();
gold->SetZeroAll();
paddingEnc->SetZeroAll();
paddingDec->SetZeroAll();
int seqSize = 0;
int * batchEncValues = new int[batchEnc->unitNum];
int * labelValues = new int[label->unitNum];
MTYPE * goldOffsets = new MTYPE[gold->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
int wGold = 0;
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(labelValues, 0, sizeof(int) * label->unitNum);
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w];
batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
if (w > 0) {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
labelValues[(int)label->GetOffset2D(s - seq, w - 1)] = buf[seqOffset[s] + w];
}
if (w == len - 1) {
if (isDoubledEnd) {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w];
}
else {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w + 1];
}
}
wCount++;
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if(seqs != NULL){
for(int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
}
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
label->SetData(labelValues, label->unitNum);
gold->SetDataBatched(goldOffsets, 1.0F, wGold);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
/*XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);*/
delete[] batchEncValues;
delete[] labelValues;
delete[] goldOffsets;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
fflush(tf);
return sc;
}
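/* Worked example (editor's note): for a buffered sequence "t0 t1 t2 </s>"
   with isDoubledEnd = false, len = 3, so the batch rows are
       batchEnc : t0 t1 t2
       label    : t1 t2 </s>
   i.e. the model predicts the next word at every position; "gold" encodes
   the same targets as one-hot rows. With isDoubledEnd = true, </s> is kept
   in the input and is also the prediction target at the last position. */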
int CompareBatchNode(const void * a, const void * b)
{
return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (one-hot distributions over the vocabulary)
>> label - labels, i.e. the expected word ids at each position
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> ws - number of source-side words in the batch (for return)
>> wCount - word count
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining)
{
if (nextBatch < 0 || nextBatch >= bufBatchSize) {
LoadBuf(file, isSorted, 2);
int seq = 0;
bufBatchSize = 0;
nextBatch = 0;
/* we segment the buffer into batches */
while (seq < nseqBuf) {
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
while (seq + sc < nseqBuf) {
/* source-side sequence */
wnEnc = seqLen[seq + sc];
/* target-side sequence */
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;
if (sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
break;
wcEnc += wnEnc;
sc += 1;
if (maxEnc < wnEnc)
maxEnc = wnEnc;
wcDec += wnDec;
sc += 1;
if (maxDec < wnDec)
maxDec = wnDec;
}
BatchNode & batch = bufBatch[bufBatchSize];
batch.beg = seq;
batch.end = seq + sc;
batch.maxEnc = maxEnc;
batch.maxDec = maxDec;
batch.key = rand();
bufBatchSize++;
seq = seq + sc;
}
if(isRandomBatch)
qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
}
if(bufBatchSize <= 0)
return 0;
BatchNode & batch = bufBatch[nextBatch++];
int seq = batch.beg;
int sc = batch.end - batch.beg;
int maxEnc = batch.maxEnc;
int maxDec = batch.maxDec;
CheckNTErrors(sc % 2 == 0, "The input samples must be paired");
int sCount = sc/2;
int seqSize = 0;
InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID, mem);
InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
InitTensor2D(batchDec, sCount, maxDec, X_INT, devID, mem);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
InitTensor2D(label, sCount, maxDec, X_INT, devID, mem);
//InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
batchDec->SetZeroAll();
paddingDec->SetZeroAll();
label->SetZeroAll();
//gold->SetZeroAll();
int wCountEnc = 0;
int wCountDec = 0;
int wCountPad = 0;
wCount = 0;
int * batchEncValues = new int[batchEnc->unitNum];
int * batchDecValues = new int[batchDec->unitNum];
int * labelValues = new int[label->unitNum];
//MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
//MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
memset(labelValues, 0, sizeof(int) * label->unitNum);
/* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq)/2;
for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w];
batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
//paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
wCountEnc++;
}
}
ws = wCountEnc;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
//paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
/* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= maxDec, "Something is wrong!");
int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w < len-1){
paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
wCount++;
}
if (w > 0) {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
labelValues[label->GetOffset2D(sent, w - 1)] = buf[seqOffset[s] + w];
}
if (w == len - 1) {
if (isDoubledEnd) {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w]);
labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w];
}
else {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w + 1];
}
}
//wCount++;
wCountDec++;
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if(seqs != NULL){
for(int w = len; w < maxDec; w++)
seqs[seqSize++] = -1;
}
}
batchDec->SetData(batchDecValues, batchDec->unitNum);
label->SetData(labelValues, label->unitNum);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
//XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
//_ConvertDataType(batchDec, tmp2);
//_NotEqual(tmp2, paddingDec, 0);
//DelTensorBuf(tmp2);
//gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncValues;
delete[] batchDecValues;
delete[] labelValues;
//delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
//delete[] goldOffsets;
return sc;
}
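/* Worked example (editor's note): in the buffer, sequences alternate
   between the two sides (offsets seq, seq + 2, ... are source-side and
   seq + 1, seq + 3, ... are target-side), so a batch of sc sequences
   holds sCount = sc / 2 sentence pairs. The encoder batch is filled from
   the even rows; the decoder batch, labels and padding from the odd rows. */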
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void T2TBatchLoader::Shuffle(const char * srcFile, const char * tgtFile)
{
char * line = new char[MAX_LINE_LENGTH];
#ifndef WIN32
sprintf(line, "shuf %s > %s", srcFile, tgtFile);
system(line);
#else
ShowNTErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
delete[] line;
}
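/* A portable alternative (editor's sketch, not used by the trainer):
   shuffle the lines in memory with the C++11 standard library instead of
   shelling out to "shuf". It assumes <algorithm>, <fstream>, <random>,
   <string> and <vector>, and that the corpus fits in memory.

       static void ShuffleInMemory(const char * srcFile, const char * tgtFile)
       {
           std::ifstream in(srcFile);
           std::vector<std::string> lines;
           for (std::string l; std::getline(in, l); )
               lines.push_back(l);
           std::mt19937 rng(std::random_device{}());
           std::shuffle(lines.begin(), lines.end(), rng);
           std::ofstream out(tgtFile);
           for (size_t i = 0; i < lines.size(); i++)
               out << lines[i] << '\n';
       }
*/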
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* It is cold today but I'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 4
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
class T2TBatchLoader
{
public:
/* buffer for loading words */
int * buf;
/* another buffer (scratch space used when sorting) */
int * buf2;
/* batch buffer */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another length array (scratch space used when sorting) */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* destructor */
~T2TBatchLoader();
/* initialization */
void Init(int argc, char ** argv);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
};
}
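/* Usage sketch (editor's illustration, mirroring T2TTrainer::Train in this
   commit): the trainer owns a loader and pulls batches in a loop.

       T2TBatchLoader batchLoader;
       batchLoader.Init(argc, argv);
       while (batchLoader.LoadBatch(file, model->isLM,
                                    &batchEnc, &paddingEnc, &batchDec, &paddingDec,
                                    &gold, &label, NULL, vSize, vSizeTgt,
                                    sBatchSize, wBatchSize, isLenSorted,
                                    ws, wc, devID, mem, true))
       {
           // forward/backward over the batch
       }
*/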
#endif
\ No newline at end of file
@@ -319,17 +319,40 @@ void T2TSearch::Collect(T2TStateBundle * beam)
/*
save the output sequences in a tensor
>> output - output sequences (for return)
*/
void T2TSearch::Dump(XTensor * output)
{
int dims[3] = {batchSize, beamSize, maxLength};
int * words = new int[maxLength];
InitTensor(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1);
/* heap for an input sentence in the batch */
for(int h = 0; h < batchSize; h++){
XHeap<MIN_HEAP, float> &heap = fullHypos[h];
/* for each output in the beam */
for(int i = 0; i < beamSize; i++){
T2TState * state = (T2TState *)heap.Pop().index;
int count = 0;
/* we track the state from the end to the beginning */
while(state != NULL){
words[count++] = state->prediction;
state = state->last;
}
/* dump the sentence to the output tensor */
for(int w = 0; w < count; w++)
output->Set3DInt(words[count - w - 1], h, i, w);
}
}
delete[] words;
}
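/* Editor's note: each T2TState stores its predicted word and a pointer to
   its predecessor, so a finished hypothesis is recovered by walking the
   "last" chain from the final state back to the start and writing the
   collected words in reverse order, as above. */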
/*
......
@@ -88,7 +88,7 @@ public:
void Collect(T2TStateBundle * beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output);
/* check if the token is an end symbol */
bool IsEnd(int token);
......
@@ -25,4 +25,30 @@ using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* destructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::InitModel(int argc, char ** argv)
{
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
}
}
\ No newline at end of file
@@ -32,6 +32,17 @@ namespace transformer
class T2TTester
{
public:
/* constructor */
T2TTester();
/* destructor */
~T2TTester();
/* initialize the model */
void InitModel(int argc, char ** argv);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
};
}
......
@@ -37,32 +37,13 @@ namespace transformer
/* constructor */
T2TTrainer::T2TTrainer()
{
argNum = 0;
argArray = NULL;
}
/* destructor */
T2TTrainer::~T2TTrainer()
{
for(int i = 0; i < moments.count; i++){
XTensor * m = (XTensor*)moments.Get(i);
delete m;
@@ -106,8 +87,6 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
LoadParamBool(argc, argv, "adam", &useAdam, false);
LoadParamFloat(argc, argv, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, argv, "adambeta2", &adamBeta2, 0.98F);
@@ -117,22 +96,13 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamInt(argc, argv, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, argv, "debug", &isDebugged, false);
LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false);
LoadParamInt(argc, argv, "bucketsize", &bucketSize, 0);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
batchLoader.Init(argc, argv);
}
int tc = 0;
@@ -210,9 +180,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, mem, true))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
@@ -370,11 +341,12 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, mem, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
@@ -467,611 +439,6 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
delete[] fn2;
}
/*
get word probabilities for a batch of sequences
>> output - word distribution for each position
......
@@ -23,35 +23,14 @@
#define __T2TTRAINER_H__
#include "T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/* trainer of the T2T model */
class T2TTrainer
{
@@ -61,42 +40,6 @@ public:
/* parameter array */
char ** argArray;
/* dimension size of each inner layer */
int d;
@@ -158,26 +101,15 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* for batching */
T2TBatchLoader batchLoader;
public:
/* constructor */
@@ -197,46 +129,6 @@ public:
/* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
......