Commit 72c9551b by huchi

refactor XList using template

parent 47c31021
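The commit replaces the untyped XList with a templated list; the CharList, IntList, and tensor-typed XList names used throughout the diff below are its typed instantiations. The template itself is not shown on this page, so the following is only a rough sketch of the idea; the member layout and growth policy are assumptions, not the actual XListV2 in the commit.

/* a minimal sketch of a typed list, NOT the actual implementation in the commit */
template <typename T>
class XListV2 {
public:
    T * items;      /* the underlying array */
    int count;      /* number of used slots */
    int maxNum;     /* capacity */

    explicit XListV2(int size = 8) : count(0), maxNum(size > 0 ? size : 8) {
        items = new T[maxNum];
    }
    ~XListV2() { delete[] items; }

    void Add(const T & item) {
        if (count == maxNum) {              /* grow on demand */
            maxNum *= 2;
            T * bigger = new T[maxNum];
            for (int i = 0; i < count; i++)
                bigger[i] = items[i];
            delete[] items;
            items = bigger;
        }
        items[count++] = item;
    }
    T Get(int i) const { return items[i]; }
    T GetItem(int i) const { return items[i]; }
    void Remove(int i) {                    /* shift the tail one slot left */
        for (int j = i; j + 1 < count; j++)
            items[j] = items[j + 1];
        count--;
    }
    void Clear() { count = 0; }
};

/* plausible typed instantiations matching the names used in the diff below (assumed) */
class XTensor;                              /* provided by the tensor library */
typedef XListV2<XTensor*> XList;
typedef XListV2<char*>    CharList;
typedef XListV2<int*>     IntList;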
#include "../../source/tensor/data/DataSet.h"
#include <fstream>
#include <iostream>
#include <string>
#include "../tensor/core/arithmetic/MatrixMul.h"
using namespace nts;
void TestDataManager() {
DataSet dataSet("src.txt", 2, 100);
XTensor src, tgt;
enum FIELD {
srcField = 0,
tgtField = 1,
};
const int indices[] = { 0, 1 };
dataSet.LoadBatch(src, indices, sizeof(indices) / sizeof(*indices), srcField);
dataSet.LoadBatch(tgt, indices, sizeof(indices) / sizeof(*indices), tgtField);
//
// tgt.Dump(stderr);
// src.Dump(stderr);
//XListV2<int> list(10);
//int* a = new int[10]{1,2,3,4,5,6,7,8,9};
//list.Add(a);
//auto x = list.Get(0);
//cout << x[0] << endl;
//list.Remove(0);
//auto y = list.Get(0);
//cout << x[0] << endl;
//delete[] a;
XList list(10);
XTensor a,b,c;
InitTensor2D(&a, 2, 2);
InitTensor2D(&b, 2, 2);
InitTensor2D(&c, 2, 2);
float arr[] = { 1., 2., 3., 4. };
a.SetData(arr, 4);
b.SetData(arr, 4);
//c.SetZeroAll();
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);
c.Dump(stderr);
CharList str(10);
char* s = new char(10);
for (int i = 0; i < 9; ++i) {
s[i] = i + 'a';
}
s[9] = 0;
str.Add(s);
cout << str.Get(0);
}
int main()
{
TestDataManager();
return 0;
}
\ No newline at end of file
@@ -478,7 +478,7 @@ split a string
 >> items - splitting result
 << return - how many items are there
 */
-int SplitALine(char * inputString, const char * seperator, XList * items)
+int SplitALine(char * inputString, const char * seperator, CharList * items)
 {
     items->Clear();
@@ -532,7 +532,7 @@ get device ids for the given device information
 */
 int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
 {
-    XList * terms = new XList(1);
+    CharList * terms = new CharList(1);
     SplitALine(devInfo, " ", terms);

     for(int i = 0; i < terms->count; i++){
...
@@ -301,8 +301,8 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
         return;

     XList list(2);
-    list.Add(t1);
-    list.Add(t2);
+    list.Add((XTensor*)t1);
+    list.Add((XTensor*)t2);

     MakeLink(&list, h, id);
 }
@@ -321,9 +321,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
         return;

     XList list(3);
-    list.Add(t1);
-    list.Add(t2);
-    list.Add(t3);
+    list.Add((XTensor*)t1);
+    list.Add((XTensor*)t2);
+    list.Add((XTensor*)t3);

     MakeLink(&list, h, id);
 }
...
@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
     isJobQueue = true;

     jobDequeuerArgs->Clear();
-    jobDequeuerArgs->Add(this);
-    jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
+
+    // warning: this may cause unknown error
+    jobDequeuerArgs->Add((XTensor*)this);
+    jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);

     jobDequeuer.function = (TFunction)DequeueJobs;
     jobDequeuer.argv = jobDequeuerArgs;
...
@@ -40,15 +40,21 @@ argument7: matrix c (c=a*b*\alpha + c*beta)
 */
 void _MatrixMul2DMultiTheading(XList * args)
 {
-    int x1 = *(int*)args->GetItem(0);
-    int y1 = *(int*)args->GetItem(1);
-    int x2 = *(int*)args->GetItem(2);
-    int y2 = *(int*)args->GetItem(3);
-    XTensor * a = (XTensor*)args->GetItem(4);
-    XTensor * b = (XTensor*)args->GetItem(5);
-    XTensor * c = (XTensor*)args->GetItem(6);
-    DTYPE alpha = *(DTYPE*)args->GetItem(7);
-    DTYPE beta = *(DTYPE*)args->GetItem(8);
+    CheckNTErrors(args->count == 2, "invalid argument number!");
+    IntList * indexArgs = (IntList*)args->GetItem(0);
+    XList * matrixArgs = (XList*)args->GetItem(1);
+    CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
+    CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
+
+    XTensor * a = matrixArgs->GetItem(0);
+    XTensor * b = matrixArgs->GetItem(1);
+    XTensor * c = matrixArgs->GetItem(2);
+    DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
+    DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
+    int x1 = *(indexArgs->GetItem(0));
+    int y1 = *(indexArgs->GetItem(1));
+    int x2 = *(indexArgs->GetItem(2));
+    int y2 = *(indexArgs->GetItem(3));

 #ifdef FAST_MATRIX
     int am = a->dimSize[1];
...
@@ -139,11 +139,11 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
     /* tensor connections */
     XList list(5);
-    list.Add(&input);
-    list.Add(&mean);
-    list.Add(&var);
-    list.Add(&a);
-    list.Add(&b);
+    list.Add((XTensor*)&input);
+    list.Add((XTensor*)&mean);
+    list.Add((XTensor*)&var);
+    list.Add((XTensor*)&a);
+    list.Add((XTensor*)&b);
     XLink::MakeLink(&list, &output, MATH_NORMALIZE);
     XLink::AddParamToHeadInt(&output, dim);
     XLink::AddParamToHead(&output, epsilon);
...
@@ -228,9 +228,9 @@ XTensor CopyIndexed(const XTensor & s, int dim,
     _CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);

     XList list(3);
-    list.Add(&s);
-    list.Add(&srcIndex);
-    list.Add(&tgtIndex);
+    list.Add((XTensor*)&s);
+    list.Add((XTensor*)&srcIndex);
+    list.Add((XTensor*)&tgtIndex);

     /* tensor connection */
     XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
...
@@ -148,8 +148,8 @@ concatenate two tensors along a given dimension
 void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim)
 {
     XList smalls(2);
-    smalls.Add(smallA);
-    smalls.Add(smallB);
+    smalls.Add((XTensor*)smallA);
+    smalls.Add((XTensor*)smallB);

     _Concatenate(&smalls, big, dim);
 }
@@ -169,8 +169,8 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
     CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");

     XList smalls(2);
-    smalls.Add(&smallA);
-    smalls.Add(&smallB);
+    smalls.Add((XTensor*)&smallA);
+    smalls.Add((XTensor*)&smallB);

     bool uniform = true;
     for (int i = 1; i < smalls.count; i++) {
...
@@ -85,12 +85,12 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
         }
     }
     else {
-        XList * sourceArrays = new XList(smalls->count);
+        CharList * sourceArrays = new CharList(smalls->count);
         int * blockSizes = new int[smalls->count];
         for (int i = 0; i < smalls->count; i++) {
             XTensor * tensor = (XTensor*)smalls->GetItem(i);
             blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
-            sourceArrays->Add(tensor->data);
+            sourceArrays->Add((char*)tensor->data);
         }

         _MergeBlockLists(sourceArrays, blockSizes, blockNum, big->data, big->mem);
...
@@ -376,8 +376,8 @@ XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
     big.SetTMPFlag();

     XList smalls(2);
-    smalls.Add(&smallA);
-    smalls.Add(&smallB);
+    smalls.Add((XTensor*)&smallA);
+    smalls.Add((XTensor*)&smallB);

     /* call _Merge function */
     _Merge(&smalls, &big, whereToMerge);
...
@@ -34,7 +34,7 @@ merge data by blocks
 >> target - target data array
 >> myMem - memory pool
 */
-void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
+void _MergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
 {
     if (myMem != NULL && myMem->devID >= 0) {
 #ifdef USE_CUDA
...
@@ -71,7 +71,7 @@ merge data by blocks (cuda version)
 >> target - target data array
 >> myMem - the memory pool
 */
-void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
+void _CudaMergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
 {
     CheckNTErrors((myMem != NULL), "No memory pool!");
     CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
...
@@ -33,7 +33,7 @@ __global__
 void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);

 /* merge data by blocks (cuda version) */
-void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
+void _CudaMergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);

 #endif // USE_CUDA
...
@@ -27,7 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* merge data by blocks */
-void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
+void _MergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);

 } // namespace nts(NiuTrans.Tensor)
...
@@ -78,7 +78,7 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
 #endif
     }
     else {
-        XList * sourceArrays = new XList(blockNumB);
+        CharList * sourceArrays = new CharList(blockNumB);
         int * blockSizes = new int[blockNumB];

         for (int i = 0; i < blockNumA; i++) {
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <stdarg.h>
#include <math.h>
#include "XMatrixSegment.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel
>> parallelRunner - parallel runner
>> job - the function to run
>> opNum - number of operations
>> rowNum - number of rows
>> colNum - number of columns
>> argNum - number of arguments of the jobs
>> ... - arguments of the jobs
*/
void RunParallel2D(XPRunner * parallelRunner, void * job,
int opNum, int rowNum, int colNum, int argNum, ...)
{
if (rowNum == 0 || colNum == 0)
return;
int jobNum = 1;
if (parallelRunner != NULL && (parallelRunner->method == PRUNNER_SINGLE || parallelRunner->method == PRUNNER_MULTIPLE)) {
if (opNum >= parallelRunner->minimumOPNum * parallelRunner->threadNum)
jobNum = parallelRunner->GetJobNum(rowNum * colNum);
}
CheckNTErrors(jobNum != 0, "TODO!");
/* argument list of the jobs */
XList * jobArgList = new XList(argNum);
va_list ap;
va_start(ap, argNum);
for (int i = 0; i < argNum; i++) {
XTensor* p = va_arg(ap, XTensor*);
jobArgList->Add(p);
}
va_end(ap);
/* prepare the necessary argument list for parallel processing */
XList * jobs = new XList(jobNum);
XList * args = new XList(jobNum);
int * indexList = new int[jobNum * 4 * 4];
/* segment the matrix into blocks */
int nblock = SegmentTensor2D(rowNum, colNum, jobNum, indexList);
/*
assign jobs
argument rules:
1. block information
2. other arguments
*/
for (int i = 0; i < jobNum; i++) {
XList * blockArgs = new XList(argNum + 4);
int * blockIndex = indexList + i * 4;
blockArgs->Add((XTensor*)blockIndex);
blockArgs->Add((XTensor*)(blockIndex + 1));   /* parenthesize: advance by ints, then cast */
blockArgs->Add((XTensor*)(blockIndex + 2));
blockArgs->Add((XTensor*)(blockIndex + 3));
for (int j = 0; j < argNum; j++)
blockArgs->Add(jobArgList->GetItem(j));
args->Add((XTensor*)blockArgs);
jobs->Add((XTensor*)job);
}
args->count = nblock;
jobs->count = nblock;
/* single job */
if (jobNum == 1)
((TFunction)job)((XList*)args->GetItem(0));
/* multiple jobs */
else
parallelRunner->Run(jobs, args);
/* free the memory */
delete[] indexList;
for (int i = 0; i < args->count; i++) {
XList * blockArgs = (XList*)args->GetItem(i);
delete blockArgs;
}
delete args;
delete jobs;
delete jobArgList;
}
/*
segment a block into sub-blocks
>> rowNum - number of rows
>> colNum - number of columns
>> blockNum - number of sub-blocks
>> blockIndex - upper-left and bottom-right corners of each sub-block
<< return - the number of resulting sub-blocks
*/
int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex)
{
int total = rowNum * colNum;
int rowSize = (int)ceil(sqrt((float)total / blockNum));
int colSize = rowSize;
/* a narrow matrix */
if (rowSize > colNum * 0.9) {
rowSize = colNum;
colSize = (int)ceil((float)rowNum / blockNum);
}
/* a narrow matrix */
if (colSize > rowNum * 0.9) {
colSize = rowNum;
rowSize = (int)ceil((float)colNum / blockNum);
}
if (blockNum == 1) {
colSize = rowNum;
rowSize = colNum;
}
CheckNTErrors((colSize <= rowNum && rowSize <= colNum),
"Too large block!");
int x1, y1, x2, y2;
int xMax = rowNum - 1;
int yMax = colNum - 1;
int nblock = 0, nitem = 0;
int * indexList = blockIndex;
int xSegNum = int((float)rowNum / colSize);
int ySegNum = int((float)colNum / rowSize);
int marginBlockNum = blockNum - xSegNum * ySegNum;
/*
To maximize the number of resulting sub-block, we have to
make use of the margin block
*/
if (blockNum > 1 && marginBlockNum > 0) {
int margin = 0;
int step = 0;
if (rowNum < colNum) {
margin = int(((float)marginBlockNum / blockNum) * colNum);
step = (int)ceil((float)rowNum / marginBlockNum);
x1 = 0;
y1 = yMax - margin + 1;
x2 = step - 1;
y2 = yMax;
while (x2 <= xMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (x2 == xMax)
break;
x1 = x2 + 1;
x2 = x1 + step - 1;
if (x2 > xMax)
x2 = xMax;
}
yMax -= margin;
}
else {
margin = int(((float)marginBlockNum / blockNum) * rowNum);
step = (int)ceil((float)colNum / marginBlockNum);
x1 = xMax - margin + 1;
y1 = 0;
x2 = xMax;
y2 = step - 1;
while (y2 <= yMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (y2 == yMax)
break;
y1 = y2 + 1;
y2 = y1 + step - 1;
if (y2 > yMax)
y2 = yMax;
}
xMax -= margin;
}
colSize = (int)ceil((float)(xMax + 1) / xSegNum);
rowSize = (int)ceil((float)(yMax + 1) / ySegNum);
}
x1 = 0;
y1 = 0; // upper-left corner
x2 = colSize - 1;
y2 = rowSize - 1; // bottom-right corner
/* the main body of the matrix (after removing the margin block) */
while (x1 <= xMax) {
y1 = 0;
x2 = x1 + colSize - 1;
y2 = y1 + rowSize - 1;
if (x2 > xMax) {
x2 = xMax;
}
while (y2 <= yMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (y2 == yMax)
break;
y1 = y2 + 1;
y2 = y1 + rowSize - 1;
if (y2 > yMax)
y2 = yMax;
CheckNTErrors((nblock <= blockNum),
"Fail to segment the matrix!");
}
x1 = x2 + 1;
}
CheckNTErrors(nitem == rowNum * colNum,
"Fail to segment the matrix!");
return nblock;
}
/*
segment a block into sub-blocks (each block consists of a number of rows)
>> rowNum - number of rows
>> colNum - number of columns
>> blockNum - number of sub-blocks
>> blockIndex - upper-left and bottom-right corners of each sub-block
<< return - the number of resulting sub-blocks
*/
int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex)
{
if (rowNum < blockNum) {
blockIndex[0] = 0;
blockIndex[1] = 0;
blockIndex[2] = rowNum - 1;
blockIndex[3] = colNum - 1;
return 1;
}
int segSize = (int)ceil((float)rowNum / blockNum);
int x1 = 0;
int x2 = x1 + segSize - 1;
int y1 = 0;
int y2 = colNum - 1;
int last = rowNum - 1;
int nblock = 0;
while (x1 <= last) {
x2 = x1 + segSize - 1;
if (x2 > last) {
x2 = last;
}
int * blockInfo = blockIndex + 4 * nblock;
blockInfo[0] = x1;
blockInfo[1] = y1;
blockInfo[2] = x2;
blockInfo[3] = y2;
nblock++;
if (x2 == last)
break;
x1 += segSize;
}
return nblock;
}
} // namespace nts(NiuTrans.Tensor)
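For reference, a small standalone sketch of how SegmentTensor2D might be called (illustrative only; it assumes the declaration lives in XMatrixSegment.h, as the include above suggests, and the 8 x 6 matrix size and block count are made up):

#include <cstdio>
#include "XMatrixSegment.h"
using namespace nts;

int main()
{
    const int blockNum = 4;
    int blockIndex[blockNum * 4 * 4];   /* 4 ints per block, oversized like RunParallel2D above */

    /* split an 8 x 6 matrix into at most 4 blocks */
    int nblock = SegmentTensor2D(8, 6, blockNum, blockIndex);

    for (int i = 0; i < nblock; i++) {
        int * b = blockIndex + i * 4;
        printf("block %d: upper-left (%d, %d), bottom-right (%d, %d)\n",
               i, b[0], b[1], b[2], b[3]);
    }
    return 0;
}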
@@ -51,12 +51,12 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
     CheckNTErrors(jobNum != 0, "TODO!");

     /* argument list of the jobs */
-    XList * jobArgList = new XList(4);
+    XList * jobArgList = new XList(argNum);

     va_list ap;
     va_start(ap, argNum);
     for (int i = 0; i < argNum; i++) {
-        void * p = va_arg(ap, void*);
+        XTensor* p = va_arg(ap, XTensor*);
         jobArgList->Add(p);
     }
     va_end(ap);
@@ -77,27 +77,30 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
     2. other arguments
     */
     for (int i = 0; i < jobNum; i++) {
-        XList * blockArgs = new XList(argNum + 4);
+        IntList* indexArgs = new IntList(4);
+        XList * blockArgs = new XList(argNum);
         int * blockIndex = indexList + i * 4;

-        blockArgs->Add(blockIndex);
-        blockArgs->Add(blockIndex + 1);
-        blockArgs->Add(blockIndex + 2);
-        blockArgs->Add(blockIndex + 3);
+        indexArgs->Add(blockIndex);
+        indexArgs->Add(blockIndex + 1);
+        indexArgs->Add(blockIndex + 2);
+        indexArgs->Add(blockIndex + 3);

         for (int j = 0; j < argNum; j++)
             blockArgs->Add(jobArgList->GetItem(j));

-        args->Add(blockArgs);
-        jobs->Add((void*)job);
+        args->Add((XTensor*)indexArgs);
+        args->Add((XTensor*)blockArgs);
+        jobs->Add((XTensor*)job);
     }

-    args->count = nblock;
     jobs->count = nblock;

     /* single job */
     if (jobNum == 1)
-        ((TFunction)job)((XList*)args->GetItem(0));
+        ((TFunction)job)(args);
     /* multiple jobs */
     else
         parallelRunner->Run(jobs, args);
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-05
*/
#include "DataSet.h"
#include "StringUtil.h"
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
using namespace nts;
const int PAD = 0;
/*
load data from the file to the buffer
the data format:
each line: field1 ||| field2 ||| ... ||| fieldn
i.e. fields are separated by `|||` and tokens are separated by space
this will sort the data in ascending order if `sortBuffer` is True
*/
void DataSet::LoadDataToBuffer()
{
string line;
buffer.clear();
bufferUsed = 0;
const string tokenDelimiter = " ";
const string fieldsDelimiter = "|||";
int counter = bufferSize;
while (getline(*fp, line) && counter) {
auto fields = Split<string>(line, fieldsDelimiter);
for (const auto& field : fields) {
auto elements = Split<int>(field, tokenDelimiter);
buffer.emplace_back(elements);
}
counter--;
}
if (fp->eof()) {
fp->seekg(fp->beg);
// LOG("start a new epoch");
}
//if (sortBuffer) {
// auto myCompare = [](const auto& a, const auto& b) {
// return a.first.size() < b.first.size();
// };
// sort(buffer.begin(), buffer.end(), myCompare);
//}
}
/*
select a field and generate a mini-batch by indices
>>> t - a tensor to store the batch
>>> indices - the indices of data
>>> batchSize - the number of indices
>>> field - indicates which field is selected
this will combine selected elements into a padded batch
*/
void DataSet::LoadBatch(XTensor& t, const int* indices, size_t batchSize, size_t field)
{
if (bufferUsed == bufferSize || bufferUsed == 0) {
LoadDataToBuffer();
}
/* get the maximum length in a mini-batch */
size_t maxLength = 0;
for (size_t i = 0; i < batchSize; ++i) {
int idx = *(indices + i);
maxLength = max(maxLength, buffer[idx * fieldNum + field].size());
}
int* data = new int[maxLength * batchSize];
memset(data, 0, maxLength * batchSize * sizeof(int));
size_t cur = 0;
for (size_t i = 0; i < batchSize; ++i) {
size_t next = cur + maxLength;
int idx = *(indices + i);
for (int v : buffer[idx * fieldNum + field]) {
data[cur++] = v;
}
/* pad zeros */
while (cur < next) {
data[cur++] = 0;
}
}
InitTensor2D(&t, batchSize, maxLength, X_INT);
t.SetData(data, maxLength * batchSize);
bufferUsed += batchSize;
delete[] data;
}
/*
the constructor of DataSet
>>> fname - path of the data file
>>> paraFieldNum - the number of different fields
>>> paraBufferSize - size of each field in the buffer
the real size of buffer is `bufferSize * fieldNum`
*/
DataSet::DataSet(const char* fname, size_t paraFieldNum, size_t paraBufferSize)
{
fp = new ifstream(fname);
fieldNum = paraFieldNum;
bufferSize = paraBufferSize;
bufferUsed = 0;
buffer.reserve(bufferSize * fieldNum);
CheckNTErrors(fp, "unable to open the file: %s", fname);
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#include <vector>
using namespace std;
using BatchType = vector<vector<int>>;
using BucketType = vector<vector<int>>;
using BufferType = vector<vector<int>>;
namespace nts { // namespace nts(NiuTrans.Tensor)
/* A `DataSet` is associated with a file which contains variable length data.*/
class DataSet {
private:
/* the data buffer */
BufferType buffer;
/* the pointer to file stream */
ifstream* fp;
/* number of different fields */
size_t fieldNum;
/* size of the data buffer */
size_t bufferSize;
/* size of used data in buffer */
size_t bufferUsed;
/* load data from a file to the buffer */
void LoadDataToBuffer();
public:
/* sort data in the buffer */
virtual void Sort(){};
/* modify data in the buffer */
virtual void Process(){};
/* group data to buckets */
virtual BucketType Bucketing() { return BucketType(); };
/* generate a mini-batch */
void LoadBatch(XTensor& t, const int* indices, size_t batchSize, size_t field);
/* constructor */
explicit DataSet(const char* fname, size_t paraFieldNum, size_t paraBufferSize);
};
} // namespace nts(NiuTrans.Tensor)
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-02
*/
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <vector>
#include <cstdio>
#include <fstream>
#include <unordered_map>
using namespace std;
using namespace nts;
/*
* An `Embedding` is associated with an embeddings table (embedNum * embedDim).
* The table can be loaded from a file or constructed from an instance.
*/
class Embedding {
public:
XTensor embeddingTable;
public:
/* save the embeddings table to a file */
void Dump(const char* fname);
/* looks up ids in a list of embedding tensors */
XTensor EmbeddingLookup(const XTensor& ids);
/* load an embeddings table from a file */
explicit Embedding(const char* fname);
/* construct table from an XTensor instance */
explicit Embedding(XTensor& table);
/* construct the table from an array */
explicit Embedding(const float* p, size_t embedNum, size_t embedDim);
};
#endif // __EMBEDDING_H__
\ No newline at end of file
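A rough usage sketch of this interface (illustrative only; the file name, the id values, the 1-D shape of `ids`, and the use of the library's InitTensor1D helper are assumptions, not taken from the commit):

#include "Embedding.h"
using namespace nts;

int main()
{
    Embedding emb("embedding.txt");              /* load an (embedNum x embedDim) table */

    XTensor ids;
    InitTensor1D(&ids, 3, X_INT);                /* three token ids to look up */
    int idValues[] = { 0, 1, 2 };
    ids.SetData(idValues, 3);

    XTensor vectors = emb.EmbeddingLookup(ids);  /* one embedding row per id */
    vectors.Dump(stderr);
    return 0;
}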
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "StringUtil.h"
namespace nts {
/* split string by delimiter, this will return indices of all sub-strings */
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter)
{
vector<pair<int, int>> fields;
if (delimiter.length() == 0) {
fields.emplace_back(0, s.length());
return fields;
}
int pos = 0;
int start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
fields.emplace_back(start, pos);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
fields.emplace_back(start, s.length());
}
return fields;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef __STRING_UTIL_H__
#define __STRING_UTIL_H__
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>
using namespace std;
namespace nts {
/* Splits a string based on the given delimiter string. Each pair in the
* returned vector has the start and past-the-end positions for each of the
* parts of the original string. Empty fields are not represented in the output.
*/
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter);
/* Splits the given string and converts each part to the given T. */
template <typename T>
vector<T> Split(const string& s, const string& delimiter);
template <>
inline vector<string> Split(const string& s, const string& delimiter)
{
vector<string> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(s.substr(p.first, p.second - p.first));
}
return fields;
}
template <>
inline vector<int> Split(const string& s, const string& delimiter)
{
vector<int> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<int64_t> Split(const string& s, const string& delimiter)
{
vector<int64_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtoll(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<float> Split(const string& s, const string& delimiter)
{
vector<float> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtod(s.data() + p.first, nullptr));
}
return fields;
}
template <>
inline vector<uint8_t> Split(const string& s, const string& delimiter)
{
vector<uint8_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<bool> Split(const string& s, const string& delimiter)
{
vector<bool> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(
static_cast<bool>(strtol(s.data() + p.first, nullptr, 10)));
}
return fields;
}
} // namespace nts
#endif // __STRING_UTIL_H__
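A quick standalone usage sketch of the helpers above (illustrative only; the literal strings are made up):

#include "StringUtil.h"
#include <iostream>
using namespace nts;

int main()
{
    /* fields are separated by "|||", tokens by a single space */
    vector<string> fields = Split<string>("1 2 3 ||| 4 5", "|||");   /* {"1 2 3 ", " 4 5"} */
    vector<int> tokens = Split<int>(fields[0], " ");                 /* {1, 2, 3} */

    for (int v : tokens)
        std::cout << v << " ";
    std::cout << std::endl;
    return 0;
}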
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "Vocabulary.h"
#include "StringUtil.h"
#include <algorithm>
#include <fstream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
using namespace nts;
/* initialize the mappings with some special tokens */
void nts::Vocabulary::Init()
{
const std::map<std::string, int> init{
{ "<UNK>", 0 },
{ "<PAD>", 1 },
{ "<SOS>", 2 },
{ "<EOS>", 3 },
};
for (const auto& v : init) {
token2ID[v.first] = v.second;
id2Token[v.second] = v.first;
}
}
/*
load vocabulary from disk
format(each line): token<\n>
*/
nts::Vocabulary::Vocabulary(const char* vocabPath)
{
Init();
std::ifstream file(vocabPath);
string line;
int index = int(token2ID.size());
while (getline(file, line)) {
size_t pos = line.find("\n");
string token = line.substr(0, pos);
token2ID[token] = index;
id2Token[index++] = token;
}
currentSize = int(token2ID.size());
}
/*
build vocabulary from the source file
>> srcPath - the path of source file
>> minFreq - the user specified minimum count for a single token
>> maxSize - the user specified maxmum size of a vocabulary
this will sort tokens by frequency and map the most uncommon tokens to <UNK>
*/
nts::Vocabulary::Vocabulary(const char* srcPath, int minFreq, int maxSize)
{
Init();
using Dict = vector<pair<string, int>>;
/* count word and their frequency */
std::ifstream file(srcPath);
map<string, int> dict;
string token;
while (file >> token) {
dict[token] += 1;
}
/* sort tokens by frequency */
Dict dictFlatten;
copy(dict.begin(), dict.end(), back_inserter<Dict>(dictFlatten));
auto myCompare = [](const auto& a, const auto& b) {
return a.second > b.second;   /* descending, so the most frequent tokens come first */
};
sort(dictFlatten.begin(), dictFlatten.end(), myCompare);
/* skip tokens whose frequency is outside the top-maxSize or less than minFreq */
size_t index = token2ID.size();
size_t validSize = (size_t)maxSize < dict.size() ? (size_t)maxSize : dict.size();   /* keep at most maxSize tokens */
for (const auto& tokenID : dictFlatten) {
if (validSize && tokenID.second > minFreq &&
token2ID.find(tokenID.first) == token2ID.end()) {
id2Token[int(index)] = tokenID.first;
token2ID[tokenID.first] = int(index);
index++;
validSize--;
}
}
currentSize = int(token2ID.size());
}
/* insert a token to the vocabulary */
void nts::Vocabulary::Insert(string token)
{
if (token2ID.find(token) == token2ID.end()) {   /* only insert tokens that are not present yet */
token2ID[token] = currentSize;
id2Token[currentSize++] = token;
}
}
/* save the vocabulary to a file */
void nts::Vocabulary::Dump(const char* vocabPath)
{
std::ofstream file(vocabPath);
for (const auto& tokenID : token2ID)
file << tokenID.first << "\t" << tokenID.second << "\n";
}
/*
maps tokens to integers
>>> tokens - a list of tokens
<<< indices - a list of corresponding indices
note that this will map OOV tokens to <UNK>
*/
vector<int> nts::Vocabulary::Token2ID(vector<string> tokens)
{
vector<int> indices;
indices.reserve(tokens.size());
for (auto str : tokens) {
if (token2ID.find(str) == token2ID.end())
indices.emplace_back(token2ID["<UNK>"]);
else
indices.emplace_back(token2ID[str]);
}
return indices;
}
/*
maps integers to tokens
>>> indices - a list of indices
<<< tokens - a list of corresponding tokens
note that this will throw an error if the id is not found
*/
vector<string> nts::Vocabulary::ID2Token(vector<int> ids)
{
vector<string> tokens;
tokens.reserve(ids.size());
for (auto id : ids) {
CheckNTErrors(id2Token.find(id) != id2Token.end(), "id not found!");
tokens.emplace_back(id2Token[id]);
}
return tokens;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __VOCABULARY_H__
#define __VOCABULARY_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#define LOG(...) \
do { \
fprintf(stderr, "[INFO] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
#define ERR(...) \
do { \
fprintf(stderr, "[ERROR] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
* A Vocabulary maps strings to integers, allowing for strings to be mapped to an out-of-vocabulary token.
* Vocabularies are fit to a particular dataset, which we use to decide which tokens are in-vocabulary.
* For convenience, a vocabulary can be built from a source file or be loaded from a vocabulary file.
*/
class Vocabulary {
private:
/* current size of the vocabulary */
int currentSize = 0;
/* initialize mappings with some special tokens */
void Init();
public:
/* a dict maps tokens to indices */
unordered_map<string, int> token2ID;
/* a dict maps indices to tokens */
unordered_map<int, string> id2Token;
/* save the vocabulary to a file */
void Dump(const char* vocabPath);
/* append token to the vocabulary */
void Insert(string token);
/* maps integers to tokens */
vector<string> ID2Token(vector<int> ids);
/* maps tokens to integers */
vector<int> Token2ID(vector<string> tokens);
/* constructor */
/* load the vocabulary from a file */
explicit Vocabulary(const char* vocabPath);
/* build the vocabulary from a source file */
explicit Vocabulary(const char* srcPath, int minFreq, int maxSize);
};
} // namespace nts(NiuTrans.Tensor)
#endif // __VOCABULARY_H__
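A short usage sketch of this class (the vocabulary file name and tokens are hypothetical):

#include "Vocabulary.h"
using namespace nts;

int main()
{
    Vocabulary vocab("vocab.txt");                     /* one token per line */

    /* OOV tokens are mapped to <UNK> */
    vector<int> ids = vocab.Token2ID({ "I", "You", "someRareToken" });
    vector<string> tokens = vocab.ID2Token(ids);

    for (const auto& t : tokens)
        LOG("%s", t.c_str());
    return 0;
}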
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef _DATA_READER_H_
#define _DATA_READER_H_
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#define LOG(...) \
do { \
fprintf(stderr, "[INFO] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
#define ERR(...) \
do { \
fprintf(stderr, "[ERROR] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
* A Vocabulary maps strings to integers, allowing for strings to be mapped to an out-of-vocabulary token.
* Vocabularies are fit to a particular dataset, which we use to decide which tokens are in-vocabulary.
* For convenience, a vocabulary can be built from a source file or be loaded from a vocabulary file.
*/
class Vocabulary {
private:
/* current size of the vocabulary */
int currentSize = 0;
/* initialize mappings with some special tokens */
void Init();
public:
/* a dict maps tokens to indices */
unordered_map<string, int> token2ID;
/* a dict maps indices to tokens */
unordered_map<int, string> id2Token;
Vocabulary() = delete;
Vocabulary(Vocabulary&&) = delete;
/* load the vocabulary from a file */
explicit Vocabulary(const char* vocabPath);
/* build the vocabulary from a source file */
explicit Vocabulary(const char* srcPath, int minFreq, int maxSize);
/* append token to the vocabulary */
void Insert(string token);
/* save the vocabulary to a file */
void Dump(const char* vocabPath);
/* maps tokens to integers */
vector<int> Token2ID(vector<string> tokens);
/* maps integers to tokens */
vector<string> ID2Token(vector<int> ids);
};
/*
* A DataSet maintains a sequences buffer.
* An instance is tied to a specified file and buffer size.
*/
class DataSet {
public:
/* number of buffered sequences */
int bufferSize;
/* the pointer to file stream */
ifstream* fp;
/* source-side vocabulary */
Vocabulary* srcVocab;
/* target-side vocabulary */
Vocabulary* tgtVocab;
/* sequences buffer */
vector<pair<vector<int>, vector<int>>> dataBuf;
DataSet(const DataSet&) = delete;
DataSet(DataSet&&) = delete;
/* constructor */
explicit DataSet(const char* fname, const char* srcVocabPath, const char* tgtVocabPath, int paraBufferSize, bool isShuffled)
{
bufferSize = paraBufferSize;
srcVocab = new Vocabulary(srcVocabPath);
tgtVocab = new Vocabulary(tgtVocabPath);
string trainFN = fname;
#ifndef WIN32
if (isShuffled) {
Shuffle(trainFN);
trainFN += ".random";
}
#endif
fp = new ifstream();
fp->open(trainFN);
CheckNTErrors(fp, "unable to open %s", fname);
}
/* destructor */
~DataSet()
{
fp->close();
}
/* loading data to the buffer */
void LoadDataToBuf();
};
/*
* A DataManager splits buffer to buckets and generates batches.
* It must be associated with a dataset.
*/
class DataManager {
private:
/* size of a mini-batch */
int batchSize;
/* the maximum length of a sequence */
int maxSeqLen;
/* the minimum length of a sequence */
int minSeqLen;
/* index of current bucket */
int bucketIter = 0;
/* the associated data set */
DataSet* dataSet;
/* bucket positions in the buffered sequences */
vector<vector<int>> buckets;
DataManager() = delete;
DataManager(DataManager&&) = delete;
DataManager(const DataManager&) = delete;
/* splits sequences to buckets */
void BucketSeqByLen(int batchSize);
public:
/* constructor */
explicit DataManager(const char* fname,
const char* srcVocab, const char* tgtVocab,
int paraBufferSize, bool isShuffled,
int paraMaxSeqLen, int paraMinSeqLen, int paraBatchSize)
{
batchSize = paraBatchSize;
maxSeqLen = paraMaxSeqLen;
minSeqLen = paraMinSeqLen;
dataSet = new DataSet(fname, srcVocab, tgtVocab, paraBufferSize, isShuffled);
}
/* destructor */
~DataManager()
{
delete dataSet;
}
/* loads a mini-batch */
void LoadBatch(XTensor* batchEnc,
XTensor* paddingEnc, XTensor* batchDec,
XTensor* paddingDec, XTensor* label,
int devID, XMem* mem);
};
/* shuffle a file and pipe it to an output file */
void Shuffle(const char* srcFile);
} // namespace nts(NiuTrans.Tensor)
#endif // _DATA_READER_H_
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "T2TDataReader.h"
#include "StringUtil.h"
#include <algorithm>
#include <cstdio>
#include <fstream>
void nts::T2TDataSet::Process()
{
}
/* split buffer to buckets */
void nts::T2TDataManager::BucketSeqByLen(int batchSize)
{
t2tDataSet->LoadDataToBuf();
auto myCompare = [](const auto& a, const auto& b) {
return a.first.size() < b.first.size();   /* sort by source-side length */
};
sort(dataSet->dataBuf.begin(), dataSet->dataBuf.end(), myCompare);
int realBatchSize = 0;
for (int i = 0; i < dataSet->dataBuf.size(); ++i) {
/* the first allocation */
if (!bucketBoundary.size()) {
bucketBoundary.emplace_back(1);
}
/* append the current sentence to bucket if the total size is suitable */
int seqLen = dataSet->dataBuf[i].first.size();
int seqNum = bucketBoundary.back().size();
if ((seqNum + 1) * seqLen < batchSize && seqLen <= maxSeqLen && seqLen >= minSeqLen) {
bucketBoundary.back().push_back(i);
}
/* allocate a new bucket */
else {
bucketBoundary.emplace_back(1);
}
}
/* drop the last bucket if its size is too small */
if (bucketBoundary.back().size() * dataSet->dataBuf.back().first.size() < batchSize / 2)
bucketBoundary.pop_back();
}
/*
load a batch of sequences (for MT)
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
*/
void nts::T2TDataManager::LoadBatchMT(XTensor* batchEnc,
XTensor* paddingEnc, XTensor* batchDec,
XTensor* paddingDec, XTensor* label,
int devID, XMem* mem, bool isTraining)
{
if (bucketBoundary.empty() || bucketIter == int(bucketBoundary.size()) - 1) {
BucketSeqByLen(batchSize);
bucketIter = 0;
}
auto bucket = bucketBoundary[bucketIter++];
int sentenceCount = bucket.size();
int maxEnc = dataSet->dataBuf[bucket.back()].first.size();
int maxDec = dataSet->dataBuf[bucket.back()].second.size();
InitTensor2D(batchEnc, sentenceCount, maxEnc, X_INT, devID, mem);
InitTensor2D(paddingEnc, sentenceCount, maxEnc, X_FLOAT, devID, mem);
InitTensor2D(batchDec, sentenceCount, maxDec, X_INT, devID, mem);
InitTensor2D(paddingDec, sentenceCount, maxDec, X_FLOAT, devID, mem);
InitTensor2D(label, sentenceCount, maxDec, X_INT, devID, mem);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
batchDec->SetZeroAll();
paddingDec->SetZeroAll();
label->SetZeroAll();
int* labelValues = new int[batchDec->unitNum];
int* batchEncValues = new int[batchEnc->unitNum];
int* batchDecValues = new int[batchDec->unitNum];
MTYPE* paddingEncOffsets = new MTYPE[batchEnc->unitNum];
MTYPE* paddingDecOffsets = new MTYPE[batchDec->unitNum];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
memset(paddingEncOffsets, 0, sizeof(MTYPE) * batchEnc->unitNum);
memset(paddingDecOffsets, 0, sizeof(MTYPE) * batchDec->unitNum);
memset(labelValues, 0, sizeof(int) * batchDec->unitNum);
int srcIter = 0;
int tgtIter = 0;
for (const auto& index : bucket) {
int srcOffset = srcIter + dataSet->dataBuf[bucket.back()].first.size();
int tgtOffset = tgtIter + dataSet->dataBuf[bucket.back()].second.size();
for (const auto& src : dataSet->dataBuf[index].first)
batchEncValues[srcIter++] = src;
for (const auto& src : dataSet->dataBuf[index].second)
batchDecValues[tgtIter++] = src;
while (srcIter < srcOffset)
paddingEncOffsets[srcIter++] = 1.0F;
while (tgtIter < tgtOffset)
paddingDecOffsets[tgtIter++] = 1.0F;
}
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingEnc->SetData(paddingEncOffsets, batchEnc->unitNum);
paddingDec->SetData(paddingDecOffsets, batchDec->unitNum);
label->SetData(labelValues, label->unitNum);
delete[] batchEncValues;
delete[] batchDecValues;
delete[] labelValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef _T2T_DATA_READER_H_
#define _T2T_DATA_READER_H_
#include "../XTensor.h"
#include "DataReader.h"
#define LOG(...) \
do { \
fprintf(stderr, "[INFO] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
#define ERR(...) \
do { \
fprintf(stderr, "[ERROR] "); \
fprintf(stderr, __VA_ARGS__); \
fprintf(stderr, "\n"); \
fflush(stdout); \
} while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
class T2TDataSet : public DataSet {
public:
/* source-side vocabulary */
Vocabulary* srcVocab;
/* target-side vocabulary */
Vocabulary* tgtVocab;
/* sequences buffer */
vector<pair<vector<int>, vector<int>>> t2tDataBuf;
/* constructor */
explicit T2TDataSet(const char* fname, int paraBufferSize,
const char* srcVocabPath, const char* tgtVocabPath, bool isShuffled)
: DataSet(fname, paraBufferSize, isShuffled)
{
srcVocab = new Vocabulary(srcVocabPath);
tgtVocab = new Vocabulary(tgtVocabPath);
}
/* destructor */
~T2TDataSet()
{
delete srcVocab;
delete tgtVocab;
}
/* load sentences into the buffer */
void LoadDataToBuf() override;
/* post processing of sequences after loading */
void Process() override;
};
class T2TDataManager : public DataManager {
public:
/* the associated data set */
T2TDataSet* t2tDataSet;
/* constructor */
explicit T2TDataManager(bool isShuffled, const char* fname,
int paraBatchSize, int paraMaxSeqLen,
int paraMinSeqLen, int paraBufferSize,
const char* srcVocabPath, const char* tgtVocabPath)
: DataManager(fname, paraBufferSize, isShuffled, paraMaxSeqLen, paraMinSeqLen, paraBatchSize)
{
maxSeqLen = paraMaxSeqLen;
minSeqLen = paraMinSeqLen;
t2tDataSet = new T2TDataSet(fname, paraBufferSize, srcVocabPath, tgtVocabPath, isShuffled);
}
/* load a mini-batch for machine translation */
void LoadBatchMT(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* label, int devID, XMem* mem,
bool isTraining);
};
} // namespace nts(NiuTrans.Tensor)
#endif // _T2T_DATA_READER_H_
我 你 他 <EOS> ||| <SOS> I You He <EOS>
我 你 <EOS> ||| <SOS> I You <EOS>
\ No newline at end of file
I
You
He
\ No newline at end of file