Commit 72c9551b by huchi

refactor XList using template

parent 47c31021
#include "../../source/tensor/data/DataSet.h"
#include <fstream>
#include <iostream>
#include <string>
#include "../tensor/core/arithmetic/MatrixMul.h"
using namespace nts;
/* smoke test: DataSet batching, _MatrixMul, and the templated list classes */
void TestDataManager() {
    DataSet dataSet("src.txt", 2, 100);
    XTensor src, tgt;

    /* field indices within each line of the data file */
    enum FIELD {
        srcField = 0,
        tgtField = 1,
    };

    const int indices[] = { 0, 1 };
    dataSet.LoadBatch(src, indices, sizeof(indices) / sizeof(*indices), srcField);
    dataSet.LoadBatch(tgt, indices, sizeof(indices) / sizeof(*indices), tgtField);

    //
    // tgt.Dump(stderr);
    // src.Dump(stderr);

    //XListV2<int> list(10);
    //int* a = new int[10]{1,2,3,4,5,6,7,8,9};
    //list.Add(a);
    //auto x = list.Get(0);
    //cout << x[0] << endl;
    //list.Remove(0);
    //auto y = list.Get(0);
    //cout << x[0] << endl;
    //delete[] a;

    /* exercises XList construction (list itself is otherwise unused here) */
    XList list(10);
    XTensor a,b,c;
    InitTensor2D(&a, 2, 2);
    InitTensor2D(&b, 2, 2);
    InitTensor2D(&c, 2, 2);

    float arr[] = { 1., 2., 3., 4. };
    a.SetData(arr, 4);
    b.SetData(arr, 4);
    /* NOTE(review): c is never zeroed - assumes _MatrixMul's default beta is 0; confirm */
    //c.SetZeroAll();
    _MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);
    c.Dump(stderr);

    CharList str(10);
    /* fix: `new char(10)` allocates ONE char initialized to 10; the loop
       below writes ten characters, so an array allocation is required */
    char* s = new char[10];
    for (int i = 0; i < 9; ++i) {
        s[i] = i + 'a';
    }
    s[9] = 0;
    str.Add(s);
    cout << str.Get(0);

    /* fix: CharList does not own its items, so release the buffer here */
    delete[] s;
}
/* entry point: runs the data-manager / XList smoke test and exits */
int main()
{
TestDataManager();
return 0;
}
\ No newline at end of file
......@@ -478,7 +478,7 @@ split a string
>> items - splitting result
<< return - how many items are there
*/
int SplitALine(char * inputString, const char * seperator, XList * items)
int SplitALine(char * inputString, const char * seperator, CharList * items)
{
items->Clear();
......@@ -532,7 +532,7 @@ get device ids for the given device information
*/
int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
{
XList * terms = new XList(1);
CharList* terms = new CharList(1);
SplitALine(devInfo, " ", terms);
for(int i = 0; i < terms->count; i++){
......
......@@ -301,8 +301,8 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
return;
XList list(2);
list.Add(t1);
list.Add(t2);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
MakeLink(&list, h, id);
}
......@@ -321,9 +321,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
return;
XList list(3);
list.Add(t1);
list.Add(t2);
list.Add(t3);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
list.Add((XTensor*)t3);
MakeLink(&list, h, id);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,32 +15,31 @@
* limitations under the License.
*/
/*
/*
*
* Implementation of list that keeps data items
* Implementation of template list that keeps data items
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
* The first coding job this year!
* $Created by: HU Chi (huchinlp@foxmail.com)
*
*/
#include "XMem.h"
#include "XGlobal.h"
#ifndef __XLIST_H__
#define __XLIST_H__
#include "XMem.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
typedef int (* ListCompare)(const void * item1, const void * item2);
namespace nts {
/* the XList class */
class XList
{
/* the XListType class */
template <typename T>
class XListType {
public:
typedef int (*ListCompare)(const T* item1, const T* item2);
/* data items */
void ** items;
T** items;
/* number of items */
int count;
......@@ -49,56 +48,350 @@ public:
int maxNum;
/* the memory pool for data array allocation */
XMem * mem;
XMem* mem;
/* indicates whether data items are integers */
bool isIntList;
public:
/* constructor */
XList();
XListType();
/* constructor */
XList(int myMaxNum, bool isIntListOrNot = false);
XListType(int myMaxNum, bool isIntListOrNot = false);
/* constructor */
XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false);
XListType(int myMaxNum, XMem* myMem, bool isIntListOrNot = false);
/* de-constructor */
~XList();
/* utilities */
void Create(int myMaxNum, XMem * myMem);
void Add(const void * item);
void Add(void ** inputItems, int inputItemCount);
void AddList(XList * l);
void AddInt(int i);
void Insert(int pos, void * item);
void * GetItem(int i) const;
int GetItemInt(int i);
void SetItem(int i, void * item);
void SetItemInt(int i, int item);
int FindFirst(void * item);
~XListType();
/* add an item into the list */
void Add(T* item);
/* add a number of items into the list */
void Add(T** inputItems, int inputItemCount);
/* append a list to the current list */
void AddList(XListType* l);
/* insert an item to the given position of the list */
void Insert(int pos, T* item);
/* get the item at position i */
T* GetItem(int i) const;
/* set the item at position i */
void SetItem(int i, T* item);
/* find the position of the first matched item */
int FindFirst(T* item);
/* clear the data array */
void Clear();
void ClearStringList();
/* sort the list */
void Sort(int itemSize, ListCompare comp);
/* reverse the list */
void Reverse();
/* remove the item at position i */
void Remove(int i);
XList * Copy(XMem * myMem);
/* copy the list */
XListType* Copy(XMem* myMem);
/* shuffle the list */
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
_XINLINE_ void * Get(int i) {return GetItem(i);};
_XINLINE_ int GetInt(int i) {return GetItemInt(i);};
_XINLINE_ void Set(int i, void * item) {SetItem(i, item);};
_XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);};
T* Get(int i) { return GetItem(i); };
void Set(int i, T* item) { SetItem(i, item); };
};
extern XList NULLList;
/* constructor: build an empty list with no backing storage */
template <typename T>
XListType<T>::XListType()
{
    items = NULL;
    count = 0;
    maxNum = 0;
    mem = NULL;
    isIntList = false;
}
/*
constructor
>> myMaxNum - maximum number of items to keep
>> isIntListOrNot - specify if the list keeps int items
*/
template <typename T>
XListType<T>::XListType(int myMaxNum, bool isIntListOrNot)
{
    mem = NULL;
    isIntList = isIntListOrNot;
    count = 0;
    maxNum = myMaxNum;
    /* the pointer array comes from the heap since no pool is given */
    items = new T*[myMaxNum];
}
/*
constructor
>> myMaxNum - maximum number of items to keep
>> myMem - the memory pool used for data allocation
>> isIntListOrNot - specify if the list keeps int items
*/
template <typename T>
XListType<T>::XListType(int myMaxNum, XMem* myMem, bool isIntListOrNot)
{
mem = myMem;
maxNum = myMaxNum;
count = 0;
/* NOTE(review): `mem` is dereferenced unconditionally - passing NULL here
   crashes; confirm all callers supply a valid pool */
items = (T**)mem->Alloc(mem->devID, sizeof(T*) * maxNum);
isIntList = isIntListOrNot;
}
/* de-constructor */
template <typename T>
XListType<T>::~XListType()
{
/* an "int list" owns its elements: each one is assumed to have been
   allocated with new int[] (see Clear() which does the same) */
if (isIntList) {
for (int i = 0; i < count; i++) {
int* p = (int*)items[i];
delete[] p;
}
}
/* the pointer array itself is freed only when it came from new[];
   pool-allocated storage (mem != NULL) is released by the pool, not here */
if (mem == NULL)
delete[] items;
}
/*
add an item into the list
>> item - pointer to the item
*/
template <typename T>
void XListType<T>::Add(T* item)
{
    /* grow the pointer array (2n+1) when it is full */
    if (count == maxNum) {
        int grownSize = maxNum * 2 + 1;
        T** grown = (mem == NULL)
                    ? new T*[grownSize]
                    : (T**)mem->Alloc(mem->devID, sizeof(T*) * grownSize);
        memcpy(grown, items, sizeof(T*) * maxNum);
        if (mem == NULL)
            delete[] items;
        items = grown;
        maxNum = grownSize;
    }

    items[count] = item;
    count++;
}
/*
add a number of items into the list
>> inputItems - pointer to the array of items
>> inputItemCount - number of input items
*/
template <typename T>
void XListType<T>::Add(T** inputItems, int inputItemCount)
{
    /* make sure there is room for all incoming items */
    if (count + inputItemCount >= maxNum) {
        int grownSize = (count + inputItemCount) * 2 + 1;
        T** grown = (mem == NULL)
                    ? new T*[grownSize]
                    : (T**)mem->Alloc(mem->devID, sizeof(T*) * grownSize);
        memcpy(grown, items, sizeof(T*) * maxNum);
        if (mem == NULL)
            delete[] items;
        items = grown;
        maxNum = grownSize;
    }

    /* bulk-append the incoming pointers */
    memcpy(items + count, inputItems, sizeof(T*) * inputItemCount);
    count += inputItemCount;
}
/*
append a list to the current list
>> l - the list we use to append
*/
template <typename T>
void XListType<T>::AddList(XListType* l)
{
    /* delegate to the bulk Add overload */
    Add(l->items, l->count);
}
/*
insert an item to the given position of the list
>> pos - the position
>> item - the item for insertion
*/
template <typename T>
void XListType<T>::Insert(int pos, T* item)
{
    /* grow the pointer array (2n+1) when it is full */
    if (count == maxNum) {
        int grownSize = maxNum * 2 + 1;
        T** grown = (mem == NULL)
                    ? new T*[grownSize]
                    : (T**)mem->Alloc(mem->devID, sizeof(T*) * grownSize);
        memcpy(grown, items, sizeof(T*) * maxNum);
        if (mem == NULL)
            delete[] items;
        items = grown;
        maxNum = grownSize;
    }

    /* shift the tail one slot to the right, then drop the item in */
    for (int i = count - 1; i >= pos; i--)
        items[i + 1] = items[i];
    items[pos] = item;
    count++;
}
/* get the item at position i (i == -1 means the last item) */
template <typename T>
T* XListType<T>::GetItem(int i) const
{
    CheckNTErrors(i >= -1 && i < count, "Index of a list item is out of scope!");
    CheckNTErrors(count > 0, "Cannt index the item in an empty list!");
    return (i == -1) ? items[count - 1] : items[i];
}
/* set the item at position i; out-of-range indices are silently ignored */
template <typename T>
void XListType<T>::SetItem(int i, T* item)
{
    if (i < 0 || i >= count)
        return;
    items[i] = item;
}
/*
find the position of the first matched item
>> item - the item for matching (compared by pointer identity, not value)
<< the position where we hit the item, or -1 if it is absent
*/
template <typename T>
int XListType<T>::FindFirst(T* item)
{
    int pos = 0;
    while (pos < count) {
        if (items[pos] == item)
            return pos;
        pos++;
    }
    return -1;
}
/* clear the data array; an "int list" owns its items, so free them first */
template <typename T>
void XListType<T>::Clear()
{
    if (isIntList) {
        for (int i = 0; i < count; i++)
            delete[](int*) items[i];
    }
    count = 0;
}
/*
sort the list
>> itemSize - size of an item (as stored in the array, i.e. a pointer slot)
>> comp - the comparison function used in sorting
*/
template <typename T>
void XListType<T>::Sort(int itemSize, ListCompare comp)
{
    /* fix: qsort expects int(*)(const void*, const void*), and the templated
       ListCompare (int(*)(const T*, const T*)) no longer converts implicitly,
       so the original call failed to compile once this method was instantiated.
       NOTE(review): qsort hands the comparator pointers to the array SLOTS
       (T**), not the items themselves - confirm callers account for this */
    qsort(items, count, itemSize, (int (*)(const void*, const void*))comp);
}
/* reverse the list in place by swapping symmetric pairs toward the middle */
template <typename T>
void XListType<T>::Reverse()
{
    int left = 0;
    int right = count - 1;
    while (left < right) {
        T* tmp = items[left];
        items[left] = items[right];
        items[right] = tmp;
        left++;
        right--;
    }
}
/* remove the item at position i; out-of-range indices are ignored */
template <typename T>
void XListType<T>::Remove(int i)
{
    if (i >= count || i < 0)
        return;

    /* fix: source and destination overlap here, which is undefined behavior
       for memcpy; memmove is specified to handle overlapping regions */
    memmove(items + i, items + i + 1, sizeof(T*) * (count - i - 1));

    count--;
}
/* end of the nts (NiuTrans.Tensor) namespace */
#endif
/*
copy the list (shallow: the new list shares the item pointers)
>> myMem - memory pool used for allocating the data in the new list
<< hard copy of the list structure
*/
template <typename T>
XListType<T>* XListType<T>::Copy(XMem* myMem)
{
    XListType<T>* dup = new XListType<T>(maxNum, myMem);
    for (int i = 0; i < count; i++)
        dup->Add(GetItem(i));
    return dup;
}
/*
shuffle the list
>> nround - number of rounds for shuffling
>> beg - where we start (negative means shuffle the whole list)
>> len - how many items are used in shuffling
*/
template <typename T>
void XListType<T>::Shuffle(int nround, int beg, int len)
{
    if (beg < 0) {
        beg = 0;
        len = count;
    }

    if (beg + len > count)
        return;

    srand((unsigned int)time(NULL));

    for (int k = 0; k < nround; k++) {

        /* Fisher-Yates shuffle */
        for (int i = 0; i < len; i++) {
            float a = (float)rand() / RAND_MAX;
            size_t j = (unsigned int)(a * (i + 1));

            /* fix: when rand() == RAND_MAX, a == 1 and j == i + 1, which
               touches one slot past the shuffled range (and past the array
               when i == len - 1); clamp j into [0, i] */
            if (j > (size_t)i)
                j = (size_t)i;

            T* t = items[beg + j];
            items[beg + j] = items[beg + i];
            items[beg + i] = t;
        }
    }
}
/* forward declaration so the XList alias below can be formed here;
   NOTE(review): if XTensor is defined with the `class` keyword elsewhere,
   MSVC emits warning C4099 for the struct/class mismatch - confirm */
struct XTensor;

/* typedef for list */
typedef XListType<int> IntList;
typedef XListType<char> CharList;
typedef XListType<long> LongList;
typedef XListType<float> FloatList;
typedef XListType<short> ShortList;
typedef XListType<XTensor> XList;
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif // __XLIST_H__
......@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true;
jobDequeuerArgs->Clear();
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
// warning: this may cause unknown error
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs;
......
......@@ -40,15 +40,21 @@ argument7: matrix c (c=a*b*\alpha + c*beta)
*/
void _MatrixMul2DMultiTheading(XList * args)
{
int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1);
int x2 = *(int*)args->GetItem(2);
int y2 = *(int*)args->GetItem(3);
XTensor * a = (XTensor*)args->GetItem(4);
XTensor * b = (XTensor*)args->GetItem(5);
XTensor * c = (XTensor*)args->GetItem(6);
DTYPE alpha = *(DTYPE*)args->GetItem(7);
DTYPE beta = *(DTYPE*)args->GetItem(8);
CheckNTErrors(args->count == 2, "invalid argument number!");
IntList * indexArgs = (IntList*)args->GetItem(0);
XList * matrixArgs = (XList*)args->GetItem(1);
CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
XTensor * a = matrixArgs->GetItem(0);
XTensor * b = matrixArgs->GetItem(1);
XTensor * c = matrixArgs->GetItem(2);
DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
int x1 = *(indexArgs->GetItem(0));
int y1 = *(indexArgs->GetItem(1));
int x2 = *(indexArgs->GetItem(2));
int y2 = *(indexArgs->GetItem(3));
#ifdef FAST_MATRIX
int am = a->dimSize[1];
......
......@@ -139,11 +139,11 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
/* tensor connections */
XList list(5);
list.Add(&input);
list.Add(&mean);
list.Add(&var);
list.Add(&a);
list.Add(&b);
list.Add((XTensor*)&input);
list.Add((XTensor*)&mean);
list.Add((XTensor*)&var);
list.Add((XTensor*)&a);
list.Add((XTensor*)&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
......
......@@ -228,9 +228,9 @@ XTensor CopyIndexed(const XTensor & s, int dim,
_CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);
XList list(3);
list.Add(&s);
list.Add(&srcIndex);
list.Add(&tgtIndex);
list.Add((XTensor*)&s);
list.Add((XTensor*)&srcIndex);
list.Add((XTensor*)&tgtIndex);
/* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
......
......@@ -148,8 +148,8 @@ concatenate two tensors along a given dimension
void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim)
{
XList smalls(2);
smalls.Add(smallA);
smalls.Add(smallB);
smalls.Add((XTensor*)smallA);
smalls.Add((XTensor*)smallB);
_Concatenate(&smalls, big, dim);
}
......@@ -169,8 +169,8 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
XList smalls(2);
smalls.Add(&smallA);
smalls.Add(&smallB);
smalls.Add((XTensor*)&smallA);
smalls.Add((XTensor*)&smallB);
bool uniform = true;
for (int i = 1; i < smalls.count; i++) {
......
......@@ -85,12 +85,12 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
}
}
else {
XList * sourceArrays = new XList(smalls->count);
CharList * sourceArrays = new CharList(smalls->count);
int * blockSizes = new int[smalls->count];
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
sourceArrays->Add(tensor->data);
sourceArrays->Add((char*)tensor->data);
}
_MergeBlockLists(sourceArrays, blockSizes, blockNum, big->data, big->mem);
......
......@@ -376,8 +376,8 @@ XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
big.SetTMPFlag();
XList smalls(2);
smalls.Add(&smallA);
smalls.Add(&smallB);
smalls.Add((XTensor*)&smallA);
smalls.Add((XTensor*)&smallB);
/* call _Merge function */
_Merge(&smalls, &big, whereToMerge);
......
......@@ -34,7 +34,7 @@ merge data by blocks
>> target - target data array
>> myMem - memory pool
*/
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
void _MergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{
if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
......
......@@ -71,7 +71,7 @@ merge data by blocks (cuda version)
>> target - target data array
>> myMem - the memory pool
*/
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
void _CudaMergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{
CheckNTErrors((myMem != NULL), "No memory pool!");
CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
......
......@@ -33,7 +33,7 @@ __global__
void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);
/* merge data by blocks (cuda version) */
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
void _CudaMergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
#endif // USE_CUDA
......
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* merge data by blocks */
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
void _MergeBlockLists(const CharList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -78,7 +78,7 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
#endif
}
else {
XList * sourceArrays = new XList(blockNumB);
CharList * sourceArrays = new CharList(blockNumB);
int * blockSizes = new int[blockNumB];
for (int i = 0; i < blockNumA; i++) {
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <stdarg.h>
#include <math.h>
#include "XMatrixSegment.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel
>> parallelRunner - parallel runner
>> job - the function to run
>> opNum - number of operations
>> rowNum - number of rows
>> colNum - number of columns
>> argNum - number of arguments of the jobs
>> ... - arguments of the jobs (read back as XTensor*)
*/
void RunParallel2D(XPRunner * parallelRunner, void * job,
                   int opNum, int rowNum, int colNum, int argNum, ...)
{
    if (rowNum == 0 || colNum == 0)
        return;

    int jobNum = 1;
    if (parallelRunner != NULL && (parallelRunner->method == PRUNNER_SINGLE || parallelRunner->method == PRUNNER_MULTIPLE)) {
        if (opNum >= parallelRunner->minimumOPNum * parallelRunner->threadNum)
            jobNum = parallelRunner->GetJobNum(rowNum * colNum);
    }
    CheckNTErrors(jobNum != 0, "TODO!");

    /* argument list of the jobs */
    XList * jobArgList = new XList(argNum);

    va_list ap;
    va_start(ap, argNum);
    for (int i = 0; i < argNum; i++) {
        XTensor* p = va_arg(ap, XTensor*);
        jobArgList->Add(p);
    }
    va_end(ap);

    /* prepare the neccesary argument list for parallel processing */
    XList * jobs = new XList(jobNum);
    XList * args = new XList(jobNum);

    int * indexList = new int[jobNum * 4 * 4];

    /* segment the matrix into blocks */
    int nblock = SegmentTensor2D(rowNum, colNum, jobNum, indexList);

    /*
    assign jobs
    argument rules:
    1. block information (four ints: x1, y1, x2, y2)
    2. other arguments
    */
    for (int i = 0; i < jobNum; i++) {
        XList * blockArgs = new XList(argNum + 4);
        int * blockIndex = indexList + i * 4;

        /* fix: the cast binds tighter than +, so "(XTensor*)blockIndex + 1"
           stepped the pointer by sizeof(XTensor) rather than sizeof(int);
           the offset must be applied to the int pointer BEFORE the cast */
        blockArgs->Add((XTensor*)blockIndex);
        blockArgs->Add((XTensor*)(blockIndex + 1));
        blockArgs->Add((XTensor*)(blockIndex + 2));
        blockArgs->Add((XTensor*)(blockIndex + 3));

        for (int j = 0; j < argNum; j++)
            blockArgs->Add(jobArgList->GetItem(j));

        args->Add((XTensor*)blockArgs);
        jobs->Add((XTensor*)job);
    }

    /* NOTE(review): if nblock < jobNum, the argument lists built for the
       surplus slots are never freed below - confirm whether that can happen */
    args->count = nblock;
    jobs->count = nblock;

    /* single job */
    if (jobNum == 1)
        ((TFunction)job)((XList*)args->GetItem(0));
    /* multiple jobs */
    else
        parallelRunner->Run(jobs, args);

    /* free the memory */
    delete[] indexList;
    for (int i = 0; i < args->count; i++) {
        XList * blockArgs = (XList*)args->GetItem(i);
        delete blockArgs;
    }
    delete args;
    delete jobs;
    delete jobArgList;
}
/*
segment a block into sub-blocks
>> rowNum - number of rows
>> colNum - number of columns
>> blockNum - number of sub-blocks
>> blockIndex - upper-left and bottom-right corners of each sub-block
   (four ints per block: x1, y1, x2, y2)
<< return - the number of resulting sub-blocks
*/
int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex)
{
/* start from roughly square blocks of about total/blockNum items each */
int total = rowNum * colNum;
int rowSize = (int)ceil(sqrt((float)total / blockNum));
int colSize = rowSize;
/* a narrow matrix */
if (rowSize > colNum * 0.9) {
rowSize = colNum;
colSize = (int)ceil((float)rowNum / blockNum);
}
/* a narrow matrix */
if (colSize > rowNum * 0.9) {
colSize = rowNum;
rowSize = (int)ceil((float)colNum / blockNum);
}
if (blockNum == 1) {
colSize = rowNum;
rowSize = colNum;
}
CheckNTErrors((colSize <= rowNum && rowSize <= colNum),
"Too large block!");
/* corners of the block being emitted */
int x1, y1, x2, y2;
int xMax = rowNum - 1;
int yMax = colNum - 1;
int nblock = 0, nitem = 0;
int * indexList = blockIndex;
/* how many whole blocks fit along each axis */
int xSegNum = int((float)rowNum / colSize);
int ySegNum = int((float)colNum / rowSize);
int marginBlockNum = blockNum - xSegNum * ySegNum;
/*
To maximize the number of resulting sub-block, we have to
make use of the margin block
*/
if (blockNum > 1 && marginBlockNum > 0) {
int margin = 0;
int step = 0;
if (rowNum < colNum) {
/* strip the margin off the right edge and slice it into rows */
margin = int(((float)marginBlockNum / blockNum) * colNum);
step = (int)ceil((float)rowNum / marginBlockNum);
x1 = 0;
y1 = yMax - margin + 1;
x2 = step - 1;
y2 = yMax;
while (x2 <= xMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (x2 == xMax)
break;
x1 = x2 + 1;
x2 = x1 + step - 1;
if (x2 > xMax)
x2 = xMax;
}
yMax -= margin;
}
else {
/* strip the margin off the bottom edge and slice it into columns */
margin = int(((float)marginBlockNum / blockNum) * rowNum);
step = (int)ceil((float)colNum / marginBlockNum);
x1 = xMax - margin + 1;
y1 = 0;
x2 = xMax;
y2 = step - 1;
while (y2 <= yMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (y2 == yMax)
break;
y1 = y2 + 1;
y2 = y1 + step - 1;
if (y2 > yMax)
y2 = yMax;
}
xMax -= margin;
}
/* re-fit the block size to the remaining (margin-free) area */
colSize = (int)ceil((float)(xMax + 1) / xSegNum);
rowSize = (int)ceil((float)(yMax + 1) / ySegNum);
}
x1 = 0;
y1 = 0; // upper-left corner
x2 = colSize - 1;
y2 = rowSize - 1; // bottom-right corner
/* the main body of the matrix (after removing the margin block) */
while (x1 <= xMax) {
y1 = 0;
x2 = x1 + colSize - 1;
y2 = y1 + rowSize - 1;
if (x2 > xMax) {
x2 = xMax;
}
while (y2 <= yMax) {
int * blockIndex = indexList + nblock * 4;
blockIndex[0] = x1; blockIndex[1] = y1;
blockIndex[2] = x2; blockIndex[3] = y2;
nblock++;
nitem += (y2 - y1 + 1) * (x2 - x1 + 1);
if (y2 == yMax)
break;
y1 = y2 + 1;
y2 = y1 + rowSize - 1;
if (y2 > yMax)
y2 = yMax;
CheckNTErrors((nblock <= blockNum),
"Fail to segment the matrix!");
}
x1 = x2 + 1;
}
/* sanity check: the emitted blocks must tile the matrix exactly */
CheckNTErrors(nitem == rowNum * colNum,
"Fail to segment the matrix!");
return nblock;
}
/*
segment a block into sub-blocks (each block consists of a number of rows)
>> rowNum - number of rows
>> colNum - number of columns
>> blockNum - number of sub-blocks
>> blockIndex - upper-left and bottom-right corners of each sub-block
<< return - the number of resulting sub-blocks
*/
int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex)
{
    /* fewer rows than requested blocks: everything goes into one block */
    if (rowNum < blockNum) {
        blockIndex[0] = 0;
        blockIndex[1] = 0;
        blockIndex[2] = rowNum - 1;
        blockIndex[3] = colNum - 1;
        return 1;
    }

    const int rowsPerBlock = (int)ceil((float)rowNum / blockNum);
    const int lastRow = rowNum - 1;
    int blockCount = 0;

    /* carve off rowsPerBlock rows at a time, always spanning the full width */
    for (int top = 0; top <= lastRow; top += rowsPerBlock) {
        int bottom = top + rowsPerBlock - 1;
        if (bottom > lastRow)
            bottom = lastRow;

        int * corners = blockIndex + 4 * blockCount;
        corners[0] = top;
        corners[1] = 0;
        corners[2] = bottom;
        corners[3] = colNum - 1;
        blockCount++;

        if (bottom == lastRow)
            break;
    }

    return blockCount;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -51,12 +51,12 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
CheckNTErrors(jobNum != 0, "TODO!");
/* argument list of the jobs */
XList * jobArgList = new XList(4);
XList * jobArgList = new XList(argNum);
va_list ap;
va_start(ap, argNum);
for (int i = 0; i < argNum; i++) {
void * p = va_arg(ap, void*);
XTensor* p = va_arg(ap, XTensor*);
jobArgList->Add(p);
}
va_end(ap);
......@@ -77,27 +77,30 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
2. other arguments
*/
for (int i = 0; i < jobNum; i++) {
XList * blockArgs = new XList(argNum + 4);
IntList* indexArgs = new IntList(4);
XList * blockArgs = new XList(argNum);
int * blockIndex = indexList + i * 4;
blockArgs->Add(blockIndex);
blockArgs->Add(blockIndex + 1);
blockArgs->Add(blockIndex + 2);
blockArgs->Add(blockIndex + 3);
indexArgs->Add(blockIndex);
indexArgs->Add(blockIndex + 1);
indexArgs->Add(blockIndex + 2);
indexArgs->Add(blockIndex + 3);
for (int j = 0; j < argNum; j++)
blockArgs->Add(jobArgList->GetItem(j));
args->Add(blockArgs);
jobs->Add((void*)job);
args->Add((XTensor*)indexArgs);
args->Add((XTensor*)blockArgs);
jobs->Add((XTensor*)job);
}
args->count = nblock;
jobs->count = nblock;
/* single job */
if (jobNum == 1)
((TFunction)job)((XList*)args->GetItem(0));
((TFunction)job)(args);
/* multiple jobs */
else
parallelRunner->Run(jobs, args);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-05
*/
#include "DataSet.h"
#include "StringUtil.h"
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
using namespace nts;
const int PAD = 0;
/*
load data from the file to the buffer
the data format:
each line: field1 ||| field2 ||| ... ||| fieldn
i.e. fields are separated by `|||` and tokens are separated by space
NOTE(review): the comment used to promise sorting via `sortBuffer`, but the
sorting step below is commented out, so no sorting happens - confirm intent
*/
void DataSet::LoadDataToBuffer()
{
string line;
buffer.clear();
bufferUsed = 0;
const string tokenDelimiter = " ";
const string fieldsDelimiter = "|||";
/* read at most bufferSize lines; each line contributes fieldNum rows,
   so field f of line i lands at buffer[i * fieldNum + f] */
int counter = bufferSize;
while (getline(*fp, line) && counter) {
auto fields = Split<string>(line, fieldsDelimiter);
for (const auto& field : fields) {
auto elements = Split<int>(field, tokenDelimiter);
buffer.emplace_back(elements);
}
counter--;
}
/* rewind at end of file so the next call starts a new epoch
   (since C++11, seekg clears eofbit before seeking) */
if (fp->eof()) {
fp->seekg(fp->beg);
// LOG("start a new epoch");
}
//if (sortBuffer) {
// auto myCompare = [](const auto& a, const auto& b) {
// return a.first.size() < b.first.size();
// };
// sort(buffer.begin(), buffer.end(), myCompare);
//}
}
/*
select a field and generate a mini-batch by indices
>>> t - a tensor to store the batch
>>> indices - the indices of data
>>> batchSize - the number of indices
>>> field - indicates which field is selected
this will combine selected elements into a padded batch
*/
void DataSet::LoadBatch(XTensor& t, const int* indices, size_t batchSize, size_t field)
{
    /* refill the buffer when it is exhausted or not loaded yet */
    if (bufferUsed == bufferSize || bufferUsed == 0) {
        LoadDataToBuffer();
    }

    /* get the maximum length in a mini-batch */
    size_t maxLength = 0;
    for (size_t i = 0; i < batchSize; ++i) {
        int idx = *(indices + i);
        maxLength = max(maxLength, buffer[idx * fieldNum + field].size());
    }

    int* data = new int[maxLength * batchSize];
    /* fix: memset takes a size in BYTES; the original call omitted
       sizeof(int) and only zeroed a quarter of the array (harmless today
       because the padding loop writes every slot, but wrong on its own) */
    memset(data, 0, maxLength * batchSize * sizeof(int));

    /* copy each selected sequence and pad it with zeros up to maxLength */
    size_t cur = 0;
    for (size_t i = 0; i < batchSize; ++i) {
        size_t next = cur + maxLength;
        int idx = *(indices + i);
        for (int v : buffer[idx * fieldNum + field]) {
            data[cur++] = v;
        }

        /* pad zeros */
        while (cur < next) {
            data[cur++] = 0;
        }
    }

    InitTensor2D(&t, batchSize, maxLength, X_INT);
    t.SetData(data, maxLength * batchSize);
    bufferUsed += batchSize;

    delete[] data;
}
/*
the constructor of DataSet
>>> fname - path of the data file
>>> paraFieldNum - the number of different fields
>>> paraBufferSize - size of each field in the buffer
the real size of buffer is `bufferSize * fieldNum`
*/
DataSet::DataSet(const char* fname, size_t paraFieldNum, size_t paraBufferSize)
{
    fp = new ifstream(fname);
    /* fix: `new` never returns NULL here, so checking `fp` could not detect
       a missing file; test whether the stream actually opened */
    CheckNTErrors(fp->is_open(), "unable to open the file: %s", fname);

    fieldNum = paraFieldNum;
    bufferSize = paraBufferSize;
    bufferUsed = 0;
    buffer.reserve(bufferSize * fieldNum);
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#include <vector>
using namespace std;
using BatchType = vector<vector<int>>;
using BucketType = vector<vector<int>>;
using BufferType = vector<vector<int>>;
namespace nts { // namespace nts(NiuTrans.Tensor)
/* A `DataSet` is associated with a file which contains variable length data.*/
class DataSet {
private:
/* the data buffer */
BufferType buffer;
/* the pointer to file stream */
ifstream* fp;
/* number of different fields */
size_t fieldNum;
/* size of the data buffer */
size_t bufferSize;
/* size of used data in buffer */
size_t bufferUsed;
/* load data from a file to the buffer */
void LoadDataToBuffer();
public:
/* sort data in the buffer */
virtual void Sort(){};
/* modify data in the buffer */
virtual void Process(){};
/* group data to buckets */
virtual BucketType Bucketing() { return BucketType(); };
/* generate a mini-batch */
void LoadBatch(XTensor& t, const int* indices, size_t batchSize, size_t field);
/* constructor */
explicit DataSet(const char* fname, size_t paraFieldNum, size_t paraBufferSize);
};
} // namespace nts(NiuTrans.Tensor)
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-02
*/
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <vector>
#include <cstdio>
#include <fstream>
#include <unordered_map>
using namespace std;
using namespace nts;
/*
 * An `Embedding` is associated with an embeddings table (embedNum * embedDim).
 * The table can be loaded from a file or constructed from an existing tensor
 * or a raw float array.
 */
class Embedding {
public:
    /* the embeddings table (presumably embedNum rows by embedDim columns - confirm) */
    XTensor embeddingTable;
public:
    /* save the embeddings table to a file */
    void Dump(const char* fname);
    /* look up ids in the embeddings table and return their vectors */
    XTensor EmbeddingLookup(const XTensor& ids);
    /* load an embeddings table from a file */
    explicit Embedding(const char* fname);
    /* construct the table from an XTensor instance */
    explicit Embedding(XTensor& table);
    /* construct the table from a raw float array */
    explicit Embedding(const float* p, size_t embedNum, size_t embedDim);
};
#endif // __EMBEDDING_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "StringUtil.h"
namespace nts {
/*
split a string by a delimiter and return the [start, past-the-end) positions
of all non-empty sub-strings (empty fields are skipped)
>> s - the string to split
>> delimiter - the delimiter string; if empty, the whole string is one field
<< return - a list of (start, past-the-end) index pairs
*/
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter)
{
    vector<pair<int, int>> fields;
    if (delimiter.length() == 0) {
        fields.emplace_back(0, int(s.length()));
        return fields;
    }
    /* use string::size_type: the original stored string::npos in an int,
       which relies on implementation-defined narrowing to terminate */
    string::size_type pos = 0;
    string::size_type start = 0;
    while ((pos = s.find(delimiter, start)) != string::npos) {
        if (pos != start) {
            fields.emplace_back(int(start), int(pos));
        }
        start = pos + delimiter.length();
    }
    if (start != s.length()) {
        fields.emplace_back(int(start), int(s.length()));
    }
    return fields;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef __STRING_UTIL_H__
#define __STRING_UTIL_H__
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>
using namespace std;
namespace nts {
/* Splits a string based on the given delimiter string. Each pair in the
* returned vector has the start and past-the-end positions for each of the
* parts of the original string. Empty fields are not represented in the output.
*/
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter);
/* Splits the given string and converts each part to the given T. */
template <typename T>
vector<T> Split(const string& s, const string& delimiter);
/* specialization for string: each field is copied out as a sub-string */
template <>
inline vector<string> Split(const string& s, const string& delimiter)
{
    vector<string> parts;
    const auto ranges = SplitToPos(s, delimiter);
    parts.reserve(ranges.size());
    for (const auto& r : ranges)
        parts.push_back(s.substr(r.first, r.second - r.first));
    return parts;
}
/* specialization for int: each field is parsed with strtol (base 10) */
template <>
inline vector<int> Split(const string& s, const string& delimiter)
{
    vector<int> values;
    const auto ranges = SplitToPos(s, delimiter);
    values.reserve(ranges.size());
    for (const auto& r : ranges)
        values.push_back(int(strtol(s.data() + r.first, nullptr, 10)));
    return values;
}
/* specialization for int64_t: each field is parsed with strtoll (base 10) */
template <>
inline vector<int64_t> Split(const string& s, const string& delimiter)
{
    vector<int64_t> values;
    const auto ranges = SplitToPos(s, delimiter);
    values.reserve(ranges.size());
    for (const auto& r : ranges)
        values.push_back(int64_t(strtoll(s.data() + r.first, nullptr, 10)));
    return values;
}
/* specialization for float: parse each field with strtof; the original used
   strtod and narrowed double to float, which can double-round values */
template <>
inline vector<float> Split(const string& s, const string& delimiter)
{
    vector<float> fields;
    for (const auto& p : SplitToPos(s, delimiter)) {
        fields.emplace_back(strtof(s.data() + p.first, nullptr));
    }
    return fields;
}
/* specialization for uint8_t: each field is parsed with strtol (base 10) */
template <>
inline vector<uint8_t> Split(const string& s, const string& delimiter)
{
    vector<uint8_t> values;
    const auto ranges = SplitToPos(s, delimiter);
    values.reserve(ranges.size());
    for (const auto& r : ranges)
        values.push_back(uint8_t(strtol(s.data() + r.first, nullptr, 10)));
    return values;
}
/* specialization for bool: a field is true iff it parses to a non-zero integer */
template <>
inline vector<bool> Split(const string& s, const string& delimiter)
{
    vector<bool> flags;
    const auto ranges = SplitToPos(s, delimiter);
    flags.reserve(ranges.size());
    for (const auto& r : ranges)
        flags.push_back(strtol(s.data() + r.first, nullptr, 10) != 0);
    return flags;
}
} // namespace nts
#endif // __STRING_UTIL_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "Vocabulary.h"
#include "StringUtil.h"
#include <algorithm>
#include <fstream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
using namespace nts;
/* initialize the mappings with the reserved special tokens */
void nts::Vocabulary::Init()
{
    token2ID["<UNK>"] = 0;
    token2ID["<PAD>"] = 1;
    token2ID["<SOS>"] = 2;
    token2ID["<EOS>"] = 3;
    /* mirror the forward map into the reverse one */
    for (const auto& entry : token2ID)
        id2Token[entry.second] = entry.first;
}
/*
load vocabulary from disk
format (each line): token<\n>
>> vocabPath - path of the vocabulary file
new tokens are numbered after the pre-defined special tokens
NOTE(review): getline() already strips '\n', so line.find("\n") is always
npos and substr copies the whole line; a trailing '\r' from CRLF files is
NOT removed - confirm input files are LF-terminated
*/
nts::Vocabulary::Vocabulary(const char* vocabPath)
{
    Init();
    std::ifstream file(vocabPath);
    string line;
    int index = int(token2ID.size());
    while (getline(file, line)) {
        size_t pos = line.find("\n");
        string token = line.substr(0, pos);
        token2ID[token] = index;
        id2Token[index++] = token;
    }
    currentSize = int(token2ID.size());
}
/*
build vocabulary from the source file
>> srcPath - the path of source file
>> minFreq - the user specified minimum count for a single token
>> maxSize - the user specified maximum size of the vocabulary
tokens are sorted by frequency (most frequent first) and only the top
`maxSize` tokens are kept; everything else maps to <UNK> at lookup time
*/
nts::Vocabulary::Vocabulary(const char* srcPath, int minFreq, int maxSize)
{
    Init();
    using Dict = vector<pair<string, int>>;
    /* count tokens and their frequencies */
    std::ifstream file(srcPath);
    map<string, int> dict;
    string token;
    while (file >> token) {
        dict[token] += 1;
    }
    /* sort tokens by frequency, most frequent first, so the head of the
       list survives truncation; the original sorted ascending, keeping
       the LEAST frequent tokens */
    Dict dictFlatten;
    copy(dict.begin(), dict.end(), back_inserter<Dict>(dictFlatten));
    auto byFreqDesc = [](const auto& a, const auto& b) {
        return a.second > b.second;
    };
    sort(dictFlatten.begin(), dictFlatten.end(), byFreqDesc);
    /* keep at most min(maxSize, #distinct tokens) entries whose frequency
       exceeds minFreq; the original took the max, so the size limit never
       actually truncated the vocabulary */
    size_t index = token2ID.size();
    size_t validSize = size_t(maxSize) < dict.size() ? size_t(maxSize) : dict.size();
    for (const auto& tokenID : dictFlatten) {
        if (validSize && tokenID.second > minFreq &&
            token2ID.find(tokenID.first) == token2ID.end()) {
            id2Token[int(index)] = tokenID.first;
            token2ID[tokenID.first] = int(index);
            index++;
            validSize--;
        }
    }
    currentSize = int(token2ID.size());
}
/* insert a token into the vocabulary (no-op if it is already present) */
void nts::Vocabulary::Insert(string token)
{
    /* fixed: the original tested `!= end()`, which skipped NEW tokens
       and re-numbered already-present ones */
    if (token2ID.find(token) == token2ID.end()) {
        token2ID[token] = currentSize;
        id2Token[currentSize++] = token;
    }
}
/* save the vocabulary to a file */
void nts::Vocabulary::Dump(const char* vocabPath)
{
std::ofstream file(vocabPath);
for (const auto& tokenID : token2ID)
file << tokenID.first << "\t" << tokenID.second << "\n";
}
/*
maps tokens to integers
>> tokens - a list of tokens
<< return - the corresponding indices; out-of-vocabulary tokens map to <UNK>
*/
vector<int> nts::Vocabulary::Token2ID(vector<string> tokens)
{
    vector<int> indices;
    indices.reserve(tokens.size());
    for (const auto& token : tokens) {
        auto it = token2ID.find(token);
        if (it != token2ID.end())
            indices.push_back(it->second);
        else
            indices.push_back(token2ID["<UNK>"]);
    }
    return indices;
}
/*
maps integers to tokens
>> ids - a list of indices
<< return - the corresponding tokens
an error is raised when an id is not found
*/
vector<string> nts::Vocabulary::ID2Token(vector<int> ids)
{
    vector<string> tokens;
    tokens.reserve(ids.size());
    for (const int id : ids) {
        CheckNTErrors(id2Token.find(id) != id2Token.end(), "id not found!");
        tokens.push_back(id2Token[id]);
    }
    return tokens;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __VOCABULARY_H__
#define __VOCABULARY_H__
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
/* print an information message to stderr
   (flushes stdout, presumably to keep stdout/stderr output ordered - confirm;
   flushing stderr would be the usual choice)
   NOTE(review): identical LOG/ERR macros are also defined in DataReader.h and
   T2TDataReader.h without #undef - including two of these headers in one
   translation unit triggers macro redefinition */
#define LOG(...) \
    do { \
        fprintf(stderr, "[INFO] "); \
        fprintf(stderr, __VA_ARGS__); \
        fprintf(stderr, "\n"); \
        fflush(stdout); \
    } while (0)
/* print an error message to stderr (same flushing caveat as LOG) */
#define ERR(...) \
    do { \
        fprintf(stderr, "[ERROR] "); \
        fprintf(stderr, __VA_ARGS__); \
        fprintf(stderr, "\n"); \
        fflush(stdout); \
    } while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
 * A Vocabulary maps strings to integers, allowing for strings to be mapped to
 * an out-of-vocabulary token. Vocabularies are fit to a particular dataset,
 * which we use to decide which tokens are in-vocabulary.
 * For convenience, a vocabulary can be built from a source file or be loaded
 * from a vocabulary file.
 */
class Vocabulary {
private:
    /* current size of the vocabulary (number of distinct tokens) */
    int currentSize = 0;
    /* initialize the mappings with the special tokens <UNK>/<PAD>/<SOS>/<EOS> */
    void Init();
public:
    /* a dict mapping tokens to indices */
    unordered_map<string, int> token2ID;
    /* a dict mapping indices to tokens */
    unordered_map<int, string> id2Token;
    /* save the vocabulary to a file */
    void Dump(const char* vocabPath);
    /* append a token to the vocabulary */
    void Insert(string token);
    /* map integers to tokens; raises an error for unknown ids */
    vector<string> ID2Token(vector<int> ids);
    /* map tokens to integers; unknown tokens map to <UNK> */
    vector<int> Token2ID(vector<string> tokens);
    /* constructor */
    /* load the vocabulary from a vocabulary file */
    explicit Vocabulary(const char* vocabPath);
    /* build the vocabulary from a source corpus */
    explicit Vocabulary(const char* srcPath, int minFreq, int maxSize);
};
} // namespace nts(NiuTrans.Tensor)
#endif // __VOCABULARY_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "DataReader.h"
#include "StringUtil.h"
#include <algorithm>
#include <fstream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
using namespace nts;
/* initialize the mappings with the reserved special tokens */
void nts::Vocabulary::Init()
{
    const char* specials[] = { "<UNK>", "<PAD>", "<SOS>", "<EOS>" };
    for (int id = 0; id < 4; ++id) {
        token2ID[specials[id]] = id;
        id2Token[id] = specials[id];
    }
}
/*
load vocabulary from disk
format (each line): token<\n>
>> vocabPath - path of the vocabulary file
notice that there are some pre-defined special tokens; new tokens are
numbered after them
NOTE(review): getline() already strips '\n', so line.find("\n") is always
npos and substr copies the whole line; a trailing '\r' from CRLF files is
NOT removed - confirm input files are LF-terminated
*/
nts::Vocabulary::Vocabulary(const char* vocabPath)
{
    Init();
    std::ifstream file(vocabPath);
    string line;
    int index = int(token2ID.size());
    while (getline(file, line)) {
        size_t pos = line.find("\n");
        string token = line.substr(0, pos);
        token2ID[token] = index;
        id2Token[index++] = token;
    }
    currentSize = int(token2ID.size());
}
/*
build vocabulary from the source file
>> srcPath - the path of source file
>> minFreq - the user specified minimum count for a single token
>> maxSize - the user specified maximum size of the vocabulary
we sort tokens by frequency (most frequent first); the most uncommon tokens
will be mapped to <UNK> if the amount of different tokens is larger than
maxSize
*/
nts::Vocabulary::Vocabulary(const char* srcPath, int minFreq, int maxSize)
{
    Init();
    /* count tokens and their frequencies */
    std::ifstream file(srcPath);
    map<string, int> dict;
    string token;
    while (file >> token) {
        dict[token] += 1;
    }
    /* sort tokens by frequency, most frequent first, so the head of the list
       survives truncation; the original sorted ascending, keeping the LEAST
       frequent tokens */
    vector<pair<string, int>> dictFlatten;
    copy(dict.begin(), dict.end(), back_inserter<vector<pair<string, int>>>(dictFlatten));
    auto byFreqDesc = [](const auto& a, const auto& b) {
        return a.second > b.second;
    };
    sort(dictFlatten.begin(), dictFlatten.end(), byFreqDesc);
    /* keep at most min(maxSize, #distinct tokens) entries whose frequency
       exceeds minFreq; the original took the max, which never truncated */
    size_t index = token2ID.size();
    size_t validSize = size_t(maxSize) < dict.size() ? size_t(maxSize) : dict.size();
    for (const auto& tokenID : dictFlatten) {
        if (validSize && tokenID.second > minFreq && token2ID.find(tokenID.first) == token2ID.end()) {
            id2Token[int(index)] = tokenID.first;
            token2ID[tokenID.first] = int(index);
            index++;
            validSize--;
        }
    }
    currentSize = int(token2ID.size());
}
/* insert a token into the vocabulary (no-op if it is already present) */
void nts::Vocabulary::Insert(string token)
{
    /* fixed: the original tested `!= end()`, which skipped NEW tokens
       and re-numbered already-present ones */
    if (token2ID.find(token) == token2ID.end()) {
        token2ID[token] = currentSize;
        id2Token[currentSize++] = token;
    }
}
/* save the vocabulary to a file */
void nts::Vocabulary::Dump(const char* vocabPath)
{
std::ofstream file(vocabPath);
for (const auto& tokenID : token2ID)
file << tokenID.first << "\t" << tokenID.second << "\n";
}
/*
maps tokens to integers
>> tokens - a list of tokens
<< return - the corresponding indices; out-of-vocabulary tokens map to <UNK>
*/
vector<int> nts::Vocabulary::Token2ID(vector<string> tokens)
{
    vector<int> indices;
    indices.reserve(tokens.size());
    for (const auto& token : tokens) {
        auto it = token2ID.find(token);
        if (it != token2ID.end())
            indices.push_back(it->second);
        else
            indices.push_back(token2ID["<UNK>"]);
    }
    return indices;
}
/*
maps integers to tokens
>> ids - a list of indices
<< return - the corresponding tokens
an error is raised when an id is not found
*/
vector<string> nts::Vocabulary::ID2Token(vector<int> ids)
{
    vector<string> tokens;
    tokens.reserve(ids.size());
    for (const int id : ids) {
        CheckNTErrors(id2Token.find(id) != id2Token.end(), "id not found!");
        tokens.push_back(id2Token[id]);
    }
    return tokens;
}
/*
shuffle lines of the file by piping it through the external `shuf` command
>> srcFile - the source file to shuffle
this will create a file "<srcFile>.random"; unsupported on Windows
NOTE(review): srcFile is interpolated into a shell command without any
quoting/escaping - a path containing shell metacharacters leads to command
injection; only call this with trusted paths
*/
void nts::Shuffle(const char* srcFile)
{
    string command = "shuf ";
    string tgtFile = srcFile;
    tgtFile += ".random";
    command += srcFile;
    command += " > ";
    command += tgtFile;
#ifndef WIN32
    system(command.c_str());
#else
    ShowNTErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
}
/*
load data to the buffer
data format (a single line): "s1 s2 ... sn|||t1 t2 ... tm"
where s is a source token and t is a target token
raw tokens are converted to indices through the two vocabularies;
at most `bufferSize` sentence pairs are read per call
*/
void nts::DataSet::LoadDataToBuf()
{
    dataBuf.clear();
    string line;
    const string srcTgtDelimiter = "|||";
    const string sentDelimiter = " ";
    int counter = bufferSize;
    /* test the counter BEFORE reading: the original called getline first and
       then checked the counter, silently discarding one line per refill */
    while (counter && getline(*fp, line)) {
        auto subStrings = Split<string>(line, srcTgtDelimiter);
        /* skip malformed lines that do not contain both sides */
        if (subStrings.size() < 2)
            continue;
        auto src = Split<string>(subStrings[0], sentDelimiter);
        auto tgt = Split<string>(subStrings[1], sentDelimiter);
        auto srcIndices = srcVocab->Token2ID(src);
        auto tgtIndices = tgtVocab->Token2ID(tgt);
        dataBuf.emplace_back(srcIndices, tgtIndices);
        counter--;
    }
}
/*
splits the data buffer to buckets
elements of the buffer are grouped together by length and batched
some samples may never be selected due to their unsuitable length
>>> batchSize - the maximum number of tokens per bucket
<<< buckets - a list of token indices in a mini-batch (stored in `buckets`)
NOTE(review): when a sentence does not fit, a fresh bucket is opened but the
sentence itself is NOT appended to it, so one sentence is dropped at every
bucket boundary - confirm whether this is intended
*/
void nts::DataManager::BucketSeqByLen(int batchSize)
{
    dataSet->LoadDataToBuf();
    /* sort sentence pairs by source-side length (ascending) */
    auto myCompare = [](const auto& a, const auto& b) {
        return a.first.size() < b.first.size();
    };
    sort(dataSet->dataBuf.begin(), dataSet->dataBuf.end(), myCompare);
    /* NOTE(review): realBatchSize is never used */
    int realBatchSize = 0;
    for (int i = 0; i < dataSet->dataBuf.size(); ++i) {
        /* the first allocation: emplace a one-element bucket then pop it,
           i.e. start with an empty bucket */
        if (!buckets.size()) {
            buckets.emplace_back(1);
            buckets.back().pop_back();
        }
        /* append the current sentence to the bucket if the padded bucket
           still fits in batchSize and the length is within [min, max];
           note the signed/unsigned comparison with batchSize */
        size_t seqLen = dataSet->dataBuf[i].first.size();
        size_t seqNum = buckets.back().size();
        if ((seqNum + 1) * seqLen <= batchSize &&
            seqLen <= maxSeqLen && seqLen >= minSeqLen) {
            buckets.back().push_back(i);
        }
        /* allocate a new (empty) bucket; the current sentence is dropped */
        else {
            buckets.emplace_back(1);
            buckets.back().pop_back();
        }
    }
    /* drop the last bucket if its (padded) size is too small */
    if (buckets.back().size() * dataSet->dataBuf.back().first.size() < batchSize && buckets.size()>1)
        buckets.pop_back();
}
/*
load a batch of data
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> label - gold target tokens (the decoder input shifted left by one)
>> devID - device id
>> mem - memory pool
NOTE(review): the padding tensors are set to 1 on PAD positions and 0 on
real tokens - confirm downstream code expects this polarity
*/
void nts::DataManager::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
                                 XTensor* batchDec, XTensor* paddingDec,
                                 XTensor* label, int devID, XMem* mem)
{
    /* refill and re-bucket when the buckets are exhausted; test emptiness
       first so that buckets.size() - 1 never wraps around */
    if (buckets.size() == 0 || bucketIter == buckets.size() - 1) {
        BucketSeqByLen(batchSize);
        bucketIter = 0;
    }
    auto bucket = buckets[bucketIter];
    bucketIter++;
    size_t sentenceCount = bucket.size();
    /* the buffer is sorted by length, so the last index in the bucket holds
       the longest source/target and defines the padded row width */
    size_t maxEnc = dataSet->dataBuf[bucket.back()].first.size();
    size_t maxDec = dataSet->dataBuf[bucket.back()].second.size();
    InitTensor2D(batchEnc, int(sentenceCount), int(maxEnc), X_INT, devID, mem);
    InitTensor2D(paddingEnc, int(sentenceCount), int(maxEnc), X_FLOAT, devID, mem);
    InitTensor2D(batchDec, int(sentenceCount), int(maxDec), X_INT, devID, mem);
    InitTensor2D(paddingDec, int(sentenceCount), int(maxDec), X_FLOAT, devID, mem);
    InitTensor2D(label, int(sentenceCount), int(maxDec), X_INT, devID, mem);
    int* labelValues = new int[batchDec->unitNum];
    int* batchEncValues = new int[batchEnc->unitNum];
    int* batchDecValues = new int[batchDec->unitNum];
    float* paddingEncOffsets = new float[batchEnc->unitNum];
    float* paddingDecOffsets = new float[batchDec->unitNum];
    memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
    memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
    memset(paddingEncOffsets, 0, sizeof(float) * batchEnc->unitNum);
    memset(paddingDecOffsets, 0, sizeof(float) * batchDec->unitNum);
    /* fixed: the original cleared sizeof(float) bytes per int element; the
       sizes coincide on common platforms but the intent is sizeof(int) */
    memset(labelValues, 0, sizeof(int) * batchDec->unitNum);
    size_t srcIter = 0;
    size_t tgtIter = 0;
    for (const auto& index : bucket) {
        /* row ends: every row is padded to the bucket's maximum length */
        size_t srcOffset = srcIter + dataSet->dataBuf[bucket.back()].first.size();
        size_t tgtOffset = tgtIter + dataSet->dataBuf[bucket.back()].second.size();
        for (const auto& src : dataSet->dataBuf[index].first)
            batchEncValues[srcIter++] = src;
        /* the label row is the decoder input shifted left by one token */
        for (size_t i = 0; i < dataSet->dataBuf[index].second.size(); ++i) {
            if (i != 0)
                labelValues[tgtIter - 1] = dataSet->dataBuf[index].second[i];
            batchDecValues[tgtIter++] = dataSet->dataBuf[index].second[i];
        }
        while (srcIter < srcOffset)
            paddingEncOffsets[srcIter++] = 1.0F;
        while (tgtIter < tgtOffset)
            paddingDecOffsets[tgtIter++] = 1.0F;
    }
    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    batchDec->SetData(batchDecValues, batchDec->unitNum);
    paddingEnc->SetData(paddingEncOffsets, batchEnc->unitNum);
    paddingDec->SetData(paddingDecOffsets, batchDec->unitNum);
    label->SetData(labelValues, label->unitNum);
    delete[] batchEncValues;
    delete[] batchDecValues;
    delete[] labelValues;
    delete[] paddingEncOffsets;
    delete[] paddingDecOffsets;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef _DATA_READER_H_
#define _DATA_READER_H_
#include "../XGlobal.h"
#include "../XTensor.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
/* print an information message to stderr (flushes stdout - see the caveat
   below)
   NOTE(review): identical LOG/ERR macros are also defined in Vocabulary.h and
   T2TDataReader.h without #undef - including two of these headers in one
   translation unit triggers macro redefinition; also, flushing stderr rather
   than stdout would be the usual choice - confirm the intent */
#define LOG(...) \
    do { \
        fprintf(stderr, "[INFO] "); \
        fprintf(stderr, __VA_ARGS__); \
        fprintf(stderr, "\n"); \
        fflush(stdout); \
    } while (0)
/* print an error message to stderr (same caveats as LOG) */
#define ERR(...) \
    do { \
        fprintf(stderr, "[ERROR] "); \
        fprintf(stderr, __VA_ARGS__); \
        fprintf(stderr, "\n"); \
        fflush(stdout); \
    } while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
 * A Vocabulary maps strings to integers, allowing for strings to be mapped to
 * an out-of-vocabulary token. Vocabularies are fit to a particular dataset,
 * which we use to decide which tokens are in-vocabulary.
 * For convenience, a vocabulary can be built from a source file or be loaded
 * from a vocabulary file.
 * NOTE(review): this declaration duplicates the one in Vocabulary.h - the two
 * headers cannot be included together; confirm which one is canonical.
 */
class Vocabulary {
private:
    /* current size of the vocabulary (number of distinct tokens) */
    int currentSize = 0;
    /* initialize the mappings with the special tokens <UNK>/<PAD>/<SOS>/<EOS> */
    void Init();
public:
    /* a dict mapping tokens to indices */
    unordered_map<string, int> token2ID;
    /* a dict mapping indices to tokens */
    unordered_map<int, string> id2Token;
    Vocabulary() = delete;
    Vocabulary(Vocabulary&&) = delete;
    /* load the vocabulary from a vocabulary file */
    explicit Vocabulary(const char* vocabPath);
    /* build the vocabulary from a source corpus */
    explicit Vocabulary(const char* srcPath, int minFreq, int maxSize);
    /* append a token to the vocabulary */
    void Insert(string token);
    /* save the vocabulary to a file */
    void Dump(const char* vocabPath);
    /* map tokens to integers; unknown tokens map to <UNK> */
    vector<int> Token2ID(vector<string> tokens);
    /* map integers to tokens; raises an error for unknown ids */
    vector<string> ID2Token(vector<int> ids);
};
/*
* A DataSet maintains a sequences buffer.
* An instance is tied to a specified file and buffer size.
*/
class DataSet {
public:
/* number of buffered sequences */
int bufferSize;
/* the pointer to file stream */
ifstream* fp;
/* source-side vocabulary */
Vocabulary* srcVocab;
/* target-side vocabulary */
Vocabulary* tgtVocab;
/* sequences buffer */
vector<pair<vector<int>, vector<int>>> dataBuf;
DataSet(const DataSet&) = delete;
DataSet(DataSet&&) = delete;
/* constructor */
explicit DataSet(const char* fname, const char* srcVocabPath, const char* tgtVocabPath, int paraBufferSize, bool isShuffled)
{
bufferSize = paraBufferSize;
srcVocab = new Vocabulary(srcVocabPath);
tgtVocab = new Vocabulary(tgtVocabPath);
string trainFN = fname;
#ifndef WIN32
if (isShuffled) {
Shuffle(trainFN);
trainFN += ".random";
}
#endif
fp = new ifstream();
fp->open(trainFN);
CheckNTErrors(fp, "unable to open %s", fname);
}
/* destructor */
~DataSet()
{
fp->close();
}
/* loading data to the buffer */
void LoadDataToBuf();
};
/*
 * A DataManager splits the buffer to buckets and generates batches.
 * It must be associated with a dataset.
 * NOTE(review): every member below is private, yet T2TDataManager (declared
 * in T2TDataReader.h) derives from this class and accesses batchSize,
 * maxSeqLen, minSeqLen, bucketIter and dataSet - these presumably need to be
 * protected; confirm against the headers actually compiled together.
 */
class DataManager {
private:
    /* size of a mini-batch (maximum number of tokens) */
    int batchSize;
    /* the maximum length of a sequence */
    int maxSeqLen;
    /* the minimum length of a sequence */
    int minSeqLen;
    /* index of the current bucket */
    int bucketIter = 0;
    /* the associated data set (owned; released in the destructor) */
    DataSet* dataSet;
    /* bucket positions in the buffered sequences */
    vector<vector<int>> buckets;
    DataManager() = delete;
    DataManager(DataManager&&) = delete;
    DataManager(const DataManager&) = delete;
    /* splits sequences to buckets */
    void BucketSeqByLen(int batchSize);
public:
    /* constructor: creates the underlying DataSet and stores the batching
       parameters */
    explicit DataManager(const char* fname,
        const char* srcVocab, const char* tgtVocab,
        int paraBufferSize, bool isShuffled,
        int paraMaxSeqLen, int paraMinSeqLen, int paraBatchSize)
    {
        batchSize = paraBatchSize;
        maxSeqLen = paraMaxSeqLen;
        minSeqLen = paraMinSeqLen;
        dataSet = new DataSet(fname, srcVocab, tgtVocab, paraBufferSize, isShuffled);
    }
    /* destructor: releases the owned data set */
    ~DataManager()
    {
        delete dataSet;
    }
    /* loads a mini-batch */
    void LoadBatch(XTensor* batchEnc,
        XTensor* paddingEnc, XTensor* batchDec,
        XTensor* paddingDec, XTensor* label,
        int devID, XMem* mem);
};
/* shuffle a file and pipe it to an output file */
void Shuffle(const char* srcFile);
} // namespace nts(NiuTrans.Tensor)
#endif // _DATA_READER_H_
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "T2TDataReader.h"
#include "StringUtil.h"
#include <algorithm>
#include <cstdio>
#include <fstream>
/* post-process buffered sequences after loading (currently a stub) */
void nts::T2TDataSet::Process()
{
}
/* split the buffer to buckets of roughly `batchSize` tokens
   NOTE(review): data is loaded through t2tDataSet but bucketed through
   dataSet - confirm the two pointers refer to the same buffer; also, a
   sentence that opens a new bucket is not appended to it (one sentence is
   dropped per boundary), matching DataManager::BucketSeqByLen */
void nts::T2TDataManager::BucketSeqByLen(int batchSize)
{
    t2tDataSet->LoadDataToBuf();
    /* sort sentence pairs by source-side length; fixed: the original
       compared a.size() on pair elements, which does not compile once the
       generic lambda is instantiated by sort */
    auto myCompare = [](const auto& a, const auto& b) {
        return a.first.size() < b.first.size();
    };
    sort(dataSet->dataBuf.begin(), dataSet->dataBuf.end(), myCompare);
    for (int i = 0; i < dataSet->dataBuf.size(); ++i) {
        /* the first allocation: start with an EMPTY bucket; the original
           emplace_back(1) seeded every bucket with a spurious index 0 */
        if (!bucketBoundary.size()) {
            bucketBoundary.emplace_back();
        }
        /* append the current sentence to the bucket if the total size is suitable */
        int seqLen = int(dataSet->dataBuf[i].first.size());
        int seqNum = int(bucketBoundary.back().size());
        if ((seqNum + 1) * seqLen < batchSize && seqLen <= maxSeqLen && seqLen >= minSeqLen) {
            bucketBoundary.back().push_back(i);
        }
        /* allocate a new bucket */
        else {
            bucketBoundary.emplace_back();
        }
    }
    /* drop the last bucket if its size is too small; guard against an empty
       bucket list (empty buffer) before calling back() */
    if (!bucketBoundary.empty() && !dataSet->dataBuf.empty() &&
        bucketBoundary.back().size() * dataSet->dataBuf.back().first.size() < batchSize / 2)
        bucketBoundary.pop_back();
}
/*
load a batch of sequences (for MT)
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> label - gold target tokens
>> devID - device id
>> mem - memory pool
>> isTraining - indicates whether we are training the model
NOTE(review): unlike DataManager::LoadBatch, labelValues is never filled
with shifted target tokens, so `label` is uploaded as all zeros - confirm
whether the shift-by-one fill is missing here
*/
void nts::T2TDataManager::LoadBatchMT(XTensor* batchEnc,
    XTensor* paddingEnc, XTensor* batchDec,
    XTensor* paddingDec, XTensor* label,
    int devID, XMem* mem, bool isTraining)
{
    /* refill and re-bucket when the buckets are exhausted; fixed: the
       original did not handle an empty bucket list, so the very first call
       indexed bucketBoundary[0] out of bounds */
    if (bucketBoundary.size() == 0 || bucketIter == bucketBoundary.size() - 1) {
        BucketSeqByLen(batchSize);
        bucketIter = 0;
    }
    auto bucket = bucketBoundary[bucketIter++];
    int sentenceCount = bucket.size();
    /* the buffer is sorted by length, so the last index in the bucket holds
       the longest source/target and defines the padded row width */
    int maxEnc = dataSet->dataBuf[bucket.back()].first.size();
    int maxDec = dataSet->dataBuf[bucket.back()].second.size();
    InitTensor2D(batchEnc, sentenceCount, maxEnc, X_INT, devID, mem);
    InitTensor2D(paddingEnc, sentenceCount, maxEnc, X_FLOAT, devID, mem);
    InitTensor2D(batchDec, sentenceCount, maxDec, X_INT, devID, mem);
    InitTensor2D(paddingDec, sentenceCount, maxDec, X_FLOAT, devID, mem);
    InitTensor2D(label, sentenceCount, maxDec, X_INT, devID, mem);
    batchEnc->SetZeroAll();
    paddingEnc->SetZeroAll();
    batchDec->SetZeroAll();
    paddingDec->SetZeroAll();
    label->SetZeroAll();
    int* labelValues = new int[batchDec->unitNum];
    int* batchEncValues = new int[batchEnc->unitNum];
    int* batchDecValues = new int[batchDec->unitNum];
    MTYPE* paddingEncOffsets = new MTYPE[batchEnc->unitNum];
    MTYPE* paddingDecOffsets = new MTYPE[batchDec->unitNum];
    memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
    memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
    /* fixed: the original cleared only sizeof(int) bytes per element,
       leaving garbage in the tail when MTYPE is wider than int */
    memset(paddingEncOffsets, 0, sizeof(MTYPE) * batchEnc->unitNum);
    memset(paddingDecOffsets, 0, sizeof(MTYPE) * batchDec->unitNum);
    memset(labelValues, 0, sizeof(int) * batchDec->unitNum);
    int srcIter = 0;
    int tgtIter = 0;
    for (const auto& index : bucket) {
        /* row ends: every row is padded to the bucket's maximum length */
        int srcOffset = srcIter + dataSet->dataBuf[bucket.back()].first.size();
        int tgtOffset = tgtIter + dataSet->dataBuf[bucket.back()].second.size();
        for (const auto& src : dataSet->dataBuf[index].first)
            batchEncValues[srcIter++] = src;
        for (const auto& src : dataSet->dataBuf[index].second)
            batchDecValues[tgtIter++] = src;
        while (srcIter < srcOffset)
            paddingEncOffsets[srcIter++] = 1.0F;
        while (tgtIter < tgtOffset)
            paddingDecOffsets[tgtIter++] = 1.0F;
    }
    /* fixed: the original uploaded batchDecValues into batchEnc */
    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    batchDec->SetData(batchDecValues, batchDec->unitNum);
    paddingEnc->SetData(paddingEncOffsets, batchEnc->unitNum);
    paddingDec->SetData(paddingDecOffsets, batchDec->unitNum);
    label->SetData(labelValues, label->unitNum);
    delete[] batchEncValues;
    delete[] batchDecValues;
    delete[] labelValues;
    delete[] paddingEncOffsets;
    delete[] paddingDecOffsets;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef _T2T_DATA_READER_H_
#define _T2T_DATA_READER_H_
#include "../XTensor.h"
#include "DataReader.h"
/* print an information message to stderr
   NOTE(review): identical LOG/ERR macros are defined in Vocabulary.h and
   DataReader.h (which this header includes) without #undef - this
   redefinition should be removed or guarded */
#define LOG(...) \
    do { \
        fprintf(stderr, "[INFO] "); \
        fprintf(stderr, __VA_ARGS__); \
        fprintf(stderr, "\n"); \
        fflush(stdout); \
    } while (0)
/* print an error message to stderr (same caveats as LOG) */
#define ERR(...) \
    do { \
        fprintf(stderr, "[ERROR] "); \
        fprintf(stderr, __VA_ARGS__); \
        fprintf(stderr, "\n"); \
        fflush(stdout); \
    } while (0)
using namespace std;
namespace nts { // namespace nts(NiuTrans.Tensor)
/* A DataSet specialization for Transformer (T2T) training data.
   NOTE(review): the base-class call DataSet(fname, paraBufferSize, isShuffled)
   does not match the five-argument DataSet constructor declared in
   DataReader.h - confirm which DataSet version this header builds against.
   NOTE(review): DataSet's destructor is non-virtual, so deleting a
   T2TDataSet through a DataSet* skips this destructor and leaks the
   vocabularies. */
class T2TDataSet : public DataSet {
public:
    /* source-side vocabulary (owned; released in the destructor) */
    Vocabulary* srcVocab;
    /* target-side vocabulary (owned; released in the destructor) */
    Vocabulary* tgtVocab;
    /* sequences buffer: (source indices, target indices) pairs */
    vector<pair<vector<int>, vector<int>>> t2tDataBuf;
    /* constructor: forwards file handling to DataSet and loads the two
       vocabularies */
    explicit T2TDataSet(const char* fname, int paraBufferSize,
        const char* srcVocabPath, const char* tgtVocabPath, bool isShuffled)
        : DataSet(fname, paraBufferSize, isShuffled)
    {
        srcVocab = new Vocabulary(srcVocabPath);
        tgtVocab = new Vocabulary(tgtVocabPath);
    }
    /* destructor: releases both vocabularies */
    ~T2TDataSet()
    {
        delete srcVocab;
        delete tgtVocab;
    }
    /* loading sentences to the buffer */
    void LoadDataToBuf() override;
    /* post processing of sequences after loading */
    void Process() override;
};
/* A DataManager specialization for Transformer (T2T) mini-batches.
   NOTE(review): the base-class call forwards (fname, paraBufferSize,
   isShuffled, ...) to DataManager, whose declared constructor in
   DataReader.h takes (fname, srcVocab, tgtVocab, bufferSize, ...) - confirm
   which DataManager version this header builds against. */
class T2TDataManager : public DataManager {
public:
    /* the associated data set (owned; released in the destructor) */
    T2TDataSet* t2tDataSet;
    /* constructor: forwards the batching parameters to DataManager and
       creates the T2T-specific data set */
    explicit T2TDataManager(bool isShuffled, const char* fname,
        int paraBatchSize, int paraMaxSeqLen,
        int paraMinSeqLen, int paraBufferSize,
        const char* srcVocabPath, const char* tgtVocabPath)
        : DataManager(fname, paraBufferSize, isShuffled, paraMaxSeqLen, paraMinSeqLen, paraBatchSize)
    {
        maxSeqLen = paraMaxSeqLen;
        minSeqLen = paraMinSeqLen;
        t2tDataSet = new T2TDataSet(fname, paraBufferSize, srcVocabPath, tgtVocabPath, isShuffled);
    }
    /* destructor: releases the owned data set (the original leaked it) */
    ~T2TDataManager()
    {
        delete t2tDataSet;
    }
    /* loads a mini-batch of MT sequences */
    void LoadBatchMT(XTensor* batchEnc, XTensor* paddingEnc,
        XTensor* batchDec, XTensor* paddingDec,
        XTensor* label, int devID, XMem* mem,
        bool isTraining);
};
} // namespace nts(NiuTrans.Tensor)
#endif // _T2T_DATA_READER_H_
我 你 他 <EOS> ||| <SOS> I You He <EOS>
我 你 <EOS> ||| <SOS> I You <EOS>
\ No newline at end of file
\ No newline at end of file
I
You
He
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论