Commit 1a687dab by xiaotong

bug fixes

parent 117d5109
@@ -116,12 +116,22 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
     XTensor att;
     XTensor dot;
     XTensor scalar;
     /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
     dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
     if(isMasked)
         dot = dot + mask;
-    scalar = Softmax(Linear(dot, 1.0F/(float)sqrt((float)dk)), -1);
+    dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
+    //if(llnum == 1)
+    //    dot.Dump(tf, "dot:");
+    scalar = Softmax(dot, -1);
+    //if(llnum == 1)
+    //    scalar.Dump(tf, "scalar:");
     //if(ignored > 0)
     //    _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
...
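Note on the attention hunk: the commit splits the nested call into separate scaling and softmax steps so the intermediate logits can be dumped while debugging; the result is still scalar = softmax(Q * K^T / sqrt(dk)), taken along the last axis. Below is a minimal standalone sketch of that computation on plain float arrays for a single head with m queries, n keys and key dimension dk (names chosen for the example; this is not the NiuTensor API, and the mask addition and the final multiplication by V are omitted).

#include <cmath>
#include <vector>

// softmax(Q * K^T / sqrt(dk)) over the key axis, for one attention head.
// q: m x dk, k: n x dk, out: m x n attention weights (each row sums to 1).
void ScaledDotAttention(const std::vector<float> &q, const std::vector<float> &k,
                        int m, int n, int dk, std::vector<float> &out)
{
    out.assign(m * n, 0.0f);
    float scale = 1.0f / std::sqrt((float)dk);
    for (int i = 0; i < m; i++) {
        /* logits = q_i * K^T / sqrt(dk) -- the "Linear" step in the diff */
        float maxLogit = -1e30f;
        for (int j = 0; j < n; j++) {
            float dot = 0.0f;
            for (int d = 0; d < dk; d++)
                dot += q[i * dk + d] * k[j * dk + d];
            out[i * n + j] = dot * scale;
            if (out[i * n + j] > maxLogit)
                maxLogit = out[i * n + j];
        }
        /* softmax along the last (key) axis, as Softmax(dot, -1) does */
        float sum = 0.0f;
        for (int j = 0; j < n; j++) {
            out[i * n + j] = std::exp(out[i * n + j] - maxLogit);
            sum += out[i * n + j];
        }
        for (int j = 0; j < n; j++)
            out[i * n + j] /= sum;
    }
}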
@@ -103,6 +103,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
     XTensor fnn;
     XTensor res;
+    llnum = -1;
     /* we skip the residual connection for the first layer if
        the encoder is used in language modeling. */
     if(skipInputRes && i == 0){
@@ -115,6 +117,11 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
         x = attLayerNorms[i].Make(att);
     }
     else{
+        //if(i == 1)
+        //    x.Dump(tf, "x:");
+        //if(i == 1)
+        //    llnum = 1;
         /* self attention */
         att = attentions[i].Make(x, x, x, mask);
@@ -125,6 +132,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
         /* layer normalization */
         x = attLayerNorms[i].Make(res);
+        llnum = -1;
     }
     /* fnn */
...
@@ -27,7 +27,7 @@
 namespace transformer
 {
 /* constructor */
 T2TLN::T2TLN()
 {
...
@@ -130,7 +130,7 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
     _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-    _Sum(&mask, padding3, &mask);
+    //_Sum(&mask, padding3, &mask);
     encoding = MakeEncoding(input, mask, true);
     outputLayer.Make(encoding, output);
...
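Note on the mask hunk: `_ScaleAndShiftMe(padding3, 1e9F, -1e9F)` maps a padding indicator to an additive attention mask, assuming padding3 holds 1 for real tokens and 0 for padded positions: 1*1e9 - 1e9 = 0 (position kept) and 0*1e9 - 1e9 = -1e9 (position effectively zeroed out by the softmax once the mask is added to the logits). The commit then comments out the step that adds this padding term into mask. A minimal sketch of the transformation with hypothetical names, not the repository's code:

#include <vector>

// Turn a 0/1 padding indicator into an additive attention mask:
// kept positions get 0, padded positions get -1e9, so that after
// "logits + mask" the softmax assigns them (almost) zero weight.
std::vector<float> MakeAdditiveMask(const std::vector<float> &padding)
{
    std::vector<float> mask(padding.size());
    for (size_t i = 0; i < padding.size(); i++)
        mask[i] = padding[i] * 1e9f - 1e9f;   /* same scale-and-shift as in the diff */
    return mask;
}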
@@ -90,7 +90,6 @@ void T2TTrainer::Init(int argc, const char ** argv)
 }
-FILE * tf = NULL;
 int tc = 0;
 /*
@@ -257,7 +256,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     ClearBuf();
-    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 512, isLenSorted, wc, devID, mem)){
+    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, isLenSorted, wc, devID, mem)){
         CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
@@ -503,11 +502,11 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
                 if(w == seqLen[s] - 1)
                     output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
                 wCount++;
-                //fprintf(tf, "%d", buf[seqOffset[s] + w]);
-                //if(w < seqLen[s] - 1)
-                //    fprintf(tf, " ");
-                //else
-                //    fprintf(tf, "\n");
+                /*fprintf(tf, "%d", buf[seqOffset[s] + w]);
+                if(w < seqLen[s] - 1)
+                    fprintf(tf, " ");
+                else
+                    fprintf(tf, "\n");*/
                 if(seqs != NULL)
                     seqs[seqSize++] = buf[seqOffset[s] + w];
             }
...
@@ -27,6 +27,8 @@ namespace transformer
 {
 FILE * tmpFILE;
+int llnum = 0;
+FILE * tf = NULL;
 void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
 {
...
@@ -38,6 +38,9 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
 /* show arguments */
 void ShowParams(int argc, const char ** argv);
+extern int llnum;
+extern FILE * tf;
 }
 #endif
@@ -1377,9 +1377,10 @@ dump data to a file
 >> file - where to domp the data
 >> label - label of the tensor
 >> n - number of items to dump
+>> beg - the first item id
 >> verbose - verbose level
 */
-void XTensor::Dump(FILE * file, const char * label, const int n, const int verbose)
+void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
     if (verbose > verboseLevel)
         return;
@@ -1437,28 +1438,26 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int verbo
     }
     if (!isSparse) {
         if (dataType == DEFAULT_DTYPE) {
-            if (unitNum > 0) {
-                DTYPE f = *(DTYPE*)d;
-                fprintf(file, "%e", f);
-            }
-            int num = unitNum;
-            if (n > 0)
-                num = MIN(num, n);
-            for (int i = 1; i < num; i++) {
-                DTYPE * f = ((DTYPE*)d) + i;
-                fprintf(file, " %e", *f);
+            int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
+            for(int i = beg; i < end; i++){
+                DTYPE f = ((DTYPE*)d)[i];
+                if(i == beg)
+                    fprintf(file, "%e", f);
+                else
+                    fprintf(file, " %e", f);
             }
         }
         else {
-            ShowNTErrors("Cannot dump the tensor to the file in non-float values!");
+            ShowNTErrors("TODO!");
         }
     }
     else {
         int num = this->unitNumNonZero > 0 ? *(int*)d : 0;
-        if (n > 0)
-            num = MIN(num, n);
+        if (beg + n > 0)
+            num = MIN(num, beg + n);
         fprintf(file, "%d ", num);
-        for (int i = 0; i < num; i++) {
+        for (int i = beg; i < num; i++) {
             int key = GetKeyInSparse(i);
             DTYPE value = GetInSparse(i);
             fprintf(file, "[%d]%e ", key, value);
...
@@ -1481,13 +1480,14 @@ dump data to a file
 >> file - where to domp the data
 >> label - label of the tensor
 >> n - number of items to dump
+>> beg - the first item id
 >> verbose - verbose level
 */
-void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int verbose)
+void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
     XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
     _CopyValues(tensor, &a);
-    a.Dump(file, label, n, verbose);
+    a.Dump(file, label, n, beg, verbose);
 }
 /*
...
@@ -339,11 +339,11 @@ public:
     bool BinarySearch(int key, DTYPE &value, void * &position) const;
     /* dump data to a file */
-    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
     /* dump data to a file */
     static
-    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
     /* read data from a file */
     void Read(FILE * file, const char * label = NULL);
...
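Note on the Dump change: going by the new signature and the rewritten loop (end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum), with iteration starting at beg), n is the number of items to print and beg is the index of the first item; a non-positive n means "up to the end". A usage sketch with a made-up dense tensor t:

// Dump the first 10 values of t (beg defaults to 0)
t.Dump(stderr, "t", 10);

// Dump 10 values starting at item 100
t.Dump(stderr, "t", 10, 100);

// Dump everything from item 100 to the end (n <= 0 means no limit)
t.Dump(stderr, "t", -1, 100);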
@@ -482,7 +482,7 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
     if (tid < 32){
         if (tid < blockDim.y / 32)
             threadMax = data[tid];
-        else threadMax = 0;
+        else threadMax = FLOAT_MIN;
         threadMax = shflDownReduceMax(threadMax);
         if (tid == 0 && blockIdx.y < reducedStrideNum)
             output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
...
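Note on the kernel fix: threads that hold no partial result are now seeded with FLOAT_MIN (assumed here to be the library's most-negative float constant) instead of 0. The identity element of a max reduction must be the smallest representable value; seeding idle lanes with 0 silently corrupts the result whenever every real value is negative. A minimal host-side sketch of the same principle, not the kernel itself:

#include <algorithm>
#include <cfloat>

// Max-reduce n floats. The accumulator starts at -FLT_MAX (the identity
// of max); starting at 0 would wrongly return 0 for all-negative input.
float ReduceMax(const float * x, int n)
{
    float m = -FLT_MAX;
    for (int i = 0; i < n; i++)
        m = std::max(m, x[i]);
    return m;
}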