Commit 1a687dab by xiaotong

bug fixes

parent 117d5109
......
@@ -116,12 +116,22 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
     XTensor att;
     XTensor dot;
     XTensor scalar;

     /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
     dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);

     if(isMasked)
         dot = dot + mask;

-    scalar = Softmax(Linear(dot, 1.0F/(float)sqrt((float)dk)), -1);
+    dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
+
+    //if(llnum == 1)
+    //    dot.Dump(tf, "dot:");
+
+    scalar = Softmax(dot, -1);
+
+    //if(llnum == 1)
+    //    scalar.Dump(tf, "scalar:");

     //if(ignored > 0)
     //    _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
......
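The fused `scalar = Softmax(Linear(...), -1)` is split into separate `Linear` and `Softmax` statements so the scaled attention logits can be dumped between the two steps while debugging. For reference, here is a minimal single-head sketch of the computation the comment describes, scalar = softmax(Q * K^T / sqrt(dk)) * V, on plain arrays; the flat row-major layout and the `Attention` helper are illustrative assumptions, while the real code works on batched, multi-head XTensor objects:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    /* Minimal single-head sketch (an assumption for illustration, not the
       library API): q, k, v are [seqLen x dk] row-major matrices; mask holds
       0 for visible positions and -1e9 for masked ones. */
    std::vector<float> Attention(const std::vector<float> &q,
                                 const std::vector<float> &k,
                                 const std::vector<float> &v,
                                 const std::vector<float> &mask,
                                 int seqLen, int dk)
    {
        std::vector<float> dot(seqLen * seqLen);
        std::vector<float> out(seqLen * dk, 0.0F);
        float scale = 1.0F / std::sqrt((float)dk);

        for (int i = 0; i < seqLen; i++) {
            /* dot = (Q * K^T + mask) / sqrt(dk); like the diff, the mask is
               added before the Linear scaling */
            for (int j = 0; j < seqLen; j++) {
                float s = 0.0F;
                for (int t = 0; t < dk; t++)
                    s += q[i * dk + t] * k[j * dk + t];
                dot[i * seqLen + j] = (s + mask[i * seqLen + j]) * scale;
            }

            /* softmax over the last dimension (subtract the row max for
               numerical stability) */
            float rowMax = *std::max_element(dot.begin() + i * seqLen,
                                             dot.begin() + (i + 1) * seqLen);
            float sum = 0.0F;
            for (int j = 0; j < seqLen; j++) {
                dot[i * seqLen + j] = std::exp(dot[i * seqLen + j] - rowMax);
                sum += dot[i * seqLen + j];
            }

            /* out = softmax(dot) * V */
            for (int j = 0; j < seqLen; j++)
                for (int t = 0; t < dk; t++)
                    out[i * dk + t] += dot[i * seqLen + j] / sum * v[j * dk + t];
        }
        return out;
    }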
......
@@ -103,6 +103,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
         XTensor fnn;
         XTensor res;

+        llnum = -1;
+
         /* we skip the residual connection for the first layer if
            the encoder is used in language modeling. */
         if(skipInputRes && i == 0){
......
@@ -115,6 +117,11 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
            x = attLayerNorms[i].Make(att);
        }
        else{
+            //if(i == 1)
+            //    x.Dump(tf, "x:");
+            //if(i == 1)
+            //    llnum = 1;

            /* self attention */
            att = attentions[i].Make(x, x, x, mask);
......
@@ -125,6 +132,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
            /* layer normalization */
            x = attLayerNorms[i].Make(res);

+            llnum = -1;
        }

        /* fnn */
......
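The comment in the first hunk explains the one structural choice here: when the encoder is used for language modeling, the first layer feeds the attention output straight to layer normalization instead of adding the input back. A self-contained sketch of that residual choice on plain vectors (the `ResidualOrSkip` helper is a hypothetical name; only `skipInputRes` and the `res = att + x` pattern come from the visible code):

    #include <vector>

    /* Hedged sketch of the residual decision described in the comment:
       in LM mode, layer 0 skips the residual connection entirely. */
    std::vector<float> ResidualOrSkip(const std::vector<float> &att,
                                      const std::vector<float> &x,
                                      bool skipInputRes, int layer)
    {
        if (skipInputRes && layer == 0)
            return att;                 /* no residual for layer 0 in LM mode */

        std::vector<float> res(att.size());
        for (size_t i = 0; i < att.size(); i++)
            res[i] = att[i] + x[i];     /* res = att + x, then layer norm */
        return res;
    }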
......
@@ -27,7 +27,7 @@
 namespace transformer
 {

 /* constructor */
 T2TLN::T2TLN()
 {
......
......
@@ -130,7 +130,7 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
         _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-        _Sum(&mask, padding3, &mask);
+        //_Sum(&mask, padding3, &mask);

         encoding = MakeEncoding(input, mask, true);
         outputLayer.Make(encoding, output);
......
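Context for the `_ScaleAndShiftMe(padding3, 1e9F, -1e9F)` line: it converts a 0/1 padding tensor into an additive attention mask. A real token (padding value 1) maps to 1 * 1e9 - 1e9 = 0, so its logits are untouched; a padded position (value 0) maps to -1e9, which drives its softmax weight to effectively zero. A scalar sketch of the same affine map (`MaskValue` is a hypothetical helper for illustration):

    #include <cstdio>

    /* padding: 1 = real token, 0 = padded position */
    float MaskValue(float padding)
    {
        /* the same x * scale + shift map as _ScaleAndShiftMe(&t, 1e9F, -1e9F) */
        return padding * 1e9F - 1e9F;
    }

    int main()
    {
        printf("real token -> %e\n", MaskValue(1.0F));  /* 0: keep */
        printf("padding    -> %e\n", MaskValue(0.0F));  /* -1e9: mask out */
        return 0;
    }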
......
@@ -90,7 +90,6 @@ void T2TTrainer::Init(int argc, const char ** argv)
 }

-FILE * tf = NULL;
 int tc = 0;

 /*
......
@@ -257,7 +256,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     ClearBuf();

-    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 512, isLenSorted, wc, devID, mem)){
+    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, isLenSorted, wc, devID, mem)){

         CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
......
@@ -503,11 +502,11 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
                 if(w == seqLen[s] - 1)
                     output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
                 wCount++;
-                //fprintf(tf, "%d", buf[seqOffset[s] + w]);
-                //if(w < seqLen[s] - 1)
-                //    fprintf(tf, " ");
-                //else
-                //    fprintf(tf, "\n");
+                /*fprintf(tf, "%d", buf[seqOffset[s] + w]);
+                if(w < seqLen[s] - 1)
+                    fprintf(tf, " ");
+                else
+                    fprintf(tf, "\n");*/
                 if(seqs != NULL)
                     seqs[seqSize++] = buf[seqOffset[s] + w];
             }
......
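The visible branch writes the gold answer as a one-hot distribution: `output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w])` puts probability 1 on one vocabulary id at sequence `s - seq`, position `w`. Only the last-position case survives in this hunk, so the following sketch of building a one-hot gold tensor for language modeling is a hedged reconstruction, not the library's exact logic; `MakeOneHotGold` and its dimensions are hypothetical:

    #include <vector>

    /* Hypothetical helper: build a flat [sCount x maxLen x vSize] one-hot
       gold tensor for LM training. */
    std::vector<float> MakeOneHotGold(const std::vector<std::vector<int>> &seqs,
                                      int maxLen, int vSize)
    {
        int sCount = (int)seqs.size();
        std::vector<float> gold((size_t)sCount * maxLen * vSize, 0.0F);
        for (int s = 0; s < sCount; s++) {
            int len = (int)seqs[s].size();
            for (int w = 0; w < len && w < maxLen; w++) {
                /* label position w with the next word; the visible branch of
                   the diff labels the last position with the word itself */
                int gid = (w == len - 1) ? seqs[s][w] : seqs[s][w + 1];
                gold[((size_t)s * maxLen + w) * vSize + gid] = 1.0F;
            }
        }
        return gold;
    }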
......
@@ -27,6 +27,8 @@ namespace transformer
 {

 FILE * tmpFILE;
+int llnum = 0;
+FILE * tf = NULL;

 void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
 {
......
......
@@ -38,6 +38,9 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
 /* show arguments */
 void ShowParams(int argc, const char ** argv);

+extern int llnum;
+extern FILE * tf;
+
 }

 #endif
......
@@ -1377,9 +1377,10 @@ dump data to a file
 >> file - where to dump the data
 >> label - label of the tensor
 >> n - number of items to dump
+>> beg - the first item id
 >> verbose - verbose level
 */
-void XTensor::Dump(FILE * file, const char * label, const int n, const int verbose)
+void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
     if (verbose > verboseLevel)
         return;
......
@@ -1437,28 +1438,26 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int verbo
     }

     if (!isSparse) {
         if (dataType == DEFAULT_DTYPE) {
-            if (unitNum > 0) {
-                DTYPE f = *(DTYPE*)d;
-                fprintf(file, "%e", f);
-            }
-            int num = unitNum;
-            if (n > 0)
-                num = MIN(num, n);
-            for (int i = 1; i < num; i++) {
-                DTYPE * f = ((DTYPE*)d) + i;
-                fprintf(file, " %e", *f);
+            int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
+            for(int i = beg; i < end; i++){
+                DTYPE f = ((DTYPE*)d)[i];
+                if(i == beg)
+                    fprintf(file, "%e", f);
+                else
+                    fprintf(file, " %e", f);
             }
         }
         else {
-            ShowNTErrors("Cannot dump the tensor to the file in non-float values!");
+            ShowNTErrors("TODO!");
         }
     }
     else {
         int num = this->unitNumNonZero > 0 ? *(int*)d : 0;
-        if (n > 0)
-            num = MIN(num, n);
+        if (beg + n > 0)
+            num = MIN(num, beg + n);
         fprintf(file, "%d ", num);
-        for (int i = 0; i < num; i++) {
+        for (int i = beg; i < num; i++) {
             int key = GetKeyInSparse(i);
             DTYPE value = GetInSparse(i);
             fprintf(file, "[%d]%e ", key, value);
......
@@ -1481,13 +1480,14 @@ dump data to a file
 >> file - where to dump the data
 >> label - label of the tensor
 >> n - number of items to dump
+>> beg - the first item id
 >> verbose - verbose level
 */
-void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int verbose)
+void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
     XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
     _CopyValues(tensor, &a);
-    a.Dump(file, label, n, verbose);
+    a.Dump(file, label, n, beg, verbose);
 }

 /*
......
......
@@ -339,11 +339,11 @@ public:
     bool BinarySearch(int key, DTYPE &value, void * &position) const;

     /* dump data to a file */
-    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);

     /* dump data to a file */
     static
-    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);

     /* read data from a file */
     void Read(FILE * file, const char * label = NULL);
......
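With the new `beg` parameter, callers can dump a window of items instead of always starting at item 0, and every existing call site keeps compiling because `beg` defaults to 0 and sits before `verbose`. A usage sketch; the tensor setup, `InitTensor2D` helper, and `nts` namespace are assumptions about the surrounding library:

    #include <cstdio>
    #include "XTensor.h"   /* the header patched above; path depends on the build */

    using namespace nts;   /* assumed library namespace */

    void DumpExample()
    {
        XTensor t;
        InitTensor2D(&t, 32, 64);        /* assumed init helper */
        t.SetZeroAll();

        t.Dump(stderr, "t:");            /* as before: all items, from item 0 */
        t.Dump(stderr, "t:", 100);       /* first 100 items */
        t.Dump(stderr, "t:", 100, 640);  /* 100 items starting at item 640 */
    }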
......
@@ -482,7 +482,7 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output, int stride, int strideNum,
     if (tid < 32){
         if (tid < blockDim.y / 32)
             threadMax = data[tid];
-        else threadMax = 0;
+        else threadMax = FLOAT_MIN;
         threadMax = shflDownReduceMax(threadMax);
         if (tid == 0 && blockIdx.y < reducedStrideNum)
             output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
......
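The kernel fix is about the identity element of a max-reduction: lanes that carry no data must contribute a value that cannot win, i.e. a value smaller than any real input (the library's FLOAT_MIN), not 0. Padding with 0 silently clamps the result to 0 whenever every real input is negative, exactly the range attention logits live in after masking. A minimal CPU analogue of the bug and the fix (`ReduceMax` is a hypothetical stand-in for the warp-shuffle reduction):

    #include <cfloat>
    #include <cstdio>

    /* Max-reduce n values, with `padded` extra empty slots filled by `pad`,
       mimicking the lanes that take the else branch in the kernel. */
    float ReduceMax(const float * data, int n, int padded, float pad)
    {
        float m = pad;
        for (int i = 0; i < n + padded; i++) {
            float v = (i < n) ? data[i] : pad;
            if (v > m)
                m = v;
        }
        return m;
    }

    int main()
    {
        float x[3] = { -3.0F, -1.0F, -2.0F };
        /* padding with 0 (the bug): prints 0 instead of the true max -1 */
        printf("%f\n", ReduceMax(x, 3, 1, 0.0F));
        /* padding with the identity, the role FLOAT_MIN plays in the kernel */
        printf("%f\n", ReduceMax(x, 3, 1, -FLT_MAX));
        return 0;
    }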