Commit 77663a3c by Tianzhi

Delete amax, asum, etc.; also delete the global variable useBLAS.

parent e42e0bf7
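In short: the runtime if (useBLAS) ... else ... branches become compile-time #if defined(USE_BLAS) guards, so the unused path is never compiled at all. A minimal sketch of the pattern follows; ScaleInPlace is a placeholder name, not a NiuTrans function, and SCAL is the wrapper macro defined later in this diff (cblas_sscal in single precision).

// Sketch only: compile-time BLAS dispatch replacing the old runtime useBLAS check.
void ScaleInPlace(float * x, int n, float s)
{
#if defined(USE_BLAS)
    SCAL(n, s, x, 1);               // BLAS build: scale x in place via the SCAL macro
#else
    for (int i = 0; i < n; i++)     // native fallback: plain loop
        x[i] *= s;
#endif
}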
[Changes to a tokenized data file (lines of integer token IDs separated by |||); content omitted.]
......@@ -14,7 +14,7 @@ CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
CUDA_INCLUDE = $(CUDA_ROOT)/include
# use MKL
USE_MKL = 0
USE_MKL = 1
INTEL_ROOT = /opt/intel
MKL_ROOT = /opt/intel/mkl
MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
......
......@@ -121,12 +121,18 @@ int FNNLMMain(int argc, const char ** argv)
/* load arguments */
LoadArgs(argc, argv, model);
printf("After load argu\n");
/* check the setting */
Check(model);
printf("After check setting\n");
/* initialize model parameters */
Init(model);
printf("After init model\n");
/* learn model parameters */
if(strcmp(trainFN, ""))
Train(trainFN, shuffled, model);
......@@ -414,7 +420,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* make a model to keep gradients */
FNNModel grad;
printf("before copy\n");
Copy(grad, model);
printf("after copy\n");
/* XNet for automatic differentiation */
XNet autoDiffer;
......@@ -455,6 +463,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
printf("after make the gold tensor\n");
if(!autoDiff){
/* prepare an empty network for building the fnn */
......@@ -473,19 +482,23 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
Update(model, grad, learningRate, false);
}
else{
printf("in autodiff\n");
/* gradient = 0 */
Clear(model, true);
printf("after clear\n");
/* forward + backward process */
/* this is implemented by the gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
printf("after gather-based forward\n");
/* this is implemented by the multiply function */
//ForwardAutoDiff(inputs, output, model);
/* automatic differentiation */
autoDiffer.Backward(output, gold, CROSSENTROPY);
printf("after autodiff\n");
/* update model parameters */
Update(model, grad, learningRate, true);
......@@ -992,6 +1005,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
int size = batch * (n-1);
int * index = new int[size];
printf("in FAutoDiff, before bianli\n");
for(int i = 0; i < batch; i++){
for (int j = 0; j < n-1; j++){
int a = i * (n - 1) + j;
......@@ -999,9 +1013,11 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
}
}
printf("in FAutoDiff, before init tnesor 1d\n");
InitTensor1D(&words, size, X_INT, model.devID, model.mem);
words.SetData(index, size);
printf("in FAutoDiff, before gather\n");
embeddingBig = Gather(model.embeddingW, words);
delete[] index;
......@@ -1010,13 +1026,16 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
printf("in FAutoDiff, before reshape\n");
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
printf("in FAutoDiff, before hidden layers\n");
/* hidden layers */
for(int i = 0; i < depth; i++)
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
printf("in FAutoDiff, before output layer\n");
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
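A worked example of the index and reshape arithmetic above, with assumed sizes (illustrative only):

// Assumed sizes: batch = 2, n = 3 (two context words per n-gram), eSize = 4.
// size = batch * (n - 1) = 4, so words holds 4 word ids and
// Gather(model.embeddingW, words) yields embeddingBig of shape [4, 4].
// The reshape then folds each sample's context embeddings into one row:
//   dimSize[0] = embeddingBig.GetDim(0) / (n - 1) = 4 / 2 = 2   (one row per sample)
//   dimSize[1] = embeddingBig.GetDim(1) * (n - 1) = 4 * 2 = 8   (concatenated context)
// so hidden enters the first hidden layer with shape [2, 8].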
......
......@@ -186,7 +186,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if(isShuffled)
Shuffle(fn, trainFN);
#endif
printf("%s\n",trainFN);
FILE * file = fopen(trainFN, "rb");
CheckNTErrors(file, "cannot open training file!");
......@@ -286,7 +286,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
break;
}
if (step % 100 == 0) {
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
......
......@@ -30,6 +30,7 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./XBLAS.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -46,6 +47,7 @@ void PowerTest();
int main( int argc, const char ** argv )
{
LoadBLAS("/opt/Openblas/libopenblas.so");
//PowerTest();
//LittleTest();
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,10 +17,10 @@
/*
*
* This is a wrapper of the BLAS (Basic Linear Algebra Subprograms http://www.netlib.org/blas/)
* libraries. By using BLAS, we can access very fast matrix operations although they
* are also implemented in NiuTrans in a native manner. To use BLAS,
* set USE_BLAS.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-04-08
*
......@@ -45,34 +45,48 @@ HINSTANCE hBLASDll;
/* single-precision floating matrix-matrix multiplication */
void (*XBLAS_SGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating matrix-matrix multiplication */
void (*XBLAS_DGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
double *, OPENBLAS_CONST BLASINT);
/* single-precision floating vector-vector multiplication (rank-1) */
void (*XBLAS_SGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating vector-vector multiplication (rank-1) */
void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
double *, OPENBLAS_CONST BLASINT);
float (*XBLAS_SASUM)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
float (*XBLAS_ISAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
double (*XBLAS_DASUM)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
CBLAS_INDEX (*XBLAS_ISAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
CBLAS_INDEX (*XBLAS_IDAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
CBLAS_INDEX (*XBLAS_ISAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
CBLAS_INDEX (*XBLAS_IDAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
float (*XBLAS_SNRM2)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
void (*XBLAS_SSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
double (*XBLAS_DNRM2)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
void (*XBLAS_SSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a, float *x,OPENBLAS_CONST BLASINT incx);
void (*XBLAS_DSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double a, double *x,OPENBLAS_CONST BLASINT incx);
void (*XBLAS_SCOPY)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST float *y,OPENBLAS_CONST BLASINT incy);
void (*XBLAS_DCOPY)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST double *y,OPENBLAS_CONST BLASINT incy);
void (*XBLAS_SAXPY)(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST float a, OPENBLAS_CONST float *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST float *y, OPENBLAS_CONST BLASINT incy);
void (*XBLAS_DAXPY)(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST double a, OPENBLAS_CONST double *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST double *y, OPENBLAS_CONST BLASINT incy);
/* set the number of threads */
void (*XBLAS_SET_THREAD_NUM)(int);
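Once LoadBLAS (below) has bound these pointers, they are called exactly like the corresponding CBLAS routines. A small usage sketch, not part of this file, for a row-major 2x2 single-precision product:

// Sketch: C = alpha * A * B + beta * C through the loaded XBLAS_SGEMM pointer.
void Sgemm2x2Example()
{
    float A[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float B[4] = { 5.0f, 6.0f, 7.0f, 8.0f };
    float C[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    XBLAS_SGEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2,        // M, N, K
                1.0f, A, 2,     // alpha, A, lda
                B, 2,           // B, ldb
                0.0f, C, 2);    // beta, C, ldc
    // C is now { 19, 22, 43, 50 }
}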
......@@ -123,11 +137,25 @@ void LoadBLAS(const char * dllFileName)
(FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
(FARPROC&)XBLAS_SASUM = GetProcAddress(hBLASDll, "cblas_sasum");
(FARPROC&)XBLAS_DASUM = GetProcAddress(hBLASDll, "cblas_dasum");
(FARPROC&)XBLAS_ISAMAX = GetProcAddress(hBLASDll, "cblas_isamax");
(FARPROC&)XBLAS_IDAMAX = GetProcAddress(hBLASDll, "cblas_idamax");
(FARPROC&)XBLAS_ISAMIN = GetProcAddress(hBLASDll, "cblas_isamin");
(FARPROC&)XBLAS_IDAMIN = GetProcAddress(hBLASDll, "cblas_idamin");
(FARPROC&)XBLAS_SNRM2 = GetProcAddress(hBLASDll, "cblas_snrm2");
(FARPROC&)XBLAS_DNRM2 = GetProcAddress(hBLASDll, "cblas_dnrm2");
(FARPROC&)XBLAS_SSCAL = GetProcAddress(hBLASDll, "cblas_sscal");
(FARPROC&)XBLAS_DSCAL = GetProcAddress(hBLASDll, "cblas_dscal");
(FARPROC&)XBLAS_SCOPY = GetProcAddress(hBLASDll, "cblas_scopy");
(FARPROC&)XBLAS_DCOPY = GetProcAddress(hBLASDll, "cblas_dcopy");
(FARPROC&)XBLAS_SAXPY = GetProcAddress(hBLASDll, "cblas_saxpy");
(FARPROC&)XBLAS_DAXPY = GetProcAddress(hBLASDll, "cblas_daxpy");
/* multi-threading */
(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_set_num_threads");
......@@ -163,29 +191,51 @@ void LoadBLAS(const char * dllFileName)
(FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
(FARPROC&)XBLAS_SASUM = GetProcAddress(hBLASDll, "cblas_sasum");
(FARPROC&)XBLAS_DASUM = GetProcAddress(hBLASDll, "cblas_dasum");
(FARPROC&)XBLAS_ISAMAX = GetProcAddress(hBLASDll, "cblas_isamax");
(FARPROC&)XBLAS_IDAMAX = GetProcAddress(hBLASDll, "cblas_idamax");
(FARPROC&)XBLAS_ISAMIN = GetProcAddress(hBLASDll, "cblas_isamin");
(FARPROC&)XBLAS_IDAMIN = GetProcAddress(hBLASDll, "cblas_idamin");
(FARPROC&)XBLAS_SNRM2 = GetProcAddress(hBLASDll, "cblas_snrm2");
(FARPROC&)XBLAS_DNRM2 = GetProcAddress(hBLASDll, "cblas_dnrm2");
(FARPROC&)XBLAS_SSCAL = GetProcAddress(hBLASDll, "cblas_sscal");
(FARPROC&)XBLAS_DSCAL = GetProcAddress(hBLASDll, "cblas_dscal");
(FARPROC&)XBLAS_SCOPY = GetProcAddress(hBLASDll, "cblas_scopy");
(FARPROC&)XBLAS_DCOPY = GetProcAddress(hBLASDll, "cblas_dcopy");
(FARPROC&)XBLAS_SAXPY = GetProcAddress(hBLASDll, "cblas_saxpy");
(FARPROC&)XBLAS_DAXPY = GetProcAddress(hBLASDll, "cblas_daxpy");
/* multi-threading */
(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "MKL_Set_Num_Threads");
(FARPROC&)XBLAS_GET_CORE_NUM = GetProcAddress(hBLASDll, "MKL_Get_Max_Threads");
#endif // defined(MKL)
#else // _WIN32
XBLAS_SGEMM = &cblas_sgemm;
XBLAS_DGEMM = &cblas_dgemm;
XBLAS_SGER = &cblas_sger;
XBLAS_DGER = &cblas_dger;
XBLAS_SASUM = &cblas_sasum;
XBLAS_DASUM = &cblas_dasum;
XBLAS_ISAMAX = &cblas_isamax;
XBLAS_IDAMAX = &cblas_idamax;
XBLAS_ISAMIN = &cblas_isamin;
XBLAS_IDAMIN = &cblas_idamin;
XBLAS_SNRM2 = &cblas_snrm2;
XBLAS_DNRM2 = &cblas_dnrm2;
XBLAS_SSCAL = &cblas_sscal;
XBLAS_DSCAL = &cblas_dscal;
XBLAS_SCOPY = &cblas_scopy;
XBLAS_DCOPY = &cblas_dcopy;
XBLAS_SAXPY = &cblas_saxpy;
XBLAS_DAXPY = &cblas_daxpy;
#if defined(OPENBLAS)
XBLAS_SET_THREAD_NUM = &openblas_set_num_threads;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
......@@ -50,7 +50,7 @@ int CONST_MINUSONE = -1;
bool CONST_TRUE = true;
int verboseLevel = 0;
bool useBLAS = false;
bool useBLAS = true;
bool useCUDA = false;
FILE * tmpLog = NULL;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -43,7 +43,7 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#define _XINLINE_
//#define DOUBELPRICSION
......@@ -155,11 +155,28 @@ extern bool useCUDA;
#define B2I(V) V==0?false:true
#define SCAL XBLAS_SSCAL
/* BLAS interfaces */
#ifdef DOUBELPRICSION
#define GEMM XBLAS_DGEMM
#define GER XBLAS_DGER
#define ASUM XBLAS_DASUM
#define IAMAX XBLAS_IDAMAX
#define IAMIN XBLAS_IDAMIN
#define NRM2 XBLAS_DNRM2
#define SCAL XBLAS_DSCAL
#define COPY XBLAS_DCOPY
#define AXPY XBLAS_DAXPY
#else
#define GEMM XBLAS_SGEMM
#define GER XBLAS_SGER
#define ASUM XBLAS_SASUM
#define IAMAX XBLAS_ISAMAX
#define IAMIN XBLAS_ISAMIN
#define NRM2 XBLAS_SNRM2
#define SCAL XBLAS_SSCAL
#define COPY XBLAS_SCOPY
#define AXPY XBLAS_SAXPY
#endif
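With DOUBELPRICSION left undefined, DTYPE is single precision, so AXPY, for example, resolves to XBLAS_SAXPY (bound to cblas_saxpy), which computes y <- a * x + y. A small sketch:

// Sketch: AXPY -> XBLAS_SAXPY -> cblas_saxpy, i.e. y[i] += a * x[i].
void AxpyExample()
{
    float x[3] = { 1.0f, 2.0f, 3.0f };
    float y[3] = { 10.0f, 10.0f, 10.0f };
    AXPY(3, 2.0f, x, 1, y, 1);   // y becomes { 12, 14, 16 }
}

This is the same call pattern _Sum uses below for its in-place case: AXPY(a->unitNum, beta, bp, 1, cp, 1).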
extern void InitGlobalAll();
......
......@@ -62,6 +62,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* we transform a higher order tensor to a matrix to kill the number
of calls of matrix multiplication */
printf("in MMUL\n");
if(transposedA == X_NOTRANS && a->order > 2 && b->order == 2){
int ncolA = a->dimSize[a->order - 1];
int ncolC = c->dimSize[c->order - 1];
......@@ -69,7 +70,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * c2 = NewTensor2D(c->unitNum/ncolC, -ncolC, c->dataType, c->devID, c->mem);
a2->data = a->data;
c2->data = c->data;
printf("before _MatMul\n");
_MatrixMul2D(a2, transposedA, b, transposedB, c2, alpha, beta, parallelRunner);
printf("after _MatMul\n");
a2->data = NULL;
c2->data = NULL;
delete a2;
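As a concrete case of the flattening above (assumed shapes, for illustration only):

// Worked sizes for the reshaping trick:
//   a: [8, 16, 32]  ->  a2: [8*16, 32] = [128, 32]   (a->unitNum / ncolA rows)
//   b: [32, 64]
//   c: [8, 16, 64]  ->  c2: [8*16, 64] = [128, 64]   (c->unitNum / ncolC rows)
// One _MatrixMul2D call on the flattened views replaces eight [16, 32] x [32, 64] products.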
......@@ -117,6 +120,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
bool isSparseMul = false;
printf("before bianli\n");
for (int p = 0; p < aBlockNum; p++) {
void * ap = (char*)a->data + aRealBlockSize * p;
for (int q = 0; q < bBlockNum; q++) {
......@@ -143,6 +147,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
}
}
printf("after bianli\n");
if (isSparseMul) {
for (int i = 0; i < aList->count; i++) {
......@@ -174,9 +179,11 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
printf("before _MatMul\n");
_MatrixMulBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
printf("after _MatMul\n");
}
for (int i = 0; i < aList->count; i++) {
......
......@@ -82,10 +82,11 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
#if defined(USE_BLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
#else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
#endif
}
else {
// TODO!!
......
......@@ -201,10 +201,7 @@ CheckNTErrors((a && b && c), "Empty input tensors!");
bi->data = (char*)b->data + i * bRealBlockSize;
ci->data = (char*)c->data + i * cRealBlockSize;
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
......@@ -233,6 +230,7 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
printf("in _MMULBATCHED\n");
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
......@@ -264,10 +262,7 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
......
......@@ -76,7 +76,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
......@@ -84,12 +84,34 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
DTYPE * ap = (DTYPE*)a->data;
DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data;
// when c != a, OpenBLAS needs to copy a to c first. This operation
// slows down the speed, so just use OpenBLAS when c == a
if(useBLAS && c == a){
cblas_saxpy(a->unitNum,1,bp,1,cp,1);
#if defined(USE_BLAS)
if( c == a){
AXPY(a->unitNum,beta,bp,1,cp,1);
} else{
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] + bp[i] * beta;
}
}
}
else{
#else
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
......@@ -111,8 +133,8 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
cp[i] = ap[i] + bp[i] * beta;
}
}
#endif
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
......@@ -124,7 +146,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
}
}
}
/*
tensor summation a = a + b * \beta (do it on site)
keep the result in the tensor a and return nothing
......@@ -138,7 +160,7 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
_Sum(a, b, a, beta);
}
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
......@@ -166,7 +188,7 @@ int GetSumDimIndex(const XTensor &a, const XTensor &b)
else
return -1;
}
/*
tensor summation c = a + b * \beta (return an XTensor structure)
make a new tensor c to keep the result and return it
......@@ -186,7 +208,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
if(n == -1){
/* call _Sum function */
_Sum(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
......@@ -194,7 +216,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
else if(n >= 0 && n < a.order){
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
......@@ -203,7 +225,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
......
......@@ -54,7 +54,6 @@ void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int bm = b->dimSize[1];
int cn = c->dimSize[0];
int cm = c->dimSize[1];
printf("4\n");
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, cn, cm, am, alpha, (DTYPE*)a->data, am, (DTYPE*)b->data, bm, beta, (DTYPE*)c->data, cm);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -30,7 +30,7 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
scale and shift all tensor entries
b = a * scale + shift
......@@ -71,8 +71,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
else{
DTYPE * va = (DTYPE*)a->data;
DTYPE * vb = (DTYPE*)b->data;
if(shift == 0 && useBLAS && a==b){
cblas_sscal(b->unitNum, scale, vb, 1);
#if defined(USE_BLAS)
if(shift == 0 && a==b){
SCAL(b->unitNum, scale, vb, 1);
} else{
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
......@@ -80,10 +81,17 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
vb++;
}
}
#else
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
va++;
vb++;
}
#endif
}
}
/*
scale and shift all tensor entries (do it on site)
keep the result in the input tensor a and return nothing
......@@ -98,7 +106,7 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
_ScaleAndShift(a, a, scale, shift);
}
/*
scale and shift all tensor entries (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -113,15 +121,15 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale);
XLink::AddParamToHead(&b, shift);
return b;
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -28,7 +28,7 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
get the max value of the items along a dimension of the tensor
>> input - the input tensor
......@@ -37,23 +37,23 @@ get the max value of the items along a dimension of the tensor
*/
void _ReduceMax(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!");
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
int dimRDI = input->order - dim - 1;
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
"Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
"Unmatched tensors!");
}
}
......@@ -82,9 +82,9 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
DTYPE * ip = (DTYPE*)input->data + blockSize * k;
DTYPE * op = (DTYPE*)output->data + stride * k;
for(int i = 0; i < stride; i++){
if(useBLAS){
*(op + i) = cblas_isamax(strideNum, ip + i, stride);
} else{
//#if defined(USE_BLAS)
// *(op + i) = *(ip + i + (int)(stride * IAMAX(strideNum, ip + i, stride)));
//#else
DTYPE max = FLOAT_MIN;
DTYPE * ipe = ip + blockSize;
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
......@@ -93,13 +93,13 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
max = v;
}
*(op + i) = max;
}
//#endif
}
}
}
}
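Note on the disabled path above: cblas_isamax returns the index of the entry with the largest absolute value, not the value itself, so the deleted assignment stored an index; the commented-out replacement indexes back into the data, but an absolute-value maximum can still differ from the signed maximum computed by the loop. A sketch of the semantics, assuming a direct CBLAS link (the library itself binds these symbols via LoadBLAS):

#include <cblas.h>

void IsamaxExample()
{
    float v[4] = { 1.0f, -9.0f, 3.0f, 2.0f };
    CBLAS_INDEX idx = cblas_isamax(4, v, 1);   // idx == 1: |-9| has the largest magnitude
    float picked = v[idx];                     // -9.0f, while the signed maximum is 3.0f
    (void)picked;
}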
/*
get the max value of the items along a dimension of the tensor (return an XTensor structure).
make a new tensor to keep the result and return it
......@@ -110,7 +110,7 @@ make a new tensor to keep the result and return it
XTensor ReduceMax(const XTensor &input, int dim)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -126,7 +126,7 @@ XTensor ReduceMax(const XTensor &input, int dim)
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -25,10 +25,11 @@
#include "../../XName.h"
#include "../../XBLAS.h"
#include "../arithmetic/XTensorBLAS.h"
#include <iostream>
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
sum the items along a dimension of the tensor
For a 1-dimensional data array a,
......@@ -44,7 +45,7 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!");
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
......@@ -145,22 +146,23 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
else{
if(bias == 0){
if(power == (DTYPE)1.0){
if(useBLAS)
sum = cblas_sasum(strideNum, ip + i, stride);
else
//#if defined(USE_BLAS)
// sum = ASUM(strideNum, ip + i, stride);
//#else
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
sum += *ipb;
//#endif
}
else if(power == (DTYPE)2.0){
if(useBLAS){
sum = cblas_snrm2(strideNum, ip + i, stride);
sum = sum * sum;
} else{
//#if defined(USE_BLAS)
// sum = NRM2(strideNum, ip + i, stride);
// sum = sum * sum;
//#else
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
DTYPE value = (*ipb);
sum += value * value;
}
}
//#endif
}
else if(power == (DTYPE)0.5){
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
......@@ -177,11 +179,12 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
}
else{
if(power == (DTYPE)1.0){
if(useBLAS)
sum = cblas_sasum(strideNum, ip + i, stride);
else
//#if defined(USE_BLAS)
// sum = ASUM(strideNum, ip + i, stride);
//#else
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
sum += *ipb;
//#endif
sum -= strideNum * bias;
}
else if(power == (DTYPE)2.0){
......@@ -210,7 +213,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
}
}
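Similarly for the commented-out ASUM shortcut above: cblas_sasum sums absolute values, so it only matches the plain accumulation loop when all entries are non-negative. The NRM2 branch is consistent for power == 2, since nrm2(x) squared equals the sum of squares. A sketch, again assuming a direct CBLAS link:

#include <cblas.h>

void SasumExample()
{
    float v[3] = { 1.0f, -2.0f, 3.0f };
    float absSum    = cblas_sasum(3, v, 1);    // 6.0f = |1| + |-2| + |3|
    float signedSum = v[0] + v[1] + v[2];      // 2.0f, what the fallback loop computes
    (void)absSum; (void)signedSum;
}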
/*
sum the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -228,7 +231,7 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -244,7 +247,7 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, &shift, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
......@@ -257,7 +260,7 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
return output;
}
/*
sum the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -274,7 +277,7 @@ sum = \sum_i exp((a_i)^power) if isExp == true
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -290,7 +293,7 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
......