Commit 77663a3c by Tianzhi

Delete amax, asum, etc.; also delete the global variable useBLAS.

parent e42e0bf7
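The recurring change in this commit swaps the run-time useBLAS branch for a compile-time USE_BLAS guard, so the unused path is not compiled at all. A minimal before/after sketch, taken from the _MatrixMul2D hunk below:

    /* before: branch on the global flag at run time */
    if (useBLAS)
        _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
    else
        _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);

    /* after: decide at build time */
    #if defined(USE_BLAS)
        _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
    #else
        _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
    #endif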
(Tokenized parallel training-data diff omitted: lines of numeric token IDs separated by |||, too large to display meaningfully.)
@@ -14,7 +14,7 @@ CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
CUDA_INCLUDE = $(CUDA_ROOT)/include
# use MKL
USE_MKL = 0
USE_MKL = 1
INTEL_ROOT = /opt/intel
MKL_ROOT = /opt/intel/mkl
MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
This source diff could not be displayed because it is too large. You can view the blob instead.
No preview for this file type
@@ -121,12 +121,18 @@ int FNNLMMain(int argc, const char ** argv)
/* load arguments */
LoadArgs(argc, argv, model);
printf("After load argu\n");
/* check the setting */
Check(model);
printf("After check setting\n");
/* initialize model parameters */
Init(model);
printf("After init model\n");
/* learn model parameters */
if(strcmp(trainFN, ""))
Train(trainFN, shuffled, model);
@@ -414,7 +420,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* make a model to keep gradients */
FNNModel grad;
printf("before copy\n");
Copy(grad, model);
printf("after copy\n");
/* XNet for automatic differentiation */
XNet autoDiffer;
@@ -455,6 +463,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
printf("after make the gold tensor\n");
if(!autoDiff){
/* prepare an empty network for building the fnn */
@@ -473,19 +482,23 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
Update(model, grad, learningRate, false);
}
else{
printf("in autodiff\n");
/* gradient = 0 */
Clear(model, true);
printf("after clear\n");
/* forward + backward process */
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
printf("after implemented by gather function\n");
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
/* automatic differentiation */
autoDiffer.Backward(output, gold, CROSSENTROPY);
printf("after autodiff\n");
/* update model parameters */
Update(model, grad, learningRate, true);
@@ -992,6 +1005,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
int size = batch * (n-1);
int * index = new int[size];
printf("in FAutoDiff, before bianli\n");
for(int i = 0; i < batch; i++){
for (int j = 0; j < n-1; j++){
int a = i * (n - 1) + j;
@@ -999,9 +1013,11 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
}
}
printf("in FAutoDiff, before init tnesor 1d\n");
InitTensor1D(&words, size, X_INT, model.devID, model.mem);
words.SetData(index, size);
printf("in FAutoDiff, before gather\n");
embeddingBig = Gather(model.embeddingW, words);
delete[] index;
@@ -1010,12 +1026,15 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
printf("in FAutoDiff, before reshape\n");
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
printf("in FAutoDiff, before hidden layers\n");
/* hidden layers */
for(int i = 0; i < depth; i++)
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
printf("in FAutoDiff, before output layer\n");
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
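The loop body elided at the hunk boundary above fills index[] with the context words of each n-gram. A sketch of the whole gather-based lookup, with the missing assignment reconstructed as an assumption (the NGram field name words is assumed; everything else is from the diff):

    /* flatten the (n - 1) context words of all `batch` n-grams into
       one row-major [batch, n - 1] index array */
    int size = batch * (n - 1);
    int * index = new int[size];
    for (int i = 0; i < batch; i++) {
        for (int j = 0; j < n - 1; j++) {
            int a = i * (n - 1) + j;
            index[a] = ngrams[i].words[j];   /* assumed field name */
        }
    }

    /* one Gather call fetches every embedding at once: [size, embDim] */
    InitTensor1D(&words, size, X_INT, model.devID, model.mem);
    words.SetData(index, size);
    embeddingBig = Gather(model.embeddingW, words);
    delete[] index;

    /* regroup rows so each sample holds its concatenated context:
       [batch, (n - 1) * embDim] */
    dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
    dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
    hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);

This is also why the alternative path is labelled "implemented by multiply function": the same lookup can be written as a multiplication against one-hot inputs, but a gather avoids materializing them.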
@@ -186,7 +186,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if(isShuffled)
Shuffle(fn, trainFN);
#endif
printf("%s\n",trainFN);
FILE * file = fopen(trainFN, "rb");
CheckNTErrors(file, "cannot open training file!");
@@ -286,7 +286,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
break;
}
if (step % 100 == 0) {
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
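For orientation, the two perplexities printed here are exponentiated average log-losses, matching the exp(loss/wordCount) and exp(-prob/wc) arguments above:

    \mathrm{ppl} = \exp\!\left(\frac{\mathrm{loss}}{\mathrm{wordCount}}\right),
    \qquad
    \mathrm{sppl} = \exp\!\left(\frac{-\mathrm{prob}}{\mathrm{wc}}\right)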
@@ -30,6 +30,7 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./XBLAS.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
@@ -46,6 +47,7 @@ void PowerTest();
int main( int argc, const char ** argv )
{
LoadBLAS("/opt/Openblas/libopenblas.so");
//PowerTest();
//LittleTest();
@@ -68,11 +68,25 @@ void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OP
double *, OPENBLAS_CONST BLASINT);
float (*XBLAS_SASUM)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
float (*XBLAS_ISAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
double (*XBLAS_DASUM)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
CBLAS_INDEX (*XBLAS_ISAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
CBLAS_INDEX (*XBLAS_IDAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
CBLAS_INDEX (*XBLAS_ISAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
CBLAS_INDEX (*XBLAS_IDAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
float (*XBLAS_SNRM2)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
void (*XBLAS_SSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
double (*XBLAS_DNRM2)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
void (*XBLAS_SSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a, float *x,OPENBLAS_CONST BLASINT incx);
void (*XBLAS_DSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double a, double *x,OPENBLAS_CONST BLASINT incx);
void (*XBLAS_SCOPY)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST float *y,OPENBLAS_CONST BLASINT incy);
void (*XBLAS_DCOPY)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST double *y,OPENBLAS_CONST BLASINT incy);
void (*XBLAS_SAXPY)(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST float a, OPENBLAS_CONST float *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST float *y, OPENBLAS_CONST BLASINT incy);
void (*XBLAS_DAXPY)(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST double a, OPENBLAS_CONST double *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST double *y, OPENBLAS_CONST BLASINT incy);
/* set the number of threads */
void (*XBLAS_SET_THREAD_NUM)(int);
@@ -123,11 +137,25 @@ void LoadBLAS(const char * dllFileName)
(FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
(FARPROC&)XBLAS_SASUM = GetProcAddress(hBLASDll, "cblas_sasum");
(FARPROC&)XBLAS_DASUM = GetProcAddress(hBLASDll, "cblas_dasum");
(FARPROC&)XBLAS_ISAMAX = GetProcAddress(hBLASDll, "cblas_isamax");
(FARPROC&)XBLAS_IDAMAX = GetProcAddress(hBLASDll, "cblas_idamax");
(FARPROC&)XBLAS_ISAMIN = GetProcAddress(hBLASDll, "cblas_isamin");
(FARPROC&)XBLAS_IDAMIN = GetProcAddress(hBLASDll, "cblas_idamin");
(FARPROC&)XBLAS_SNRM2 = GetProcAddress(hBLASDll, "cblas_snrm2");
(FARPROC&)XBLAS_DNRM2 = GetProcAddress(hBLASDll, "cblas_dnrm2");
(FARPROC&)XBLAS_SSCAL = GetProcAddress(hBLASDll, "cblas_sscal");
(FARPROC&)XBLAS_DSCAL = GetProcAddress(hBLASDll, "cblas_dscal");
(FARPROC&)XBLAS_SCOPY = GetProcAddress(hBLASDll, "cblas_scopy");
(FARPROC&)XBLAS_DCOPY = GetProcAddress(hBLASDll, "cblas_dcopy");
(FARPROC&)XBLAS_SAXPY = GetProcAddress(hBLASDll, "cblas_saxpy");
(FARPROC&)XBLAS_DAXPY = GetProcAddress(hBLASDll, "cblas_daxpy");
/* multi-threading */
(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_set_num_threads");
@@ -163,11 +191,25 @@ void LoadBLAS(const char * dllFileName)
(FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
(FARPROC&)XBLAS_SASUM = GetProcAddress(hBLASDll, "cblas_sasum");
(FARPROC&)XBLAS_DASUM = GetProcAddress(hBLASDll, "cblas_dasum");
(FARPROC&)XBLAS_ISAMAX = GetProcAddress(hBLASDll, "cblas_isamax");
(FARPROC&)XBLAS_IDAMAX = GetProcAddress(hBLASDll, "cblas_idamax");
(FARPROC&)XBLAS_ISAMIN = GetProcAddress(hBLASDll, "cblas_isamin");
(FARPROC&)XBLAS_IDAMIN = GetProcAddress(hBLASDll, "cblas_idamin");
(FARPROC&)XBLAS_SNRM2 = GetProcAddress(hBLASDll, "cblas_snrm2");
(FARPROC&)XBLAS_DNRM2 = GetProcAddress(hBLASDll, "cblas_dnrm2");
(FARPROC&)XBLAS_SSCAL = GetProcAddress(hBLASDll, "cblas_sscal");
(FARPROC&)XBLAS_DSCAL = GetProcAddress(hBLASDll, "cblas_dscal");
(FARPROC&)XBLAS_SCOPY = GetProcAddress(hBLASDll, "cblas_scopy");
(FARPROC&)XBLAS_DCOPY = GetProcAddress(hBLASDll, "cblas_dcopy");
(FARPROC&)XBLAS_SAXPY = GetProcAddress(hBLASDll, "cblas_saxpy");
(FARPROC&)XBLAS_DAXPY = GetProcAddress(hBLASDll, "cblas_daxpy");
/* multi-threading */
(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "MKL_Set_Num_Threads");
@@ -181,11 +223,19 @@ void LoadBLAS(const char * dllFileName)
XBLAS_DGER = &cblas_dger;
XBLAS_SASUM = &cblas_sasum;
XBLAS_DASUM = &cblas_dasum;
XBLAS_ISAMAX = &cblas_isamax;
XBLAS_IDAMAX = &cblas_idamax;
XBLAS_ISAMIN = &cblas_isamin;
XBLAS_IDAMIN = &cblas_idamin;
XBLAS_SNRM2 = &cblas_snrm2;
XBLAS_DNRM2 = &cblas_dnrm2;
XBLAS_SSCAL = &cblas_sscal;
XBLAS_DSCAL = &cblas_dscal;
XBLAS_SCOPY = &cblas_scopy;
XBLAS_DCOPY = &cblas_dcopy;
XBLAS_SAXPY = &cblas_saxpy;
XBLAS_DAXPY = &cblas_daxpy;
#if defined(OPENBLAS)
XBLAS_SET_THREAD_NUM = &openblas_set_num_threads;
@@ -37,6 +37,7 @@ namespace nts{
//#define OPENBLAS
#define OPENBLAS_CONST const
#define CBLAS_INDEX size_t
typedef int BLASINT;
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
@@ -98,12 +99,25 @@ extern "C" void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BL
double *, OPENBLAS_CONST BLASINT);
extern "C" float (*XBLAS_SASUM)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" float (*XBLAS_ISAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" float (*XBLAS_ISAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" double (*XBLAS_DASUM)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
extern "C" CBLAS_INDEX (*XBLAS_ISAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" CBLAS_INDEX (*XBLAS_IDAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
extern "C" CBLAS_INDEX (*XBLAS_ISAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" CBLAS_INDEX (*XBLAS_IDAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
extern "C" float (*XBLAS_SNRM2)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" void (*XBLAS_SSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" double (*XBLAS_DNRM2)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
extern "C" void (*XBLAS_SSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a, float *x,OPENBLAS_CONST BLASINT incx);
extern "C" void (*XBLAS_DSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double a, double *x,OPENBLAS_CONST BLASINT incx);
extern "C" void (*XBLAS_SCOPY)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST float *y,OPENBLAS_CONST BLASINT incy);
extern "C" void (*XBLAS_DCOPY)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST double *y,OPENBLAS_CONST BLASINT incy);
extern "C" void (*XBLAS_SAXPY)(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST float a, OPENBLAS_CONST float *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST float *y, OPENBLAS_CONST BLASINT incy);
extern "C" void (*XBLAS_DAXPY)(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST double a, OPENBLAS_CONST double *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST double *y, OPENBLAS_CONST BLASINT incy);
/* set the number of threads */
@@ -144,12 +158,25 @@ extern "C" void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONS
double *A, OPENBLAS_CONST BLASINT lda);
extern "C" float cblas_sasum (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" float cblas_isamax (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" float cblas_isamin (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" double cblas_dasum (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
extern "C" CBLAS_INDEX cblas_isamax (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" CBLAS_INDEX cblas_idamax (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
extern "C" CBLAS_INDEX cblas_isamin (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" CBLAS_INDEX cblas_idamin (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
extern "C" float cblas_snrm2 (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" void cblas_sscal (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
extern "C" double cblas_dnrm2 (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
extern "C" void cblas_sscal (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a, float *x,OPENBLAS_CONST BLASINT incx);
extern "C" void cblas_dscal (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double a, double *x,OPENBLAS_CONST BLASINT incx);
extern "C" void cblas_scopy (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST float *y,OPENBLAS_CONST BLASINT incy);
extern "C" void cblas_dcopy (OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST double *y,OPENBLAS_CONST BLASINT incy);
extern "C" void cblas_saxpy (OPENBLAS_CONST BLASINT n, OPENBLAS_CONST float a, OPENBLAS_CONST float *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST float *y, OPENBLAS_CONST BLASINT incy);
extern "C" void cblas_daxpy (OPENBLAS_CONST BLASINT n, OPENBLAS_CONST double a, OPENBLAS_CONST double *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST double *y, OPENBLAS_CONST BLASINT incy);
#if defined(OPENBLAS)
/* better control of multi-threading */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
@@ -50,7 +50,7 @@ int CONST_MINUSONE = -1;
bool CONST_TRUE = true;
int verboseLevel = 0;
bool useBLAS = false;
bool useBLAS = true;
bool useCUDA = false;
FILE * tmpLog = NULL;
@@ -155,11 +155,28 @@ extern bool useCUDA;
#define B2I(V) V==0?false:true
#define SCAL XBLAS_SSCAL
/* BLAS interfaces */
#ifdef DOUBELPRICSION
#define GEMM XBLAS_DGEMM
#define GER XBLAS_DGER
#define ASUM XBLAS_DASUM
#define IAMAX XBLAS_IDAMAX
#define IAMIN XBLAS_IDAMIN
#define NRM2 XBLAS_DNRM2
#define SCAL XBLAS_DSCAL
#define COPY XBLAS_DCOPY
#define AXPY XBLAS_DAXPY
#else
#define GEMM XBLAS_SGEMM
#define GER XBLAS_SGER
#define ASUM XBLAS_SASUM
#define IAMAX XBLAS_ISAMAX
#define IAMIN XBLAS_ISAMIN
#define NRM2 XBLAS_SNRM2
#define SCAL XBLAS_SSCAL
#define COPY XBLAS_SCOPY
#define AXPY XBLAS_SAXPY
#endif
extern void InitGlobalAll();
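With these macros a single call site serves both precisions: under the default single-precision build AXPY expands to XBLAS_SAXPY, and to XBLAS_DAXPY when DOUBELPRICSION is defined. For example, the _Sum change later in this diff is written once as:

    /* DTYPE is float by default, double under DOUBELPRICSION */
    DTYPE * bp = (DTYPE*)b->data;
    DTYPE * cp = (DTYPE*)c->data;
    AXPY(a->unitNum, beta, bp, 1, cp, 1);   /* cp := cp + beta * bp */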
@@ -62,6 +62,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* we transform a higher order tensor to a matrix to kill the number
of calls of matrix multiplication */
printf("in MMUL\n");
if(transposedA == X_NOTRANS && a->order > 2 && b->order == 2){
int ncolA = a->dimSize[a->order - 1];
int ncolC = c->dimSize[c->order - 1];
@@ -69,7 +70,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * c2 = NewTensor2D(c->unitNum/ncolC, -ncolC, c->dataType, c->devID, c->mem);
a2->data = a->data;
c2->data = c->data;
printf("before _MatMul\n");
_MatrixMul2D(a2, transposedA, b, transposedB, c2, alpha, beta, parallelRunner);
printf("after _MatMul\n");
a2->data = NULL;
c2->data = NULL;
delete a2;
@@ -117,6 +120,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
bool isSparseMul = false;
printf("before bianli\n");
for (int p = 0; p < aBlockNum; p++) {
void * ap = (char*)a->data + aRealBlockSize * p;
for (int q = 0; q < bBlockNum; q++) {
@@ -143,6 +147,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
}
}
printf("after bianli\n");
if (isSparseMul) {
for (int i = 0; i < aList->count; i++) {
@@ -174,9 +179,11 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
printf("before _MatMul\n");
_MatrixMulBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
printf("after _MatMul\n");
}
for (int i = 0; i < aList->count; i++) {
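The comment at the top of _MatrixMul explains the trick used throughout this file: fold all leading dimensions of a higher-order tensor into the row dimension, so one large matrix product replaces many small ones. A self-contained illustration with plain loops standing in for the GEMM call (shapes made up for the example):

    #include <cstdio>

    /* multiply a [B, M, K] tensor by a [K, N] matrix by viewing the
       tensor as a single [(B*M), K] matrix: one product, not B of them */
    int main()
    {
        const int B = 2, M = 3, K = 4, N = 5;
        float a[B * M * K], b[K * N], c[B * M * N];
        for (int i = 0; i < B * M * K; i++) a[i] = 0.1f * i;
        for (int i = 0; i < K * N; i++)     b[i] = 0.01f * i;

        const int rows = B * M;   /* the folded leading dimensions */
        for (int r = 0; r < rows; r++) {
            for (int j = 0; j < N; j++) {
                float s = 0.0f;
                for (int k = 0; k < K; k++)
                    s += a[r * K + k] * b[k * N + j];
                c[r * N + j] = s;
            }
        }
        printf("c[0] = %g, c[last] = %g\n", c[0], c[rows * N - 1]);
        return 0;
    }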
@@ -82,10 +82,11 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
#if defined(USE_BLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
#else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
#endif
}
else {
// TODO!!
@@ -201,10 +201,7 @@ CheckNTErrors((a && b && c), "Empty input tensors!");
bi->data = (char*)b->data + i * bRealBlockSize;
ci->data = (char*)c->data + i * cRealBlockSize;
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
@@ -233,6 +230,7 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
printf("in _MMULBATCHED\n");
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
@@ -264,10 +262,7 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
@@ -86,10 +86,32 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
DTYPE * cp = (DTYPE*)c->data;
// when c != a, OpenBLAS needs to copy a to c first. This operation
// slow down the speed, so just use OpenBLAS when c == a
if(useBLAS && c == a){
cblas_saxpy(a->unitNum,1,bp,1,cp,1);
#if defined(USE_BLAS)
if( c == a){
AXPY(a->unitNum,beta,bp,1,cp,1);
} else{
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] + bp[i] * beta;
}
}
}
#else
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
@@ -111,7 +133,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
cp[i] = ap[i] + bp[i] * beta;
}
}
}
#endif
}
else {
// TODO!!
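The c == a restriction follows from the semantics of saxpy: cblas_saxpy computes y := alpha*x + y strictly in place, so it yields a + beta*b directly only when the output buffer already holds a; for c != a, a would first have to be copied into c, which is exactly the slowdown the comment above describes. Note the new call also passes beta where the old one hard-coded 1. A tiny reference model of the semantics (not the library's code):

    /* reference semantics of cblas_saxpy: y := alpha * x + y */
    static void saxpyRef(int n, float alpha, const float * x, float * y)
    {
        for (int i = 0; i < n; i++)
            y[i] += alpha * x[i];
    }
    /* with cp == ap, saxpyRef(a->unitNum, beta, bp, cp) leaves
       a + beta*b in a's own buffer, no extra copy needed */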
@@ -54,7 +54,6 @@ void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int bm = b->dimSize[1];
int cn = c->dimSize[0];
int cm = c->dimSize[1];
printf("4\n");
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, cn, cm, am, alpha, (DTYPE*)a->data, am, (DTYPE*)b->data, bm, beta, (DTYPE*)c->data, cm);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
@@ -71,8 +71,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
else{
DTYPE * va = (DTYPE*)a->data;
DTYPE * vb = (DTYPE*)b->data;
if(shift == 0 && useBLAS && a==b){
cblas_sscal(b->unitNum, scale, vb, 1);
#if defined(USE_BLAS)
if(shift == 0 && a==b){
SCAL(b->unitNum, scale, vb, 1);
} else{
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
@@ -80,6 +81,13 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
vb++;
}
}
#else
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
va++;
vb++;
}
#endif
}
}
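Same pattern as _Sum: cblas_sscal rescales its argument strictly in place (x := alpha * x) and cannot add a constant, hence the BLAS path is taken only when b aliases a and shift == 0. Reference semantics for orientation (not the library's code):

    /* reference semantics of cblas_sscal: x := alpha * x, in place,
       touching n elements spaced incx apart */
    static void sscalRef(int n, float alpha, float * x, int incx)
    {
        for (int i = 0; i < n; i++)
            x[i * incx] *= alpha;
    }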
@@ -82,9 +82,9 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
DTYPE * ip = (DTYPE*)input->data + blockSize * k;
DTYPE * op = (DTYPE*)output->data + stride * k;
for(int i = 0; i < stride; i++){
if(useBLAS){
*(op + i) = cblas_isamax(strideNum, ip + i, stride);
} else{
//#if defined(USE_BLAS)
// *(op + i) = *(ip + i + (int)(stride * IAMAX(strideNum, ip + i, stride)));
//#else
DTYPE max = FLOAT_MIN;
DTYPE * ipe = ip + blockSize;
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
@@ -93,7 +93,7 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
max = v;
}
*(op + i) = max;
}
//#endif
}
}
}
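Leaving this path commented out is sound: cblas_isamax returns the index of the element with the largest absolute value, not the value itself, so the old *(op + i) = cblas_isamax(...) stored an index where a maximum was expected; and even the corrected form in the comment selects by |x|, which disagrees with a signed max whenever the most negative entry has the largest magnitude. A two-line demonstration:

    /* x = {-5, 1, 2}: isamax picks index 0 (|x| = 5), value -5,
       while the signed maximum that _ReduceMax needs is 2 */
    float x[3] = {-5.0f, 1.0f, 2.0f};
    /* cblas_isamax(3, x, 1) == 0, but max over x is x[2] == 2 */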
@@ -25,6 +25,7 @@
#include "../../XName.h"
#include "../../XBLAS.h"
#include "../arithmetic/XTensorBLAS.h"
#include <iostream>
namespace nts{ // namespace nts(NiuTrans.Tensor)
@@ -145,22 +146,23 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
else{
if(bias == 0){
if(power == (DTYPE)1.0){
if(useBLAS)
sum = cblas_sasum(strideNum, ip + i, stride);
else
//#if defined(USE_BLAS)
// sum = ASUM(strideNum, ip + i, stride);
//#else
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
sum += *ipb;
//#endif
}
else if(power == (DTYPE)2.0){
if(useBLAS){
sum = cblas_snrm2(strideNum, ip + i, stride);
sum = sum * sum;
} else{
//#if defined(USE_BLAS)
// sum = NRM2(strideNum, ip + i, stride);
// sum = sum * sum;
//#else
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
DTYPE value = (*ipb);
sum += value * value;
}
}
//#endif
}
else if(power == (DTYPE)0.5){
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
@@ -177,11 +179,12 @@ }
}
else{
if(power == (DTYPE)1.0){
if(useBLAS)
sum = cblas_sasum(strideNum, ip + i, stride);
else
//#if defined(USE_BLAS)
// sum = ASUM(strideNum, ip + i, stride);
//#else
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
sum += *ipb;
//#endif
sum -= strideNum * bias;
}
else if(power == (DTYPE)2.0){
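The asum path stays commented out for the same kind of reason: cblas_sasum computes the sum of absolute values, which equals the plain sum only when every entry is non-negative, whereas nrm2 squared does match the power == 2.0 branch (a sum of squares) up to rounding. A self-contained check:

    #include <cstdio>
    #include <cmath>

    int main()
    {
        float x[2] = {-1.0f, 2.0f};
        float sum = 0, asum = 0, sq = 0;
        for (int i = 0; i < 2; i++) {
            sum  += x[i];               /* what the power == 1.0 branch needs */
            asum += std::fabs(x[i]);    /* what cblas_sasum computes */
            sq   += x[i] * x[i];        /* what nrm2 * nrm2 computes */
        }
        printf("sum=%g  sasum=%g  nrm2^2=%g\n", sum, asum, sq);   /* 1 3 5 */
        return 0;
    }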
(One diff was collapsed and seven more were too large to display; view the blobs instead.)