Commit 77663a3c by Tianzhi

Delete amax, asum, etc.; also delete the global variable useBLAS.

parent e42e0bf7
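Note: the recurring pattern in this diff replaces runtime checks on the old useBLAS flag with compile-time guards on USE_BLAS. A minimal, self-contained sketch of that dispatch pattern follows; the function names (MatMulBLAS, MatMulNative) are illustrative only and are not the NiuTrans API.

/* Sketch of compile-time kernel selection, mirroring the
   `if (useBLAS)` -> `#if defined(USE_BLAS)` change in this commit.
   Build with -DUSE_BLAS to take the BLAS branch. */
#include <stdio.h>

static void MatMulBLAS(void)   { printf("BLAS kernel\n"); }   /* hypothetical stand-in */
static void MatMulNative(void) { printf("native kernel\n"); } /* hypothetical stand-in */

static void MatMul(void)
{
#if defined(USE_BLAS)
    MatMulBLAS();    /* compiled in only when -DUSE_BLAS is set */
#else
    MatMulNative();  /* default path, no BLAS dependency */
#endif
}

int main(void)
{
    MatMul();
    return 0;
}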
@@ -14,7 +14,7 @@ CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
 CUDA_INCLUDE = $(CUDA_ROOT)/include
 # use MKL
-USE_MKL = 0
+USE_MKL = 1
 INTEL_ROOT = /opt/intel
 MKL_ROOT = /opt/intel/mkl
 MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
......
@@ -121,12 +121,18 @@ int FNNLMMain(int argc, const char ** argv)
 /* load arguments */
 LoadArgs(argc, argv, model);
+printf("After load argu\n");
 /* check the setting */
 Check(model);
+printf("After check setting\n");
 /* initialize model parameters */
 Init(model);
+printf("After init model\n");
 /* learn model parameters */
 if(strcmp(trainFN, ""))
 Train(trainFN, shuffled, model);
@@ -414,7 +420,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
 /* make a model to keep gradients */
 FNNModel grad;
+printf("before copy\n");
 Copy(grad, model);
+printf("after copy\n");
 /* XNet for automatic differentiation */
 XNet autoDiffer;
@@ -455,6 +463,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
 /* make the gold tensor */
 MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
+printf("after make the gold tensor\n");
 if(!autoDiff){
 /* prepare an empty network for building the fnn */
@@ -473,19 +482,23 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
 Update(model, grad, learningRate, false);
 }
 else{
+printf("in autodiff\n");
 /* gradient = 0 */
 Clear(model, true);
+printf("after clear\n");
 /* forward + backward process */
 /* this is implemented by gather function */
 ForwardAutoDiff(ngrams, ngramNum, output, model);
+printf("after implemented by gather function\n");
 /* this is implemented by multiply function */
 //ForwardAutoDiff(inputs, output, model);
 /* automatic differentiation */
 autoDiffer.Backward(output, gold, CROSSENTROPY);
+printf("after autodiff\n");
 /* update model parameters */
 Update(model, grad, learningRate, true);
@@ -992,6 +1005,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
 int size = batch * (n-1);
 int * index = new int[size];
+printf("in FAutoDiff, before bianli\n");
 for(int i = 0; i < batch; i++){
 for (int j = 0; j < n-1; j++){
 int a = i * (n - 1) + j;
@@ -999,9 +1013,11 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
 }
 }
+printf("in FAutoDiff, before init tnesor 1d\n");
 InitTensor1D(&words, size, X_INT, model.devID, model.mem);
 words.SetData(index, size);
+printf("in FAutoDiff, before gather\n");
 embeddingBig = Gather(model.embeddingW, words);
 delete[] index;
@@ -1010,13 +1026,16 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
 dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
 dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
+printf("in FAutoDiff, before reshape\n");
 hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
+printf("in FAutoDiff, before hidden layers\n");
 /* hidden layers */
 for(int i = 0; i < depth; i++)
 hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
+printf("in FAutoDiff, before output layer\n");
 /* output layer */
 output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
 }
......
@@ -186,7 +186,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
 if(isShuffled)
 Shuffle(fn, trainFN);
 #endif
+printf("%s\n",trainFN);
 FILE * file = fopen(trainFN, "rb");
 CheckNTErrors(file, "cannot open training file!");
@@ -286,7 +286,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
 break;
 }
-if (step % 100 == 0) {
+if (step % 1 == 0) {
 double elapsed = GetClockSec() - startT;
 XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
 elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
......
@@ -30,6 +30,7 @@
 #include "XDevice.h"
 #include "./test/Test.h"
 #include "./core/CHeader.h"
+#include "./XBLAS.h"
 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
@@ -46,6 +47,7 @@ void PowerTest();
 int main( int argc, const char ** argv )
 {
+LoadBLAS("/opt/Openblas/libopenblas.so");
 //PowerTest();
 //LittleTest();
......
 /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,10 +17,10 @@
 /*
 *
 * This is a wrapper of the BLAS (Basic Linear Algebra Subprograms http://www.netlib.org/blas/)
 * libraries. By using BLAS, we can access very fast matrix operations although they
 * are also implemented in NiuTrans in a native manner. To use BLAS,
 * set USE_BLAS.
 *
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-04-08
 *
@@ -45,34 +45,48 @@ HINSTANCE hBLASDll;
 /* single-precision floating matrix-matrix multiplication */
 void (*XBLAS_SGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
 OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
 OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
 OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
 float *, OPENBLAS_CONST BLASINT);
 /* double-precision floating matrix-matrix multiplication */
 void (*XBLAS_DGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
 OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
 OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
 OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
 double *, OPENBLAS_CONST BLASINT);
 /* single-precision floating vector-vector multiplication (rank-1) */
 void (*XBLAS_SGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
 OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
 float *, OPENBLAS_CONST BLASINT);
 /* double-precision floating vector-vector multiplication (rank-1) */
 void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
 OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
 double *, OPENBLAS_CONST BLASINT);
 float (*XBLAS_SASUM)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
-float (*XBLAS_ISAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
+double (*XBLAS_DASUM)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
+CBLAS_INDEX (*XBLAS_ISAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
+CBLAS_INDEX (*XBLAS_IDAMAX)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
+CBLAS_INDEX (*XBLAS_ISAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
+CBLAS_INDEX (*XBLAS_IDAMIN)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
 float (*XBLAS_SNRM2)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
-void (*XBLAS_SSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx);
+double (*XBLAS_DNRM2)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx);
+void (*XBLAS_SSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float a, float *x,OPENBLAS_CONST BLASINT incx);
+void (*XBLAS_DSCAL)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double a, double *x,OPENBLAS_CONST BLASINT incx);
 void (*XBLAS_SCOPY)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST float *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST float *y,OPENBLAS_CONST BLASINT incy);
+void (*XBLAS_DCOPY)(OPENBLAS_CONST BLASINT n,OPENBLAS_CONST double *x,OPENBLAS_CONST BLASINT incx,OPENBLAS_CONST double *y,OPENBLAS_CONST BLASINT incy);
 void (*XBLAS_SAXPY)(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST float a, OPENBLAS_CONST float *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST float *y, OPENBLAS_CONST BLASINT incy);
+void (*XBLAS_DAXPY)(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST double a, OPENBLAS_CONST double *x, OPENBLAS_CONST BLASINT incx, OPENBLAS_CONST double *y, OPENBLAS_CONST BLASINT incy);
 /* set the number of threads */
 void (*XBLAS_SET_THREAD_NUM)(int);
@@ -123,11 +137,25 @@ void LoadBLAS(const char * dllFileName)
 (FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
 (FARPROC&)XBLAS_SASUM = GetProcAddress(hBLASDll, "cblas_sasum");
+(FARPROC&)XBLAS_DASUM = GetProcAddress(hBLASDll, "cblas_dasum");
 (FARPROC&)XBLAS_ISAMAX = GetProcAddress(hBLASDll, "cblas_isamax");
+(FARPROC&)XBLAS_IDAMAX = GetProcAddress(hBLASDll, "cblas_idamax");
+(FARPROC&)XBLAS_ISAMIN = GetProcAddress(hBLASDll, "cblas_isamin");
+(FARPROC&)XBLAS_IDAMIN = GetProcAddress(hBLASDll, "cblas_idamin");
 (FARPROC&)XBLAS_SNRM2 = GetProcAddress(hBLASDll, "cblas_snrm2");
+(FARPROC&)XBLAS_DNRM2 = GetProcAddress(hBLASDll, "cblas_dnrm2");
 (FARPROC&)XBLAS_SSCAL = GetProcAddress(hBLASDll, "cblas_sscal");
+(FARPROC&)XBLAS_DSCAL = GetProcAddress(hBLASDll, "cblas_dscal");
 (FARPROC&)XBLAS_SCOPY = GetProcAddress(hBLASDll, "cblas_scopy");
+(FARPROC&)XBLAS_DCOPY = GetProcAddress(hBLASDll, "cblas_dcopy");
 (FARPROC&)XBLAS_SAXPY = GetProcAddress(hBLASDll, "cblas_saxpy");
+(FARPROC&)XBLAS_DAXPY = GetProcAddress(hBLASDll, "cblas_daxpy");
 /* multi-threading */
 (FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_set_num_threads");
@@ -163,29 +191,51 @@ void LoadBLAS(const char * dllFileName)
 (FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
 (FARPROC&)XBLAS_SASUM = GetProcAddress(hBLASDll, "cblas_sasum");
+(FARPROC&)XBLAS_DASUM = GetProcAddress(hBLASDll, "cblas_dasum");
 (FARPROC&)XBLAS_ISAMAX = GetProcAddress(hBLASDll, "cblas_isamax");
+(FARPROC&)XBLAS_IDAMAX = GetProcAddress(hBLASDll, "cblas_idamax");
+(FARPROC&)XBLAS_ISAMIN = GetProcAddress(hBLASDll, "cblas_isamin");
+(FARPROC&)XBLAS_IDAMIN = GetProcAddress(hBLASDll, "cblas_idamin");
 (FARPROC&)XBLAS_SNRM2 = GetProcAddress(hBLASDll, "cblas_snrm2");
+(FARPROC&)XBLAS_DNRM2 = GetProcAddress(hBLASDll, "cblas_dnrm2");
 (FARPROC&)XBLAS_SSCAL = GetProcAddress(hBLASDll, "cblas_sscal");
+(FARPROC&)XBLAS_DSCAL = GetProcAddress(hBLASDll, "cblas_dscal");
 (FARPROC&)XBLAS_SCOPY = GetProcAddress(hBLASDll, "cblas_scopy");
+(FARPROC&)XBLAS_DCOPY = GetProcAddress(hBLASDll, "cblas_dcopy");
 (FARPROC&)XBLAS_SAXPY = GetProcAddress(hBLASDll, "cblas_saxpy");
+(FARPROC&)XBLAS_DAXPY = GetProcAddress(hBLASDll, "cblas_daxpy");
 /* multi-threading */
 (FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "MKL_Set_Num_Threads");
 (FARPROC&)XBLAS_GET_CORE_NUM = GetProcAddress(hBLASDll, "MKL_Get_Max_Threads");
 #endif // defined(MKL)
 #else // _WIN32
 XBLAS_SGEMM = &cblas_sgemm;
 XBLAS_DGEMM = &cblas_dgemm;
 XBLAS_SGER = &cblas_sger;
 XBLAS_DGER = &cblas_dger;
 XBLAS_SASUM = &cblas_sasum;
+XBLAS_DASUM = &cblas_dasum;
 XBLAS_ISAMAX = &cblas_isamax;
+XBLAS_IDAMAX = &cblas_idamax;
+XBLAS_ISAMIN = &cblas_isamin;
+XBLAS_IDAMIN = &cblas_idamin;
 XBLAS_SNRM2 = &cblas_snrm2;
+XBLAS_DNRM2 = &cblas_dnrm2;
 XBLAS_SSCAL = &cblas_sscal;
+XBLAS_DSCAL = &cblas_dscal;
 XBLAS_SCOPY = &cblas_scopy;
+XBLAS_DCOPY = &cblas_dcopy;
 XBLAS_SAXPY = &cblas_saxpy;
+XBLAS_DAXPY = &cblas_daxpy;
 #if defined(OPENBLAS)
 XBLAS_SET_THREAD_NUM = &openblas_set_num_threads;
......
 /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
 * All rights reserved.
 *
@@ -50,7 +50,7 @@ int CONST_MINUSONE = -1;
 bool CONST_TRUE = true;
 int verboseLevel = 0;
-bool useBLAS = false;
+bool useBLAS = true;
 bool useCUDA = false;
 FILE * tmpLog = NULL;
......
 /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -43,7 +43,7 @@
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts {
 #define _XINLINE_
 //#define DOUBELPRICSION
@@ -155,11 +155,28 @@ extern bool useCUDA;
 #define B2I(V) V==0?false:true
-#define SCAL XBLAS_SSCAL
 /* BLAS interfaces */
 #ifdef DOUBELPRICSION
 #define GEMM XBLAS_DGEMM
+#define GER XBLAS_DGER
+#define ASUM XBLAS_DASUM
+#define IAMAX XBLAS_IDAMAX
+#define IAMIN XBLAS_IDAMIN
+#define NRM2 XBLAS_DNRM2
+#define SCAL XBLAS_DSCAL
+#define COPY XBLAS_DCOPY
+#define AXPY XBLAS_DAXPY
 #else
 #define GEMM XBLAS_SGEMM
+#define GER XBLAS_SGER
+#define ASUM XBLAS_SASUM
+#define IAMAX XBLAS_ISAMAX
+#define IAMIN XBLAS_ISAMIN
+#define NRM2 XBLAS_SNRM2
+#define SCAL XBLAS_SSCAL
+#define COPY XBLAS_SCOPY
+#define AXPY XBLAS_SAXPY
 #endif
 extern void InitGlobalAll();
......
@@ -62,6 +62,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 /* we transform a higher order tensor to a matrix to kill the number
 of calls of matrix multiplication */
+printf("in MMUL\n");
 if(transposedA == X_NOTRANS && a->order > 2 && b->order == 2){
 int ncolA = a->dimSize[a->order - 1];
 int ncolC = c->dimSize[c->order - 1];
@@ -69,7 +70,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 XTensor * c2 = NewTensor2D(c->unitNum/ncolC, -ncolC, c->dataType, c->devID, c->mem);
 a2->data = a->data;
 c2->data = c->data;
+printf("before _MatMul\n");
 _MatrixMul2D(a2, transposedA, b, transposedB, c2, alpha, beta, parallelRunner);
+printf("after _MatMul\n");
 a2->data = NULL;
 c2->data = NULL;
 delete a2;
@@ -117,6 +120,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 bool isSparseMul = false;
+printf("before bianli\n");
 for (int p = 0; p < aBlockNum; p++) {
 void * ap = (char*)a->data + aRealBlockSize * p;
 for (int q = 0; q < bBlockNum; q++) {
@@ -143,6 +147,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 }
 }
 }
+printf("after bianli\n");
 if (isSparseMul) {
 for (int i = 0; i < aList->count; i++) {
@@ -174,9 +179,11 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 }
 else {
 CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+printf("before _MatMul\n");
 _MatrixMulBatchedCPU(aList, transposedA,
 bList, transposedB,
 cList, alpha, beta);
+printf("after _MatMul\n");
 }
 for (int i = 0; i < aList->count; i++) {
......
@@ -82,10 +82,11 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 b->dataType == DEFAULT_DTYPE &&
 c->dataType == DEFAULT_DTYPE)
 {
-if (useBLAS)
+#if defined(USE_BLAS)
 _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
-else
+#else
 _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
+#endif
 }
 else {
 // TODO!!
......
@@ -201,10 +201,7 @@ CheckNTErrors((a && b && c), "Empty input tensors!");
 bi->data = (char*)b->data + i * bRealBlockSize;
 ci->data = (char*)c->data + i * cRealBlockSize;
 #ifdef USE_BLAS
-if (useBLAS)
-_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
-else
-_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
+_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
 #else
 _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
 #endif
@@ -233,6 +230,7 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
 const XList * b, MATRIX_TRANS_TYPE transposedB,
 XList * c, DTYPE alpha, DTYPE beta)
 {
+printf("in _MMULBATCHED\n");
 CheckNTErrors(a && b && c, "Empty input lists!");
 CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
@@ -264,10 +262,7 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
 CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
 CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
 #ifdef USE_BLAS
-if (useBLAS)
-_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
-else
-_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
+_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
 #else
 _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
 #endif
......
@@ -76,7 +76,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 else {
 if (!a->isSparse && !b->isSparse) {
 CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
 if (a->dataType == DEFAULT_DTYPE &&
 b->dataType == DEFAULT_DTYPE &&
 c->dataType == DEFAULT_DTYPE)
@@ -84,12 +84,34 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 DTYPE * ap = (DTYPE*)a->data;
 DTYPE * bp = (DTYPE*)b->data;
 DTYPE * cp = (DTYPE*)c->data;
 // when c != a, OpenBLAS needs to copy a to c first. This operation
 // slow down the speed, so just use OpenBLAS when c == a
-if(useBLAS && c == a){
-cblas_saxpy(a->unitNum,1,bp,1,cp,1);
+#if defined(USE_BLAS)
+if( c == a){
+AXPY(a->unitNum,beta,bp,1,cp,1);
+} else{
+int num = a->unitNum;
+if (num % 4 == 0) {
+for (int i = 0; i < num; i += 4) {
+cp[i] = ap[i] + bp[i] * beta;
+cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
+cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
+cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
+}
+}
+else if (num % 2 == 0) {
+for (int i = 0; i < num; i += 2) {
+cp[i] = ap[i] + bp[i] * beta;
+cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
+}
+}
+else {
+for (int i = 0; i < num; i++) {
+cp[i] = ap[i] + bp[i] * beta;
+}
+}
 }
-else{
+#else
 /* unrolling */
 int num = a->unitNum;
 if (num % 4 == 0) {
@@ -111,8 +133,8 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 cp[i] = ap[i] + bp[i] * beta;
 }
 }
+#endif
 }
-}
 else {
 // TODO!!
 ShowNTErrors("TODO!");
@@ -124,7 +146,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 }
 }
 }
 /*
 tensor summation a = a + b * \beta (do it on site)
 keep the result in the tensor a and return nothing
@@ -138,7 +160,7 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
 _Sum(a, b, a, beta);
 }
 /*
 return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
 >> a - a tensor
 >> b - another tensor for sum
@@ -166,7 +188,7 @@ int GetSumDimIndex(const XTensor &a, const XTensor &b)
 else
 return -1;
 }
 /*
 tensor summation c = a + b * \beta (return an XTensor structure)
 make a new tensor c to keep the result and return it
@@ -186,7 +208,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
 if(n == -1){
 /* call _Sum function */
 _Sum(&a, &b, &c, beta);
 /* tensor connections */
 XLink::MakeLink(&a, &b, &c, MATH_SUM);
 XLink::AddParamToHead(&c, beta);
@@ -194,7 +216,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
 else if(n >= 0 && n < a.order){
 /* call _SumDim function */
 _SumDim(&a, &b, &c, n, beta);
 /* tensor connections */
 XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
 XLink::AddParamToHeadInt(&c, n);
@@ -203,7 +225,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
 else{
 ShowNTErrors("Something is wrong!");
 }
 return c;
 }
......
@@ -54,7 +54,6 @@ void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 int bm = b->dimSize[1];
 int cn = c->dimSize[0];
 int cm = c->dimSize[1];
-printf("4\n");
 if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
 GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, cn, cm, am, alpha, (DTYPE*)a->data, am, (DTYPE*)b->data, bm, beta, (DTYPE*)c->data, cm);
 else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
......
 /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,7 +30,7 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 scale and shift all tensor entires
 b = a * scale + shift
@@ -71,8 +71,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
 else{
 DTYPE * va = (DTYPE*)a->data;
 DTYPE * vb = (DTYPE*)b->data;
-if(shift == 0 && useBLAS && a==b){
-cblas_sscal(b->unitNum, scale, vb, 1);
+#if defined(USE_BLAS)
+if(shift == 0 && a==b){
+SCAL(b->unitNum, scale, vb, 1);
 } else{
 for(int i = 0; i < b->unitNum; i++){
 *vb = *va * scale + shift;
@@ -80,10 +81,17 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
 vb++;
 }
 }
+#else
+for(int i = 0; i < b->unitNum; i++){
+*vb = *va * scale + shift;
+va++;
+vb++;
+}
+#endif
 }
 }
 /*
 scale and shift all tensor entires (do it on site)
 keep the result in the input tensor a and return nothing
@@ -98,7 +106,7 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
 _ScaleAndShift(a, a, scale, shift);
 }
 /*
 scale and shift all tensor entires (return an XTensor structure)
 make a new tensor to keep the result and return it
@@ -113,15 +121,15 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
 {
 XTensor b(&a);
 b.SetTMPFlag();
 /* call _ScaleAndShift function */
 _ScaleAndShift(&a, &b, scale, shift);
 /* tensor connections */
 XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
 XLink::AddParamToHead(&b, scale);
 XLink::AddParamToHead(&b, shift);
 return b;
 }
......
 /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -28,7 +28,7 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 get the max value of the items along a dimension of the tensor
 >> input - the input tensor
@@ -37,23 +37,23 @@ get the max value of the items along a dimension of the tensor
 */
 void _ReduceMax(const XTensor * input, XTensor * output, int dim)
 {
 CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
 "This code must be run on the same device!");
 CheckNTErrors((input && output), "Empty input or output tensors!");
 CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
 CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
 CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
 int dimRDI = input->order - dim - 1;
 CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
 for(int i = 0; i < input->order; i++){
 if(i < dimRDI){
 CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
 "Unmatched tensors!");
 }
 else if(i > dimRDI){
 CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
 "Unmatched tensors!");
 }
 }
@@ -82,9 +82,9 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
 DTYPE * ip = (DTYPE*)input->data + blockSize * k;
 DTYPE * op = (DTYPE*)output->data + stride * k;
 for(int i = 0; i < stride; i++){
-if(useBLAS){
-*(op + i) = cblas_isamax(strideNum, ip + i, stride);
-} else{
+//#if defined(USE_BLAS)
+// *(op + i) = *(ip + i + (int)(stride * IAMAX(strideNum, ip + i, stride)));
+//#else
 DTYPE max = FLOAT_MIN;
 DTYPE * ipe = ip + blockSize;
 for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
@@ -93,13 +93,13 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
 max = v;
 }
 *(op + i) = max;
-}
+//#endif
 }
 }
 }
 }
 /*
 get the max value of the items along a dimension of the tensor (return an XTensor structure).
 make a new tensor to keep the result and return it
@@ -110,7 +110,7 @@ make a new tensor to keep the result and return it
 XTensor ReduceMax(const XTensor &input, int dim)
 {
 CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
 int order = input.order - 1;
 int * dimSize = new int[order];
 for(int i = 0; i < order; i++){
@@ -126,7 +126,7 @@ XTensor ReduceMax(const XTensor &input, int dim)
 /* call _ReduceMax function */
 _ReduceMax(&input, &output, dim);
 /* tensor connection */
 XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
 XLink::AddParamToHeadInt(&output, dim);
......
 /* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,10 +25,11 @@
 #include "../../XName.h"
 #include "../../XBLAS.h"
 #include "../arithmetic/XTensorBLAS.h"
+#include <iostream>
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 sum the items along a dimension of the tensor
 For a 1-dimensional data array a,
@@ -44,7 +45,7 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
 */
 void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
 {
 CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
 "This code must be run on the same device!");
 CheckNTErrors((input && output), "Empty input or output tensors!");
 CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
@@ -145,22 +146,23 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
 else{
 if(bias == 0){
 if(power == (DTYPE)1.0){
-if(useBLAS)
-sum = cblas_sasum(strideNum, ip + i, stride);
-else
+//#if defined(USE_BLAS)
+// sum = ASUM(strideNum, ip + i, stride);
+//#else
 for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
 sum += *ipb;
+//#endif
 }
 else if(power == (DTYPE)2.0){
-if(useBLAS){
-sum = cblas_snrm2(strideNum, ip + i, stride);
-sum = sum * sum;
-} else{
+//#if defined(USE_BLAS)
+// sum = NRM2(strideNum, ip + i, stride);
+// sum = sum * sum;
+//#else
 for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
 DTYPE value = (*ipb);
 sum += value * value;
 }
-}
+//#endif
 }
 else if(power == (DTYPE)0.5){
 for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
@@ -177,11 +179,12 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
 }
 else{
 if(power == (DTYPE)1.0){
-if(useBLAS)
-sum = cblas_sasum(strideNum, ip + i, stride);
-else
+//#if defined(USE_BLAS)
+// sum = ASUM(strideNum, ip + i, stride);
+//#else
 for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
 sum += *ipb;
+//#endif
 sum -= strideNum * bias;
 }
 else if(power == (DTYPE)2.0){
@@ -210,7 +213,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
 }
 }
 /*
 sum the items along a dimension of the tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
@@ -228,7 +231,7 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
 XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
 {
 CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
 int order = input.order - 1;
 int * dimSize = new int[order];
 for(int i = 0; i < order; i++){
@@ -244,7 +247,7 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
 /* call _ReduceSum function */
 _ReduceSum(&input, &output, dim, &shift, power, isExp);
 /* tensor connection */
 XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
 XLink::AddParamToHeadInt(&output, dim);
@@ -257,7 +260,7 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
 return output;
 }
 /*
 sum the items along a dimension of the tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
@@ -274,7 +277,7 @@ sum = \sum_i exp((a_i)^power) if isExp == true
 XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
 {
 CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
 int order = input.order - 1;
 int * dimSize = new int[order];
 for(int i = 0; i < order; i++){
@@ -290,7 +293,7 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
 /* call _ReduceSum function */
 _ReduceSum(&input, &output, dim, NULL, power, isExp);
 /* tensor connection */
 XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
 XLink::AddParamToHeadInt(&output, dim);
......