MatrixMul.cpp 12.2 KB
Newer Older
xiaotong committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/

22 23 24
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
xiaotong committed
25 26 27
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
28
#include "MatrixMulBatched.h"
xiaotong committed
29 30 31 32

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
33
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
34 35 36 37 38 39 40 41 42 43

For the input tensors a and b, we perform matrix multiplication on the first two dimentsions. 
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n. 
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x), 
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A, 
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C. 
C should be a tensor of z * x * n * m. 
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.

xiaotong committed
44 45 46 47 48 49 50 51
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether teh matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
52
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
53 54
                const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
xiaotong committed
55
{
56 57
    CheckNTErrors(a && b && c, "Empty input tensors!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
58
                  "Input tensors should have the same data type!");
59
    CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
60
                  "Input tensors must have a order >= 2!");
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
    CheckNTErrors(c->order == a->order + b->order - 2, "wrong tensor order")
    
    /* we transform a higher order tensor to a matrix to kill the number
       of calls of matrix multiplication */
    if(transposedA == X_NOTRANS && a->order > 2 && b->order == 2){
        int ncolA = a->dimSize[a->order - 1];
        int ncolC = c->dimSize[c->order - 1];
        XTensor * a2 = NewTensor2D(a->unitNum/ncolA, -ncolA, a->dataType, a->devID, a->mem);
        XTensor * c2 = NewTensor2D(c->unitNum/ncolC, -ncolC, c->dataType, c->devID, c->mem);
        a2->data = a->data;
        c2->data = c->data;
        _MatrixMul2D(a2, transposedA, b, transposedB, c2, alpha, beta, parallelRunner);
        a2->data = NULL;
        c2->data = NULL;
        delete a2;
        delete c2;
        return;
    }
xiaotong committed
79

80 81 82 83 84 85
    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];
xiaotong committed
86

87
    CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
xiaotong committed
88 89 90 91 92 93 94 95 96 97 98 99

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
    int aRealBlockSize = aBlockSize * a->unitSize;
    int bRealBlockSize = bBlockSize * b->unitSize;
    int cRealBlockSize = cBlockSize * c->unitSize;
    int aBlockNum = 1;
    int bBlockNum = 1;
    int cBlockNum = 1;

    for (int i = 2; i < a->order; i++) {
100
        CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
xiaotong committed
101 102 103 104 105
        aBlockNum *= a->dimSizeRDI[i];
        cBlockNum *= a->dimSizeRDI[i];
    }

    for (int i = 2; i < b->order; i++) {
106
        CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
xiaotong committed
107 108 109 110 111 112 113
        bBlockNum *= b->dimSizeRDI[i];
        cBlockNum *= b->dimSizeRDI[i];
    }

    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);
xiaotong committed
114 115 116
    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
xiaotong committed
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151

    bool isSparseMul = false;

    for (int p = 0; p < aBlockNum; p++) {
        void * ap = (char*)a->data + aRealBlockSize * p;
        for (int q = 0; q < bBlockNum; q++) {
            void * bp = (char*)b->data + bRealBlockSize * q;
            void * cp = (char*)c->data + cRealBlockSize * (p * bBlockNum + q);

            CheckNTErrors((bRealBlockSize * q < b->unitNum * b->unitSize), "Something wrong!");
            CheckNTErrors((cRealBlockSize * (p * bBlockNum + q) < c->unitNum * c->unitSize), "Something wrong!");

            XTensor * ai = NewTensor(2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
            XTensor * bi = NewTensor(2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
            XTensor * ci = NewTensor(2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
            ai->data = ap;
            bi->data = bp;
            ci->data = cp;
            aList->Add(ai);
            bList->Add(bi);
            cList->Add(ci);
            if (a->isSparse || b->isSparse) {
                CheckNTErrors((a->order == 2 && b->order == 2), "TODO!");
                ai->unitNumNonZero = a->unitNumNonZero;
                bi->unitNumNonZero = b->unitNumNonZero;
                isSparseMul = true;
            }
        }
    }

    if (isSparseMul) {
        for (int i = 0; i < aList->count; i++) {
            XTensor * ai = (XTensor*)aList->GetItem(i);
            XTensor * bi = (XTensor*)bList->GetItem(i);
            XTensor * ci = (XTensor*)cList->GetItem(i);
152
            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta, parallelRunner);
xiaotong committed
153 154 155 156 157 158 159 160 161 162 163
        }
    }
    else if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
#ifdef USE_CUDA
        CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
                      "The code must be run on the same GPU!");
        
        int devIDBackup;
        ProtectCudaDev(a->devID, devIDBackup);

        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
164
        _CudaBLASMatrixMULList(handle,
165 166 167 168
                               aList, transposedA,
                               bList, transposedB,
                               cList, aList->count,
                               alpha, beta);
xiaotong committed
169 170 171 172 173 174 175 176

        BacktoCudaDev(a->devID, devIDBackup);
#else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
    }
    else {
        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
177 178 179
        _MatrixMulBatchedCPU(aList, transposedA,
                             bList, transposedB,
                             cList, alpha, beta);
xiaotong committed
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
    }

    for (int i = 0; i < aList->count; i++) {
        XTensor * ai = (XTensor*)aList->GetItem(i);
        ai->data = NULL;
        delete ai;
    }

    for (int i = 0; i < bList->count; i++) {
        XTensor * bi = (XTensor*)bList->GetItem(i);
        bi->data = NULL;
        delete bi;
    }

    for (int i = 0; i < cList->count; i++) {
        XTensor * ci = (XTensor*)cList->GetItem(i);
        ci->data = NULL;
        delete ci;
    }

    delete aList;
    delete bList;
    delete cList;
}
204 205

/*
206
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
make a new tensor to keep the result and return it

For the input tensors a and b, we perform matrix multiplication on the first two dimentsions. 
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n. 
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x), 
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A, 
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C. 
The result C should be a tensor of z * x * n * m. 
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether teh matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
226 227 228
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, 
                  const XTensor &b, MATRIX_TRANS_TYPE transposedB, 
                  DTYPE alpha, XPRunner * parallelRunner)
229
{
230 231
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
232 233 234 235 236 237 238 239 240 241 242

    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order + b.order - 2;
    int sub = 0;
    int * dimSize = new int[order];
243 244
    for (int i = 2; i < a.order; i++)
        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
245 246
    for (int i = 2; i < b.order; i++)
        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];    
247 248 249
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

250 251
    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
252
    c.SetTMPFlag();
253 254

    /* call _MatrixMul function */
255
    _MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
256 257 258 259 260 261

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301

    /* destroy variables */
    delete[] dimSize;

    return c;
}

/* 
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, const XTensor &b, 
                  DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

    int an = a.dimSizeRDI[1];
    int am = a.dimSizeRDI[0];
    int bn = b.dimSizeRDI[1];
    int bm = b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order + b.order - 2;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 2; i < a.order; i++)
        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
    for (int i = 2; i < b.order; i++)
        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];    
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
302
    c.SetTMPFlag();
303 304 305 306 307 308 309 310 311

    /* call _MatrixMul function */
    _MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHead(&c, alpha);
312 313

    /* destroy variables */
xiaotong committed
314
    delete[] dimSize;
315 316 317 318

    return c;
}

xiaotong committed
319
} // namespace nts(NiuTrans.Tensor)
320 321 322