/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University. 
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/

#include "Loss.h"
#include "Loss.cuh"
#include "../XDevice.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/math/Unary.h"
#include "../core/math/Binary.h"
#include "../core/arithmetic/Sum.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/movement/CopyValues.h"
#include "../core/shape/IsSameShaped.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
loss function to measure the "number" of errors
*/

/* 
compute the loss 
>> gold - gold standard
>> y - model prediction
>> LFName - name of loss function
>> isLogOutput - is the output in log scale?
>> leadDim - the leading dimension for the output
>> gBeg - where to start in the gold standard (along the leading dimension)
>> gLen - segment length from gBeg (along the leading dimension)
>> yBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
                      bool isLogOutput, int leadDim, int gBeg, int gLen, int yBeg)
{
    CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
    CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
    CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && y->dimSize[y->order - 1] == 1), "TODO!");
    CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
    CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
    CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
    CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!");
    CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");

    if(isLogOutput)
        return _LossComputeForLogScale(gold, y, LFName, leadDim, gBeg, gLen, yBeg);

    DTYPE error = 0.0F;

    /* 
    squared error 
    loss = sum_{i} 0.5*(gold_i - output_i)^2
    where gold_i is the gold standard and output_i is the model prediction
    */
    if(LFName == SQUAREDERROR){
        XTensor * diff = NewTensorV2(gold->order, gold->dimSize, gold->dataType, gold->denseRatio, gold->devID, gold->mem);
        _Sum(gold, y, diff, -1.0F);
        _PowerMe(diff, 2.0F);
        _ScaleAndShiftMe(diff, 0.5F, 0.0F);

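        /* sum the loss tensor along one dimension at a time until a 1 * 1
           tensor remains, then read the total error off that scalar */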
        int reduceTimes = diff->order;
        for (int i = 0; i < reduceTimes; i++) {
            int diffOrder = diff->order - 1;
            int * diffDimSize = new int[diffOrder];
            memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
            XTensor * diffNew = NewTensorV2(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
            int reducePlace = diff->dimSize[0] == 1 ? 1 : 0;
            _ReduceSum(diff, diffNew, reducePlace);

            /* keep the result 2-dimensional so that Get2D(0, 0) works below */
            if (diffNew->order == 1) {
                diffNew->order = 2;
                diffNew->dimSize[1] = diffNew->dimSize[0];
                diffNew->dimSize[0] = 1;
            }
            delete diff;
            diff = diffNew;
            delete[] diffDimSize;
        }
        error = diff->Get2D(0, 0);
        delete diff;
    }

    /* 
    cross entropy
    loss = sum_{i} (-gold_i * log(output_i))
    where gold and output are distributions 
    */
    if(LFName == CROSSENTROPY){
        XTensor * diff = NewTensorV2(y->order, y->dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
        _CopyValues(y, diff);
        _LogMe(diff);
        _Multiply(gold, diff, diff);
        _NegateMe(diff);

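        /* the same reduction-to-scalar scheme as for the squared error */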
        int reduceTimes = diff->order;
        for (int i = 0; i < reduceTimes; i++) {
            int diffOrder = diff->order - 1;
            int * diffDimSize = new int[diffOrder];
            memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
            XTensor * diffNew = NewTensorV2(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
            int reducePlace = diff->dimSize[0] == 1 ? 1 : 0;
            _ReduceSum(diff, diffNew, reducePlace);
            if (diffNew->order == 1) {
                diffNew->order = 2;
                diffNew->dimSize[1] = diffNew->dimSize[0];
                diffNew->dimSize[0] = 1;
            }
            delete diff;
            diff = diffNew;
            delete[] diffDimSize;
        }
        error = diff->Get2D(0, 0);
        delete diff;
    }
    
    /*
    one hot error
    loss = sum_{i} e_i 
    where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, 
          e_i = 0 otherwise
    */
    if(LFName == ONEHOTERROR){
        XTensor * diff = NewTensorV2(gold->order, gold->dimSize, gold->dataType, gold->denseRatio, gold->devID, gold->mem);
        XTensor * yOnehot = NewTensorV2(y->order, y->dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
        _CopyValues(y, yOnehot);

        /* keep the model output only where the gold standard is 1 */
        _Multiply(gold, y, yOnehot);
        _Sum(gold, yOnehot, diff, -1.0F);
        _PowerMe(diff, 2.0F);
        _ScaleAndShiftMe(diff, 0.5F, 0.0F);

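        /* reduce to a 1 * 1 tensor as in the squared error branch */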
        int reduceTimes = diff->order;
        for (int i = 0; i < reduceTimes; i++) {
            int diffOrder = diff->order - 1;
            int * diffDimSize = new int[diffOrder];
            memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
            XTensor * diffNew = NewTensorV2(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
            int reducePlace = diff->dimSize[0] == 1 ? 1 : 0;
            _ReduceSum(diff, diffNew, reducePlace);
            if (diffNew->order == 1) {
                diffNew->order = 2;
                diffNew->dimSize[1] = diffNew->dimSize[0];
                diffNew->dimSize[0] = 1;
            }
            delete diff;
            diff = diffNew;
            delete[] diffDimSize;
        }
        error = diff->Get2D(0, 0);
        delete diff;
        delete yOnehot;
    }
    return error;

    // TODO: call cuda kernels for computing the errors
}
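
/*
a minimal usage sketch (hypothetical shapes, device 0 and no memory pool;
not part of the library):

    int dims[2] = {32, 1};
    XTensor * gold = NewTensorV2(2, dims, X_FLOAT, 1.0F, 0, NULL);
    XTensor * y = NewTensorV2(2, dims, X_FLOAT, 1.0F, 0, NULL);
    ... fill gold and y with distributions ...
    DTYPE e = _CudaLossCompute(gold, y, SQUAREDERROR, false, 0, 0, 32, 0);
*/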

/* 
the log version of loss computation

>> gold - gold standard
>> y - model prediction
>> LFName - name of loss function
>> leadDim - the leading dimension for the output
>> gBeg - where to start in the gold standard (along the leading dimension)
>> gLen - segment length from gBeg (along the leading dimension)
>> yBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE _CudaLossComputeForLogScale(XTensor * gold, XTensor * y, 
                                 LOSS_FUNCTION_NAME LFName,
                                 int leadDim, int gBeg, int gLen, int yBeg)
{
    return 0;

    // TODO: call cuda kernels for computing the errors
}

/* 
backward computation for a single element (Cuda version)
dE/dy
where E is the error (loss) function that measures the error in y
with respect to the gold standard, and y is the model output
>> t - gold standard
>> y - model output
>> LFName - name of loss function
<< return dE/dy
*/
DTYPE _CudaLossBackward(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
{
    return _LossBackwardPoint(t, y, LFName);
   
    // TODO: call cuda kernels for computing the errors
}

/* 
backward computation for squared error (Cuda kernel)
>> dedy - dE/dy (for return)
>> t - gold standard (in vector)
>> y - model output (in vector)
>> size - size of the vector (dedy)
*/
__global__ 
void KernelLossBackwardSquaredError(DTYPE * dedy, DTYPE * t, DTYPE * y, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size){
        dedy[i] = y[i] - t[i];
    }
}

/* 
backward computation of blocks for squared error (Cuda kernel)
>> dedy - dE/dy (for return)
>> t - gold standard (in vector)
>> y - model output (in vector)
>> blockSize - size of a block
>> begInBlock - the beginning position in a block for computation 
>> lenInBlock - number of items in a block for computation 
>> size - size of the vector (dedy)
*/
__global__ 
void KernelLossBackwardSquaredErrorBlock(DTYPE * dedy, DTYPE * t, DTYPE * y, 
                                         int blockSize, int begInBlock, int lenInBlock, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    int offset = i % blockSize;

    if(offset < begInBlock || offset >= begInBlock + lenInBlock)
        return;

    if (i < size){
        dedy[i] = y[i] - t[i];
    }
}

/* 
backward computation for cross entropy (Cuda kernel)
>> dedy - dE/dy (for return)
>> t - gold standard (in vector)
>> y - model output (in vector)
>> tBeg - where to start in the gold standard (along the leading dimension)
>> tLen - segment length from tBeg (along the leading dimension)
>> yBeg - where to start in the model output (along the leading dimension)
>> blockNum - number of blocks in the tensor
>> stride - number of elements between two adjacent items along the leading dimension
>> dimensionSize - size of the leading dimension
*/
__global__ 
void KernelLossBackwardCrossEntropy(DTYPE * dedy, DTYPE * t, DTYPE * y, 
                                    int tBeg, int tLen, int yBeg, 
                                    int blockNum, int stride, int dimensionSize)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= stride * dimensionSize * blockNum) 
        return;

    /* decompose the flat thread index into (block, position along the
       leading dimension, offset within the stride) coordinates */
    int blockNumIndex = i / (stride * dimensionSize);
    int blockNumTail = i % (stride * dimensionSize);
    int dimensionSizeIndex = blockNumTail / stride;
    int strideIndex = blockNumTail % stride;

    /* only a segment of tLen items along the leading dimension is computed */
    if (dimensionSizeIndex >= tLen)
        return;

    /* dloss/dy_i = -t_i / y_i, with tBeg and yBeg shifting the segments
       in the gold standard and the model output respectively */
    int base = blockNumIndex * stride * dimensionSize + strideIndex;
    dedy[base + stride * (yBeg + dimensionSizeIndex)] = 
        -t[base + stride * (tBeg + dimensionSizeIndex)] / y[base + stride * (yBeg + dimensionSizeIndex)];
}

/* 
backward computation of blocks for cross entropy (Cuda kernel)
>> dedy - dE/dy (for return)
>> t - gold standard (in vector)
>> y - model output (in vector)
>> blockSize - size of a block
>> begInBlock - the beginning position in a block for computation 
>> lenInBlock - number of items in a block for computation 
>> size - size of the vector (dedy)
*/
__global__ 
void KernelLossBackwardCrossEntropyBlock(DTYPE * dedy, DTYPE * t, DTYPE * y, 
                                         int blockSize, int begInBlock, int lenInBlock, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    int offset = i % blockSize;

    if(offset < begInBlock || offset >= begInBlock + lenInBlock)
        return;

    if (i < size){
        dedy[i] =  -t[i]/y[i];
    }
}

/* 
backward computation for (dense) vectors (Cuda version)
dE/dy
where E is the error (loss) function that measures the error in y
with respect to the gold standard, and y is the model output
>> dedy - dE/dy (for return)
>> t - gold standard (in vector)
>> y - model output (in vector)
>> LFName - name of loss function
>> leadDim - the leading dimension for the output
>> tBeg - where to start in the gold standard (along the leading dimension)
>> tLen - segment length from tBeg (along the leading dimension)
>> yBeg - where to start in the model output (along the leading dimension)
*/
void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y, 
                      LOSS_FUNCTION_NAME LFName, 
                      int leadDim, int tBeg, int tLen, int yBeg)
{
    CheckNTErrors((tLen <= y->unitNum), "Illegal input length!");
    CheckNTErrors((_IsSameShaped(t, y) && _IsSameShaped(dedy, y)), 
                  "The input tensors must be of the same size!");
    CheckNTErrors(((dedy->devID == t->devID) && (dedy->devID == y->devID)), 
                  "Tensors must be on the same device!");
    CheckNTErrors((t->order > leadDim), "Illegal leading dimension!");
    CheckNTErrors((t->dataType == DEFAULT_DTYPE && 
                   y->dataType == DEFAULT_DTYPE && 
                   dedy->dataType == DEFAULT_DTYPE),
                  "Input vectors are not in default type.");
    CheckNTErrors((dedy->devID >= 0 && t->devID >= 0 && y->devID >= 0),
                  "The backward computation must be performed on GPUs.");
    CheckNTErrors((dedy->devID == t->devID && dedy->devID == y->devID),
                  "The vectors must be on the same GPU.");
    CheckNTErrors((tBeg == yBeg), "TODO!");

    /* a negative leadDim means the loss is computed over the whole dimension 0 */
    if (leadDim < 0) {
        leadDim = 0;
        tBeg = 0;
        yBeg = 0;
        tLen = y->dimSize[leadDim];
    }

    int dimensionSize = y->dimSize[leadDim];
    int stride = 1;
    int blockSize = 1;
    int blockNum = 1;
    int size = 1;

    /* stride - number of elements between two adjacent items along the leading
       dimension; blockSize - number of elements in a slice headed by the
       leading dimension; blockNum - number of such slices in the tensor */
    for(int i = leadDim + 1; i < y->order; i++)
        stride *= y->dimSize[i];
    size = tLen * stride;
    blockSize = stride * dimensionSize;
    blockNum = y->unitNum / blockSize;

    int cudaGridSize[3], cudaBlockSize[3];

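    /* one CUDA thread is launched per element of y; GetCudaThread picks a
       grid/block configuration that covers y->unitNum threads */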
    GDevs.GetCudaThread(dedy->devID, y->unitNum, cudaGridSize, cudaBlockSize);

    dim3 blocks(cudaGridSize[0]);
    dim3 threads(cudaBlockSize[0]);

    DTYPE * tp = (DTYPE*)t->data;
    DTYPE * yp = (DTYPE*)y->data;
    DTYPE * dedyp = (DTYPE*)dedy->data;

    int devIDBackup;
    ProtectCudaDev(y->devID, devIDBackup);

    /* 
    squared error 
    loss = sum_{i} 0.5*(t_i - y_i)^2, where t_i is the gold standard and y_i is the model output
    dloss/dy_i = y_i - t_i
    */
    if(LFName == SQUAREDERROR){
        if(t->isSparse){
            ShowNTErrors("TODO!");
        }
        else if(size == y->unitNum){
            KernelLossBackwardSquaredError<<<blocks, threads>>>(dedyp, tp, yp, y->unitNum);
        }
        else{
            KernelLossBackwardSquaredErrorBlock<<<blocks, threads>>>(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
        }
    }

    /* 
    cross entropy
    loss = sum_{i} (-t_i * log(y_i)), where t and y are distributions 
    dloss/dy_i = -t_i / y_i
    */
    else if(LFName == CROSSENTROPY){
        if(t->isSparse){
            ShowNTErrors("TODO!");
        }
        else if(size == y->unitNum){
            KernelLossBackwardCrossEntropy<<<blocks, threads>>>(dedyp, tp, yp, tBeg, tLen, yBeg, blockNum, stride, dimensionSize);
        }
        else{
            KernelLossBackwardCrossEntropyBlock<<<blocks, threads>>>(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
        }
    }
    else{
        ShowNTErrors("TODO");
    }

    BacktoCudaDev(y->devID, devIDBackup);
}
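
/*
a minimal usage sketch (hypothetical shapes, device 0 and no memory pool;
not part of the library):

    ... t and y are (32, 1) distribution tensors on device 0 ...
    int dims[2] = {32, 1};
    XTensor * dedy = NewTensorV2(2, dims, X_FLOAT, 1.0F, 0, NULL);
    _CudaLossBackward(dedy, t, y, CROSSENTROPY, 0, 0, 32, 0);
*/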

#endif

} // namespace nts(NiuTrans.Tensor)