/* NiuTrans.Tensor - an open-source tensor library * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * * This is for streaming (on GPU), i.e., run jobs in different streams for * GPU Async capabilities. * * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09 * */ #ifndef __XSTREAM_H__ #define __XSTREAM_H__ /* the CUDA stuff */ #ifdef USE_CUDA #include <cuda_runtime.h> #include <cublas_v2.h> #include <cuda_fp16.h> #endif /* the nts (NiuTrans.Tensor) namespace */ namespace nts{ #define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128 /* This class defines the stream used in pipelining jobs. E.g., one can put a sequence of jobs in a stream and asynchronously do something else. Basically we can use multiple streams to hide the data transfer cost on GPUs by using job overlaps. 
*/
/*
XStream bundles one stream handle with the id of the device that holds it.
When USE_CUDA is defined the handle is a cudaStream_t and the class also keeps
an array of cudaEvent_t together with its capacity (maxEventNum) and the number
of entries in use (usedEventNum). Without USE_CUDA only a void* placeholder is
declared in place of the stream.
Only declarations live here; the member functions are defined elsewhere.
NOTE(review): the class holds raw handles/pointers and declares a destructor,
but no copy constructor or assignment operator. Whether copying an XStream is
safe depends on the implementation, which is not visible here - confirm.
*/
class XStream
{
public:
#ifdef USE_CUDA
    /* the cuda stream */
    cudaStream_t stream;

    /* list of cuda events used to synchronize different streams */
    cudaEvent_t * events;

    /* max number of the events (capacity of "events") */
    int maxEventNum;

    /* number of used events */
    int usedEventNum;
#else
    /* virtual pointer (placeholder for the stream when USE_CUDA is not defined) */
    void * stream;
#endif

    /* device that holds the stream */
    int devID;

public:
    /*
    constructor
    >> priority - defaults to 0; presumably the priority of the stream (confirm in the implementation)
    >> devID - defaults to 0; presumably the id of the device that holds the stream (confirm)
    >> maxEventNum - defaults to MAX_CUDA_EVENT_NUM_IN_A_STREAM; presumably the capacity
                     of the event list. Note that the matching data member only exists
                     when USE_CUDA is defined
    */
    XStream(int priority = 0, int devID = 0, int maxEventNum = MAX_CUDA_EVENT_NUM_IN_A_STREAM);

    /* destructor */
    ~XStream();

    /*
    create the stream
    >> priority - defaults to 0 (same parameter as in the constructor)
    >> devID - defaults to 0 (same parameter as in the constructor)
    */
    void Create(int priority = 0, int devID = 0);

    /* destroy the stream */
    void Destroy();

    /* clear it */
    void Clear();

    /*
    judge if all the jobs in the stream have been finished
    << return - presumably true when every job has finished (confirm in the implementation)
    */
    bool IsFinished();

    /* stream synchronize */
    void StreamSynchronize();

    /* thread synchronize (static: callable without an XStream instance) */
    static void ThreadSynchronize();

    /*
    device synchronize (static: callable without an XStream instance)
    >> devID - presumably the id of the device to synchronize (confirm)
    */
    static void DeviceSynchronize(int devID);

    /*
    make a dependency of two streams, i.e., the current stream must wait
    for the last job finished in another stream
    >> precedingStream - the other stream that the current one has to wait for
    */
    void MakeDependency(XStream * precedingStream);

#ifdef USE_CUDA
    /* get the stream (returned as a pointer to a cudaStream_t) */
    cudaStream_t * Get();

    /* make an event (returned as a pointer to a cudaEvent_t) */
    cudaEvent_t * MakeEvent();
#endif
};

} /* end of the nts (NiuTrans.Tensor) namespace */

#endif