Commit 218fdfd8 by Tianzhi

finish reduce max

parent 569cb2dd
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XBLAS.h"
#include "./VectorBuffer.h"
#include "../arithmetic/XTensorBLAS.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
......@@ -78,14 +79,73 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
}
blockSize = stride * strideNum;
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
int vecBufLength = 32 / sizeof(DTYPE);
if(dimRDI == 0){
//data is contiguous in dim 0
for(int i = 0; i < blockNum; i++){
// stride = 1
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + i;
VectorBuffer vecBuf[4];
for(int j = 0; j < 4; j++){
// std::cout << isExp << " " << power << " " << bias[0] << std::endl;
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength);
}
for(int j = 1; j < strideNum / 32; j++){
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
vecBuf[0] = vecBuf[0].max(VectorBuffer::loadu(ptr + 0 * vecBufLength));
vecBuf[1] = vecBuf[1].max(VectorBuffer::loadu(ptr + 1 * vecBufLength));
vecBuf[2] = vecBuf[2].max(VectorBuffer::loadu(ptr + 2 * vecBufLength));
vecBuf[3] = vecBuf[3].max(VectorBuffer::loadu(ptr + 3 * vecBufLength));
}
vecBuf[0] = vecBuf[0].max(vecBuf[1]);
vecBuf[0] = vecBuf[0].max(vecBuf[2]);
vecBuf[0] = vecBuf[0].max(vecBuf[3]);
DTYPE maxN = DTYPE_MIN;
for(int k = 0; k < vecBufLength; k++){
maxN = std::max(maxN,vecBuf[0][k]);
}
*op = maxN;
}
} else{
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
VectorBuffer vecBuf[4];
for(int k = 0; k < 4; k++){
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE));
}
for(int k = 1; k < strideNum; k++){
DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
vecBuf[0] = vecBuf[0].max(VectorBuffer::loadu(ptr + 0 * vecBufLength));
vecBuf[1] = vecBuf[1].max(VectorBuffer::loadu(ptr + 1 * vecBufLength));
vecBuf[2] = vecBuf[2].max(VectorBuffer::loadu(ptr + 2 * vecBufLength));
vecBuf[3] = vecBuf[3].max(VectorBuffer::loadu(ptr + 3 * vecBufLength));
}
for(int k = 0; k < 4; k++){
for(int l = 0; l < vecBufLength; l++)
*(op + j * 32 + 8 * k + l) = vecBuf[k][l];
}
}
}
}
}//run vector buffer
else{
for(int k = 0; k < blockNum; k++){
DTYPE * ip = (DTYPE*)input->data + blockSize * k;
DTYPE * op = (DTYPE*)output->data + stride * k;
for(int i = 0; i < stride; i++){
//#if defined(USE_BLAS)
// *(op + i) = *(ip + i + (int)(stride * IAMAX(strideNum, ip + i, stride)));
//#else
DTYPE max = FLOAT_MIN;
//#if defined(USE_BLAS)
// *(op + i) = *(ip + i + (int)(stride * IAMAX(strideNum, ip + i, stride)));
//#else
DTYPE max = DTYPE_MIN;
DTYPE * ipe = ip + blockSize;
for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
DTYPE v = *ipb;
......@@ -93,7 +153,8 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
max = v;
}
*(op + i) = max;
//#endif
//#endif
}
}
}
}
......
#include <cstring>
#include <cmath>
#include <algorithm>
#include "../../XGlobal.h"
......@@ -20,7 +21,7 @@ class VectorBuffer{
int count = 32 / sizeof(DTYPE);
VectorBuffer vec;
if(isExp){
if(bias == 0){
if(bias == NULL){
if(power == (DTYPE)1.0){
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)std::exp(*(ptr + i));
......@@ -38,7 +39,7 @@ class VectorBuffer{
vec.values[i] = (DTYPE)std::exp(std::pow(*(ptr + i), power));
}
}
}//is bias == 0
}//is bias == NULL
else{
if(power == (DTYPE)1.0){
for (int i = 0; i != count; i++) {
......@@ -61,7 +62,7 @@ class VectorBuffer{
}
}//isExp
else{
if(bias == 0){
if(bias == NULL){
if(power == (DTYPE)1.0){
std::memcpy(vec.values, ptr, count * sizeof(DTYPE));
} else if(power == (DTYPE)2.0){
......@@ -77,7 +78,7 @@ class VectorBuffer{
vec.values[i] = (DTYPE)std::pow(*(ptr + i), power);
}
}
}// if bias == 0
}// if bias == NULL
else{
if(power == (DTYPE)1.0){
for (int i = 0; i != count; i++) {
......@@ -104,10 +105,22 @@ class VectorBuffer{
// Read-only element access: returns a const reference to the idx-th stored value.
// The index is used as-is; no bounds check is performed here.
const DTYPE& operator[](int idx) const {
    const DTYPE &element = values[idx];
    return element;
}
VectorBuffer operator+(const VectorBuffer &a) {
// Element-wise addition. Note that the sum is accumulated into this buffer
// (the left operand is modified), and a copy of the updated buffer is returned.
inline VectorBuffer operator+(const VectorBuffer &a) {
    const auto lanes = a.size();
    int lane = 0;
    while (lane != lanes) {
        values[lane] = a[lane] + values[lane];
        ++lane;
    }
    return *this;
}
// Element-wise maximum of this buffer and `a`. The result overwrites this
// buffer's values, and a copy of the updated buffer is returned.
inline VectorBuffer max(const VectorBuffer &a) {
    const auto lanes = a.size();
    int lane = 0;
    while (lane != lanes) {
        auto &slot = values[lane];
        slot = std::max(a[lane], slot);
        ++lane;
    }
    return *this;
}
// Element-wise minimum of this buffer and `a`. The result overwrites this
// buffer's values, and a copy of the updated buffer is returned.
inline VectorBuffer min(const VectorBuffer &a) {
    const auto lanes = a.size();
    int lane = 0;
    while (lane != lanes) {
        auto &slot = values[lane];
        slot = std::min(a[lane], slot);
        ++lane;
    }
    return *this;
}
};
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论