Commit ee86e91d by 张裕浩

Update PTX shuffle instructions to the `shfl.sync` form with an explicit 0xffffffff member mask, per the CUDA 10 documentation.

parent 44683b39
......@@ -41,19 +41,19 @@ float shflDownReduceMax(float input)
"{"
".reg .f32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"setp.lt.f32 p, %1, r0; "
"@p mov.f32 %1,r0;"
"mov.f32 %0,%1;"
......@@ -73,19 +73,19 @@ int shflDownReduceMax(int input)
"{"
".reg .s32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"setp.lt.s32 p, %1, r0; "
"@p mov.s32 %1,r0;"
"mov.s32 %0,%1;"
......
......@@ -37,15 +37,15 @@ float shflDownReduceSum(float input)
asm volatile(
"{"
".reg .f32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"add.f32 %0, r0, %1;"
"}"
: "=f"(output) : "f"(input));
......@@ -62,15 +62,15 @@ int shflDownReduceSum(int input)
asm volatile(
"{"
".reg .s32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"add.s32 %0, r0, %1;"
"}"
: "=r"(output) : "r"(input));
......
......@@ -171,7 +171,7 @@ float broadcast(float input)
float output;
asm(
"{"
"shfl.idx.b32 %0,%1,0x0,0x1f;"
"shfl.sync.idx.b32 %0,%1,0x0,0x1f,0xffffffff;"
"}"
:"=f"(output) : "f"(input)
);
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论