Commit ee86e91d by 张裕浩

Update the inline PTX shuffle instructions to the `shfl.sync` form (with an explicit 0xffffffff member mask), according to the CUDA 10 documentation.

parent 44683b39
...@@ -41,19 +41,19 @@ float shflDownReduceMax(float input) ...@@ -41,19 +41,19 @@ float shflDownReduceMax(float input)
"{" "{"
".reg .f32 r0;" ".reg .f32 r0;"
".reg .pred p;" ".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;" "shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"setp.lt.f32 p,%1,r0;" "setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;" "@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;" "shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"setp.lt.f32 p,%1,r0;" "setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;" "@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;" "shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"setp.lt.f32 p,%1,r0;" "setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;" "@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;" "shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"setp.lt.f32 p,%1,r0;" "setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;" "@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;" "shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"setp.lt.f32 p, %1, r0; " "setp.lt.f32 p, %1, r0; "
"@p mov.f32 %1,r0;" "@p mov.f32 %1,r0;"
"mov.f32 %0,%1;" "mov.f32 %0,%1;"
...@@ -73,19 +73,19 @@ int shflDownReduceMax(int input) ...@@ -73,19 +73,19 @@ int shflDownReduceMax(int input)
"{" "{"
".reg .s32 r0;" ".reg .s32 r0;"
".reg .pred p;" ".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;" "shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"setp.lt.s32 p,%1,r0;" "setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;" "@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;" "shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"setp.lt.s32 p,%1,r0;" "setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;" "@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;" "shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"setp.lt.s32 p,%1,r0;" "setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;" "@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;" "shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"setp.lt.s32 p,%1,r0;" "setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;" "@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;" "shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"setp.lt.s32 p, %1, r0; " "setp.lt.s32 p, %1, r0; "
"@p mov.s32 %1,r0;" "@p mov.s32 %1,r0;"
"mov.s32 %0,%1;" "mov.s32 %0,%1;"
......
...@@ -37,15 +37,15 @@ float shflDownReduceSum(float input) ...@@ -37,15 +37,15 @@ float shflDownReduceSum(float input)
asm volatile( asm volatile(
"{" "{"
".reg .f32 r0;" ".reg .f32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;" "shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"add.f32 %1, r0, %1;" "add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;" "shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"add.f32 %1, r0, %1;" "add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;" "shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"add.f32 %1, r0, %1;" "add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;" "shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"add.f32 %1, r0, %1;" "add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;" "shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"add.f32 %0, r0, %1;" "add.f32 %0, r0, %1;"
"}" "}"
: "=f"(output) : "f"(input)); : "=f"(output) : "f"(input));
...@@ -62,15 +62,15 @@ int shflDownReduceSum(int input) ...@@ -62,15 +62,15 @@ int shflDownReduceSum(int input)
asm volatile( asm volatile(
"{" "{"
".reg .s32 r0;" ".reg .s32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;" "shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
"add.s32 %1, r0, %1;" "add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;" "shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"add.s32 %1, r0, %1;" "add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;" "shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"add.s32 %1, r0, %1;" "add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;" "shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"add.s32 %1, r0, %1;" "add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;" "shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"add.s32 %0, r0, %1;" "add.s32 %0, r0, %1;"
"}" "}"
: "=r"(output) : "r"(input)); : "=r"(output) : "r"(input));
......
...@@ -171,7 +171,7 @@ float broadcast(float input) ...@@ -171,7 +171,7 @@ float broadcast(float input)
float output; float output;
asm( asm(
"{" "{"
"shfl.idx.b32 %0,%1,0x0,0x1f;" "shfl.sync.idx.b32 %0,%1,0x0,0x1f,0xffffffff;"
"}" "}"
:"=f"(output) : "f"(input) :"=f"(output) : "f"(input)
); );
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论