玫瑰幻想 发表于 2013-11-21 12:13
楼主还编译成功,得到ptx?
那无可能的。
我确实没有说谎。为了验证你的结论,我又重新编译了一次(不写 -arch,或者加上 -arch compute_10),确实都生成了 ptx 文件。
cu 文件里用到了下面三个函数:
/**
 * Sum of absolute differences of four packed unsigned bytes, plus an accumulator.
 *
 * a and b are each treated as four unsigned 8-bit lanes. The result is
 *   previous_sum + |a0-b0| + |a1-b1| + |a2-b2| + |a3-b3|.
 *
 * @param a             first packed operand (4 x u8)
 * @param b             second packed operand (4 x u8)
 * @param previous_sum  running total the four differences are added to
 * @return              the updated total
 *
 * The PTX instruction vabsdiff4 requires sm_30 or higher, so it is only
 * emitted when compiling device code for such a target. Older device targets
 * and the host use an equivalent per-byte loop.
 */
__host__ __device__ int absdiff4(unsigned int a, unsigned int b, int previous_sum)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
    int sum;
    asm ("vabsdiff4.s32.u32.u32.add %0, %1, %2, %3;"
         : "=r"(sum)
         : "r"(a), "r"(b), "r"(previous_sum));
    return sum;
#else
    // Accumulate in unsigned arithmetic so that wrap-around matches the
    // non-saturating hardware add instead of being signed-overflow UB.
    unsigned int total = (unsigned int)previous_sum;
    for (int shift = 0; shift < 32; shift += 8) {
        const unsigned int x = (a >> shift) & 0xffu;
        const unsigned int y = (b >> shift) & 0xffu;
        total += (x > y) ? (x - y) : (y - x);
    }
    return (int)total;
#endif
}
/**
 * Accumulate the byte-wise absolute difference between a and the rounded
 * byte-wise average of b and c.
 *
 * All three operands are treated as four unsigned 8-bit lanes. For each lane i:
 *   m_i  = (b_i + c_i + 1) >> 1
 *   sum += |a_i - m_i|
 *
 * @param a    packed reference bytes (4 x u8)
 * @param b    first packed operand of the average (passed by value; the
 *             caller's copy is not modified)
 * @param c    second packed operand of the average
 * @param sum  running total, updated in place
 *
 * The PTX instructions vavrg4 and vabsdiff4 require sm_30 or higher, so they
 * are only emitted when compiling device code for such a target. Older device
 * targets and the host use an equivalent per-byte loop.
 */
__host__ __device__ void abs_avg(unsigned int a, unsigned int b, unsigned int c, int &sum)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
    // b <- per-byte rounded average of b and c. With the default byte mask every
    // lane is overwritten, so the merge operand (last %0) has no effect.
    asm volatile ("vavrg4.u32.u32.u32 %0, %0, %1, %0;" : "+r"(b) : "r"(c));
    // sum <- sum + sum over lanes of |a_i - b_i|
    asm volatile ("vabsdiff4.s32.u32.u32.add %0, %1, %2, %0;" : "+r"(sum) : "r"(a), "r"(b));
#else
    // Unsigned accumulation mirrors the wrap-around of the hardware add.
    unsigned int total = (unsigned int)sum;
    for (int shift = 0; shift < 32; shift += 8) {
        const unsigned int mean = (((b >> shift) & 0xffu) + ((c >> shift) & 0xffu) + 1u) >> 1;
        const unsigned int ref  = (a >> shift) & 0xffu;
        total += (mean > ref) ? (mean - ref) : (ref - mean);
    }
    sum = (int)total;
#endif
}
/**
 * Replace b with the byte-wise rounded average of b and c.
 *
 * Both operands are treated as four unsigned 8-bit lanes. For each lane i:
 *   b_i = (b_i + c_i + 1) >> 1
 *
 * @param b  first packed operand; receives the packed result
 * @param c  second packed operand
 *
 * The PTX instruction vavrg4 requires sm_30 or higher, so it is only emitted
 * when compiling device code for such a target. Older device targets and the
 * host use an equivalent per-byte loop.
 */
__host__ __device__ void avg(unsigned int &b, unsigned int c)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
    // With the default byte mask every lane is overwritten, so the merge
    // operand (last %0) has no effect on the result.
    asm volatile ("vavrg4.u32.u32.u32 %0, %0, %1, %0;" : "+r"(b) : "r"(c));
#else
    unsigned int packed = 0u;
    for (int shift = 0; shift < 32; shift += 8) {
        // At most (255 + 255 + 1) >> 1 == 255, so a lane never spills into its neighbour.
        const unsigned int lane = (((b >> shift) & 0xffu) + ((c >> shift) & 0xffu) + 1u) >> 1;
        packed |= lane << shift;
    }
    b = packed;
#endif
}