It makes COMP_CRC64() 10~30% faster than the macro written in C on my Pentium III laptop.
But on my Pentium 4 box, the MMX code is 2~3 times slower than the plain C code.
What is happening inside the processor?
/*
 * COMP_CRC64_MMX(crc, data, len)
 *
 * Fold `len` bytes starting at `data` into the 64-bit CRC held in (crc).crc0,
 * one byte per loop iteration, using MMX registers for the 64-bit arithmetic:
 *
 *     crc0 = crc_table[((crc0 >> 56) ^ *data) & 0xff] ^ (crc0 << 8)
 *
 * Parameters:
 *   crc   lvalue with a 64-bit `crc0` member; evaluated twice (read at the
 *         start, written back at the end).
 *   data  pointer to the input bytes; evaluated once.
 *   len   number of bytes to process; evaluated once. Zero is a no-op.
 *
 * Names that must be visible at the point of use:
 *   uint64, uint32      integer typedefs.
 *   crc_table           lookup table indexed by byte value, 8 bytes per entry.
 *   __crc64_const_vals  three consecutive 64-bit constants used as MMX
 *                       operands: 56 at offset 0, 0xff at offset 8 and 8 at
 *                       offset 16.
 *
 * Notes:
 *   - GCC inline assembly for 32-bit x86 with MMX; the table lookup uses
 *     %eax as a 32-bit index register next to a pointer base register.
 *   - Exactly one byte is read from `data` per iteration, so nothing beyond
 *     data[len - 1] is touched.
 *   - The running CRC is an "m" operand because movq to or from an MMX
 *     register needs a memory location for a 64-bit value on IA-32.
 *   - The "memory" clobber tells the compiler that the asm reads the data
 *     buffer and both tables through pointers.
 *   - emms is executed after every byte, leaving the MMX state empty before
 *     control returns to compiled code.
 *   - There is deliberately no semicolon after `while (0)`: the caller's own
 *     semicolon ends the statement, so the macro is safe in if/else.
 */
#define COMP_CRC64_MMX(crc, data, len) \
do { \
    uint64 __crc0 = (crc).crc0; \
    unsigned char *__data = (unsigned char *) (data); \
    uint32 __len = (len); \
 \
    while (__len-- > 0) \
    { \
        __asm__ __volatile__ ( \
            "movq %0,%%mm0;"            /* __crc0 -> %%mm0 */ \
            "movq %%mm0,%%mm4;"         /* keep a copy for the << 8 term */ \
            "movzbl (%1),%%eax;"        /* *__data, one byte, zero-extended */ \
            "movd %%eax,%%mm1;"         /* *__data -> %%mm1 */ \
            "movq (%2),%%mm5;"          /* load '56' */ \
            "movq 8(%2),%%mm6;"         /* load '0xff' */ \
            "movq 16(%2),%%mm7;"        /* load '8' */ \
            "psrlq %%mm5,%%mm0;"        /* __crc0 >> 56 */ \
            "pxor %%mm1,%%mm0;"         /* (__crc0 >> 56) ^ *__data */ \
            "pand %%mm6,%%mm0;"         /* ... & 0xff = table index */ \
            "movd %%mm0,%%eax;"         /* table index -> %%eax */ \
            "movq (%3,%%eax,8),%%mm0;"  /* crc_table[index], 8 bytes per entry */ \
            "psllq %%mm7,%%mm4;"        /* __crc0 << 8 */ \
            "pxor %%mm4,%%mm0;"         /* crc_table[index] ^ (__crc0 << 8) */ \
            "movq %%mm0,%0;"            /* store the new __crc0 */ \
            "emms;"                     /* leave MMX state empty */ \
            : "+m"(__crc0) \
            : "r"(__data), "r"(__crc64_const_vals), "r"(crc_table) \
            : "%eax", "memory" \
        ); \
        __data++; \
    } \
    (crc).crc0 = __crc0; \
} while (0)
No comments:
Post a Comment