Apparently GCC has vector types for the C language. Moreover, GCC 4.7 allows
using a vector-generating element (a scalar operand) in operations on vector types.
Suppose we have
/* GCC vector type: a 32-byte vector of int (eight lanes when int is 4 bytes). */
typedef int intvect __attribute__ ((vector_size (32)));
/* a receives the result of the operation; b holds the eight source elements. */
intvect a, b = {1,2,3,4,5,6,7,8};
An operation with a vector-generating element looks like this:
a = b + 2;
which literally means:
a = vector b{1,2,3,4,5,6,7,8} + vector {2,2,2,2,2,2,2,2}
As often happens, the neat stuff is hidden under the hood. Part of the generated
assembly code:
movl $0x1,0x48(%rsp)
movl $0x2,0x4c(%rsp)
movl $0x3,0x50(%rsp)
movl $0x4,0x54(%rsp)
movl $0x5,0x58(%rsp)
movl $0x6,0x5c(%rsp)
movl $0x7,0x60(%rsp)
movl $0x8,0x64(%rsp)
movdqa 0x48(%rsp),%xmm1
movl $0x2,-0x38(%rsp)
movl $0x2,-0x34(%rsp)
movl $0x2,-0x30(%rsp)
movl $0x2,-0x2c(%rsp)
movl $0x2,-0x28(%rsp)
movl $0x2,-0x24(%rsp)
movl $0x2,-0x20(%rsp)
movl $0x2,-0x1c(%rsp)
movdqa -0x38(%rsp),%xmm0
paddd %xmm0,%xmm1
movdqa 0x58(%rsp),%xmm2
movl $0x2,-0x58(%rsp)
movl $0x2,-0x54(%rsp)
movl $0x2,-0x50(%rsp)
movl $0x2,-0x4c(%rsp)
movl $0x2,-0x48(%rsp)
movl $0x2,-0x44(%rsp)
movl $0x2,-0x40(%rsp)
movl $0x2,-0x3c(%rsp)
I really like how they exploit the CPU's out-of-order execution and data prefetching
features by filling the pipeline with movs that have a high probability of executing
simultaneously, instead of using several loops.
"The core's ability to execute instructions out of order is a key factor in enabling
parallelism. This feature enables the processor to reorder instructions so that if
one µop is delayed while waiting for data or a contended resource, other µops that
appear later in the program order may proceed. This implies that when one portion
of the pipeline experiences a delay, the delay may be covered by other operations
executing in parallel or by the execution of µops queued up in a buffer."
A good example is glibc's strncmp:
/* Compare at most N bytes of the strings S1 and S2.

   Bytes are compared as unsigned char.  Returns the difference between
   the first pair of bytes that differ (or between the terminating NUL of
   S1 and the corresponding byte of S2), and 0 if no difference is found
   within the first N bytes.  Returns 0 when N is 0.  */
int
STRNCMP (const char *s1, const char *s2, size_t n)
{
  unsigned char c1 = '\0';
  unsigned char c2 = '\0';

  if (n >= 4)
    {
      /* Number of full groups of four bytes.  The loop body is manually
         unrolled so that each iteration performs four independent
         load/compare steps.  */
      size_t n4 = n >> 2;
      do
        {
          c1 = (unsigned char) *s1++;
          c2 = (unsigned char) *s2++;
          if (c1 == '\0' || c1 != c2)
            return c1 - c2;
          c1 = (unsigned char) *s1++;
          c2 = (unsigned char) *s2++;
          if (c1 == '\0' || c1 != c2)
            return c1 - c2;
          c1 = (unsigned char) *s1++;
          c2 = (unsigned char) *s2++;
          if (c1 == '\0' || c1 != c2)
            return c1 - c2;
          c1 = (unsigned char) *s1++;
          c2 = (unsigned char) *s2++;
          if (c1 == '\0' || c1 != c2)
            return c1 - c2;
        } while (--n4 > 0);
      /* Keep only the 0-3 bytes not covered by the unrolled loop.  */
      n &= 3;
    }

  /* Tail: compare the remaining bytes one at a time.  */
  while (n > 0)
    {
      c1 = (unsigned char) *s1++;
      c2 = (unsigned char) *s2++;
      if (c1 == '\0' || c1 != c2)
        return c1 - c2;
      n--;
    }

  /* All N bytes matched (or N was 0): c1 == c2 here, so this yields 0.  */
  return c1 - c2;
}