A co-worker sent me a link to AA-sort some time ago, but I didn't have any reason to use it until just recently. I found myself in a situation where I have 2K~4K 4-byte elements packed into qwords, and my initial O(N^2) solution just wasn't working out. So, I decided to try the O(N log N) "in-core" component of AA-sort as a pre-sort.

The paper linked above is relatively straightforward, but the authors only provide pseudo-code, so there's still a bit of effort required to get something usable. In keeping with the authors' goals, it's easily written as highly vectorized, non-branching code and was rather fun to get working.

The authors were working with 8 16-bit values packed into a qword, and I had to make some simple changes to get it working with vec_uint4s.

First, my bitonic_sort() routine takes care of step one of their in-core algorithm by sorting the 4 uint32_t elements in a qword:

#include <spu_intrinsics.h>     // vec_uint4, vec_uchar16, and the spu_* intrinsics

inline vec_uint4 bitonic_sort( vec_uint4 Vec_ )
{
    static const vec_uchar16 BADC = {
        0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
        0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
    };
    static const vec_uchar16 CDAB = {
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
    };
    static const vec_uint4 mask1 = {
        0x00000000, 0xffffffff, 0xffffffff, 0x00000000
    };
    static const vec_uint4 mask2 = {
        0x00000000, 0x00000000, 0xffffffff, 0xffffffff
    };
    static const vec_uint4 mask3 = {
        0x00000000, 0xffffffff, 0x00000000, 0xffffffff
    };
    vec_uint4 temp, cmp, result;

    temp = spu_shuffle( Vec_, Vec_, BADC );     // Compare A to B and C to D
    cmp = spu_cmpgt( Vec_, temp );
    cmp = spu_xor( cmp, mask1 );                // Order A/B increasing, C/D decreasing
    result = spu_sel( Vec_, temp, cmp );

    temp = spu_shuffle( result, result, CDAB ); // Compare AB to CD
    cmp = spu_cmpgt( result, temp );
    cmp = spu_xor( cmp, mask2 );                // Order AB/CD increasing
    result = spu_sel( result, temp, cmp );

    temp = spu_shuffle( result, result, BADC ); // Compare previous A to B and C to D
    cmp = spu_cmpgt( result, temp );
    cmp = spu_xor( cmp, mask3 );                // Order A/B and C/D increasing
    result = spu_sel( result, temp, cmp );

    return result;
}
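
Step 1 of the in-core algorithm is then just a pass of bitonic_sort() over the whole array. A minimal sketch, assuming the same work buffer and edgeCount vector count that appear in the combsort loop further down:

for (uint32_t i = 0; i < edgeCount; ++i)
{
    vec_uint4* v = (vec_uint4*)&work[i];
    *v = bitonic_sort( *v );    // each quadword's four words end up in increasing order
}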

Next come cmpswap() and cmpswap_skew() from the paper:

inline vec_uint4 vector_cmpswap( vec_uint4* A_, vec_uint4* B_ )
{
    const vec_uint4 a = *A_;
    const vec_uint4 b = *B_;
    const vec_uint4 cmp = spu_cmpgt( a, b );
    *A_ = spu_sel( a, b, cmp );     // A gets the per-slot minimums
    *B_ = spu_sel( b, a, cmp );     // B gets the per-slot maximums

    return cmp;
}
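
vector_cmpswap() is just an element-wise min/max, i.e. four independent compare-and-swaps in one go, one per column. A worked example (the values are mine):

vec_uint4 A = { 5, 2, 9, 4 };
vec_uint4 B = { 3, 7, 6, 8 };
vector_cmpswap( &A, &B );
// A is now { 3, 2, 6, 4 } -- per-slot minimums
// B is now { 5, 7, 9, 8 } -- per-slot maximums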

inline vec_uint4 vector_cmpswap_skew( vec_uint4* A_, vec_uint4* B_ )
{
    const vec_uint4 a = *A_;
    const vec_uint4 b = *B_;

    // The 0xc0 control bytes make spu_shuffle emit 0xff, so the last word
    // compared is 0xffffffff and that part of the mask is always zero
    static const vec_uchar16 BCDx = {
        0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
        0x0c, 0x0d, 0x0e, 0x0f, 0xc0, 0xc0, 0xc0, 0xc0
    };
    const vec_uint4 bShift = spu_shuffle( b, b, BCDx );
    const vec_uint4 cmp = spu_cmpgt( a, bShift );
    const vec_uint4 swap = spu_sel( a, bShift, cmp );
    const vec_uint4 bSwap = spu_sel( bShift, a, cmp );

    static const vec_uchar16 abcD = {
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f
    };
    static const vec_uchar16 Aabc = {
        0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
        0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b
    };
    *A_ = spu_shuffle( a, swap, abcD );     // Minimums into A, A's last word untouched
    *B_ = spu_shuffle( b, bSwap, Aabc );    // Maximums into B, B's first word untouched

    return cmp;
}
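
The skew handles the wrap-around case: with the transposed layout, the logical successor of the last word in a column is the first word of the next column, so element i of A has to be compared against element i+1 of B. A worked example (the values are mine):

vec_uint4 A = { 9, 3, 7, 1 };
vec_uint4 B = { 2, 8, 4, 6 };
vector_cmpswap_skew( &A, &B );
// Internally bShift becomes { 8, 4, 6, 0xffffffff }, so afterwards:
// A is { 8, 3, 6, 1 } -- per-slot minimums, last word untouched
// B is { 2, 9, 4, 7 } -- per-slot maximums, first word untouched
// and A[i] <= B[i+1] now holds for i = 0..2.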

In both functions I return the comparison mask because it's needed to implement step 2 of AA-sort's in-core algorithm, the modified combsort:

// work[] holds edgeCount (N/4) quadwords, each already sorted by bitonic_sort()
static const float INV_SHRINK_FACTOR = 1.f/1.3f;
uint32_t gap = uint32_t(float(edgeCount) * INV_SHRINK_FACTOR);
while (gap > 1)
{
    // Straight comparisons
    const uint32_t maxSwapIndex = edgeCount - gap;
    for (uint32_t i = 0; i < maxSwapIndex; ++i)
    {
        vector_cmpswap( (vec_uint4*)&work[i], (vec_uint4*)&work[i+gap] );
    }

    // Skewed comparisons for when i+gap exceeds edgeCount and wraps around
    for (uint32_t i = maxSwapIndex; i < edgeCount; ++i)
    {
        vector_cmpswap_skew( (vec_uint4*)&work[i], (vec_uint4*)&work[i + gap - edgeCount] );
    }

    gap = uint32_t(float(gap) * INV_SHRINK_FACTOR);
}

// gap == 1: keep sweeping until a full pass performs no swaps
vec_uint4 not_sorted;
do
{
    not_sorted = spu_splats((uint32_t)0);
    for (uint32_t i = 0; i < edgeCount - 1; ++i)
    {
        not_sorted = spu_or( not_sorted, vector_cmpswap( (vec_uint4*)&work[i], (vec_uint4*)&work[i+1] ) );
    }
    not_sorted = spu_or( not_sorted, vector_cmpswap_skew( (vec_uint4*)&work[edgeCount - 1], (vec_uint4*)&work[0] ) );
    not_sorted = spu_orx( not_sorted );     // OR all four words into word 0
} while ( spu_extract(not_sorted, 0) );

I use a shrink factor of 1.3 as suggested by the authors as well as the combsort page on Wikipedia. Standard combsort stops looping once the gap has dropped to 1 and no swaps have been performed in the current pass. AA-sort splits the shrinking-gap and gap-1 loops, so I use the not_sorted value to keep track of when swaps stop occurring (this is why I return the cmp value from each of the cmpswap() functions).
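
For comparison, plain scalar combsort looks roughly like this (a sketch based on the Wikipedia description, not code from the paper); AA-sort's step 2 is the same structure with the compare-and-swap vectorized and the gap-1 passes split into their own loop:

void combsort( uint32_t* a, uint32_t n )
{
    uint32_t gap = n;
    int swapped = 1;
    while (gap > 1 || swapped)
    {
        if (gap > 1)
            gap = uint32_t(float(gap) * (1.f/1.3f));    // same 1.3 shrink factor
        swapped = 0;
        for (uint32_t i = 0; i + gap < n; ++i)
        {
            if (a[i] > a[i+gap])    // compare-and-swap across the gap
            {
                const uint32_t t = a[i];
                a[i] = a[i+gap];
                a[i+gap] = t;
                swapped = 1;
            }
        }
    }
}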

You end up with nicely sorted (albeit transposed) data like:

[0] = 00000003 03910392 04a504a6 05140515
[1] = 0000002b 03910393 04a604a7 05140516
[2] = 0000002b 03910393 04a7049f 05150514
[3] = 0000002c 03910394 04a704a8 05150516
[4] = 0000002c 03920393 04a704a8 05150516
[5] = 0000002d 03930394 04a804a9 05160515
[6] = 0003002b 03b903ba 04a9049f 05170516
[7] = 00200021 03b903bb 04a904aa 05170518
[8] = 00200022 03b903bb 04a904aa 05180517
[9] = 00210022 03b903bc 04aa04ab 05190518
...

AA-sort normally calls for a final pass that transposes the data back, so that consecutive elements run across the width of each vector rather than down the columns, but it wasn't necessary in my particular situation.
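
If you do need that pass, the building block is a 4x4 transpose of quadwords, which comes down to a handful of spu_shuffles. A minimal sketch (the shuffle patterns and transpose4() are mine, not from the paper):

inline void transpose4( vec_uint4* r0, vec_uint4* r1, vec_uint4* r2, vec_uint4* r3 )
{
    static const vec_uchar16 ILO = {    // interleave low words: { a0, b0, a1, b1 }
        0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
        0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17
    };
    static const vec_uchar16 IHI = {    // interleave high words: { a2, b2, a3, b3 }
        0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
        0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f
    };
    static const vec_uchar16 DLO = {    // low doublewords: { a0, a1, b0, b1 }
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
    };
    static const vec_uchar16 DHI = {    // high doublewords: { a2, a3, b2, b3 }
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
    };
    const vec_uint4 t0 = spu_shuffle( *r0, *r1, ILO );  // { r0[0], r1[0], r0[1], r1[1] }
    const vec_uint4 t1 = spu_shuffle( *r2, *r3, ILO );  // { r2[0], r3[0], r2[1], r3[1] }
    const vec_uint4 t2 = spu_shuffle( *r0, *r1, IHI );  // { r0[2], r1[2], r0[3], r1[3] }
    const vec_uint4 t3 = spu_shuffle( *r2, *r3, IHI );  // { r2[2], r3[2], r2[3], r3[3] }
    *r0 = spu_shuffle( t0, t1, DLO );   // { r0[0], r1[0], r2[0], r3[0] }
    *r1 = spu_shuffle( t0, t1, DHI );   // { r0[1], r1[1], r2[1], r3[1] }
    *r2 = spu_shuffle( t2, t3, DLO );   // { r0[2], r1[2], r2[2], r3[2] }
    *r3 = spu_shuffle( t2, t3, DHI );   // { r0[3], r1[3], r2[3], r3[3] }
}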

I took some performance statistics with the decrementer using a few of my data sets and attached the results below:


Time (ms)    # Values
0.036667         1254
0.008571          171
0.017419          684
0.131654         1902
0.019900          786
0.009424          360
0.109599          954
0.347531         2415
0.371779         2454


Pretty decent considering the original O(N^2) implementation was taking 3 ms for 2k+ values.