A co-worker sent me a link to AA-sort some time ago, but I didn't have any reason to use it until just recently. I found myself in a situation where I have 2K~4K 4-byte elements packed into qwords, and my initial O(N^2) solution just wasn't working out. So, I decided to try the O(N log N) "in-core" component of AA-sort as a pre-sort.

The paper linked above is relatively straightforward, but the authors only provide pseudo-code, so there's still a bit of effort required to get something usable. In keeping with the authors' goals, it's easily written as highly vectorized, non-branching code and was rather fun to get working.

The authors were working with 8 16-bit values packed into a qword, and I had to make some simple changes to get it working with vec_uint4s.

First, my bitonic_sort() routine takes care of step one of their in-core algorithm by sorting the 4 uint32_t elements in a qword:

#include <spu_intrinsics.h>     // vec_uint4, vec_uchar16, and the spu_* intrinsics

inline vec_uint4 bitonic_sort( vec_uint4 Vec_ )
{
    static const vec_uchar16 BADC = {
        0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
        0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
    };
    static const vec_uchar16 CDAB = {
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
    };
    static const vec_uint4 mask1 = {
        0x00000000, 0xffffffff, 0xffffffff, 0x00000000
    };
    static const vec_uint4 mask2 = {
        0x00000000, 0x00000000, 0xffffffff, 0xffffffff
    };
    static const vec_uint4 mask3 = {
        0x00000000, 0xffffffff, 0x00000000, 0xffffffff
    };
    vec_uint4 temp, cmp, result;

    temp = spu_shuffle( Vec_, Vec_, BADC );     // Compare A to B and C to D
    cmp = spu_cmpgt( Vec_, temp );
    cmp = spu_xor( cmp, mask1 );                // Order A/B increasing, C/D decreasing
    result = spu_sel( Vec_, temp, cmp );

    temp = spu_shuffle( result, result, CDAB ); // Compare AB to CD
    cmp = spu_cmpgt( result, temp );
    cmp = spu_xor( cmp, mask2 );                // Order AB/CD increasing
    result = spu_sel( result, temp, cmp );

    temp = spu_shuffle( result, result, BADC ); // Compare previous A to B and C to D
    cmp = spu_cmpgt( result, temp );
    cmp = spu_xor( cmp, mask3 );                // Order A/B and C/D increasing
    result = spu_sel( result, temp, cmp );

    return result;
}
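
Step 1 of the in-core algorithm is then just a pass of bitonic_sort() over the whole array. A minimal sketch, assuming the same work buffer and edgeCount vector count that appear in the combsort loop further down:

for (uint32_t i = 0; i < edgeCount; ++i)
{
    vec_uint4* v = (vec_uint4*)&work[i];
    *v = bitonic_sort( *v );    // each quadword's four words end up in increasing order
}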

Next come cmpswap() and cmpswap_skew() from the paper:

inline vec_uint4 vector_cmpswap( vec_uint4* A_, vec_uint4* B_ )
{
    const vec_uint4 a = *A_;
    const vec_uint4 b = *B_;
    const vec_uint4 cmp = spu_cmpgt( a, b );
    *A_ = spu_sel( a, b, cmp );     // A gets the per-slot minimums
    *B_ = spu_sel( b, a, cmp );     // B gets the per-slot maximums

    return cmp;
}
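
vector_cmpswap() is just an element-wise min/max, i.e. four independent compare-and-swaps in one go, one per column. A worked example (the values are mine):

vec_uint4 A = { 5, 2, 9, 4 };
vec_uint4 B = { 3, 7, 6, 8 };
vector_cmpswap( &A, &B );
// A is now { 3, 2, 6, 4 } -- per-slot minimums
// B is now { 5, 7, 9, 8 } -- per-slot maximums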

inline vec_uint4 vector_cmpswap_skew( vec_uint4* A_, vec_uint4* B_ )
{
    const vec_uint4 a = *A_;
    const vec_uint4 b = *B_;

    // The 0xc0 control bytes make spu_shuffle emit 0xff, so the last word
    // compared is 0xffffffff and that part of the mask is always zero
    static const vec_uchar16 BCDx = {
        0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
        0x0c, 0x0d, 0x0e, 0x0f, 0xc0, 0xc0, 0xc0, 0xc0
    };
    const vec_uint4 bShift = spu_shuffle( b, b, BCDx );
    const vec_uint4 cmp = spu_cmpgt( a, bShift );
    const vec_uint4 swap = spu_sel( a, bShift, cmp );
    const vec_uint4 bSwap = spu_sel( bShift, a, cmp );

    static const vec_uchar16 abcD = {
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f
    };
    static const vec_uchar16 Aabc = {
        0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
        0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b
    };
    *A_ = spu_shuffle( a, swap, abcD );     // Minimums into A, A's last word untouched
    *B_ = spu_shuffle( b, bSwap, Aabc );    // Maximums into B, B's first word untouched

    return cmp;
}
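
The skew handles the wrap-around case: with the transposed layout, the logical successor of the last word in a column is the first word of the next column, so element i of A has to be compared against element i+1 of B. A worked example (the values are mine):

vec_uint4 A = { 9, 3, 7, 1 };
vec_uint4 B = { 2, 8, 4, 6 };
vector_cmpswap_skew( &A, &B );
// Internally bShift becomes { 8, 4, 6, 0xffffffff }, so afterwards:
// A is { 8, 3, 6, 1 } -- per-slot minimums, last word untouched
// B is { 2, 9, 4, 7 } -- per-slot maximums, first word untouched
// and A[i] <= B[i+1] now holds for i = 0..2.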

In both functions I return the comparison mask because it's needed to implement step 2 of AA-sort's in-core algorithm, the modified combsort:

// work[] holds edgeCount (N/4) quadwords, each already sorted by bitonic_sort()
static const float INV_SHRINK_FACTOR = 1.f/1.3f;
uint32_t gap = uint32_t(float(edgeCount) * INV_SHRINK_FACTOR);
while (gap > 1)
{
    // Straight comparisons
    const uint32_t maxSwapIndex = edgeCount - gap;
    for (uint32_t i = 0; i < maxSwapIndex; ++i)
    {
        vector_cmpswap( (vec_uint4*)&work[i], (vec_uint4*)&work[i+gap] );
    }

    // Skewed comparisons for when i+gap exceeds edgeCount and wraps around
    for (uint32_t i = maxSwapIndex; i < edgeCount; ++i)
    {
        vector_cmpswap_skew( (vec_uint4*)&work[i], (vec_uint4*)&work[i + gap - edgeCount] );
    }

    gap = uint32_t(float(gap) * INV_SHRINK_FACTOR);
}

// gap == 1: keep sweeping until a full pass performs no swaps
vec_uint4 not_sorted;
do
{
    not_sorted = spu_splats((uint32_t)0);
    for (uint32_t i = 0; i < edgeCount - 1; ++i)
    {
        not_sorted = spu_or( not_sorted, vector_cmpswap( (vec_uint4*)&work[i], (vec_uint4*)&work[i+1] ) );
    }
    not_sorted = spu_or( not_sorted, vector_cmpswap_skew( (vec_uint4*)&work[edgeCount - 1], (vec_uint4*)&work[0] ) );
    not_sorted = spu_orx( not_sorted );     // OR all four words into word 0
} while ( spu_extract(not_sorted, 0) );

I use a shrink factor of 1.3 as suggested by the authors as well as the combsort page on Wikipedia. Standard combsort stops looping once the gap has dropped to 1 and no swaps have been performed in the current pass. AA-sort splits the shrinking-gap and gap-1 loops, so I use the not_sorted value to keep track of when swaps stop occurring (this is why I return the cmp value from each of the cmpswap() functions).
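
For comparison, plain scalar combsort looks roughly like this (a sketch based on the Wikipedia description, not code from the paper); AA-sort's step 2 is the same structure with the compare-and-swap vectorized and the gap-1 passes split into their own loop:

void combsort( uint32_t* a, uint32_t n )
{
    uint32_t gap = n;
    int swapped = 1;
    while (gap > 1 || swapped)
    {
        if (gap > 1)
            gap = uint32_t(float(gap) * (1.f/1.3f));    // same 1.3 shrink factor
        swapped = 0;
        for (uint32_t i = 0; i + gap < n; ++i)
        {
            if (a[i] > a[i+gap])    // compare-and-swap across the gap
            {
                const uint32_t t = a[i];
                a[i] = a[i+gap];
                a[i+gap] = t;
                swapped = 1;
            }
        }
    }
}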

You end up with nicely sorted (albeit transposed) data like:

[0] = 00000003 03910392 04a504a6 05140515
[1] = 0000002b 03910393 04a604a7 05140516
[2] = 0000002b 03910393 04a7049f 05150514
[3] = 0000002c 03910394 04a704a8 05150516
[4] = 0000002c 03920393 04a704a8 05150516
[5] = 0000002d 03930394 04a804a9 05160515
[6] = 0003002b 03b903ba 04a9049f 05170516
[7] = 00200021 03b903bb 04a904aa 05170518
[8] = 00200022 03b903bb 04a904aa 05180517
[9] = 00210022 03b903bc 04aa04ab 05190518
...

AA-sort normally calls for a final pass that transposes the data back, so that consecutive elements run across the width of each vector rather than down the columns, but it wasn't necessary in my particular situation.
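
If you do need that pass, the building block is a 4x4 transpose of quadwords, which comes down to a handful of spu_shuffles. A minimal sketch (the shuffle patterns and transpose4() are mine, not from the paper):

inline void transpose4( vec_uint4* r0, vec_uint4* r1, vec_uint4* r2, vec_uint4* r3 )
{
    static const vec_uchar16 ILO = {    // interleave low words: { a0, b0, a1, b1 }
        0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
        0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17
    };
    static const vec_uchar16 IHI = {    // interleave high words: { a2, b2, a3, b3 }
        0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
        0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f
    };
    static const vec_uchar16 DLO = {    // low doublewords: { a0, a1, b0, b1 }
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
    };
    static const vec_uchar16 DHI = {    // high doublewords: { a2, a3, b2, b3 }
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
    };
    const vec_uint4 t0 = spu_shuffle( *r0, *r1, ILO );  // { r0[0], r1[0], r0[1], r1[1] }
    const vec_uint4 t1 = spu_shuffle( *r2, *r3, ILO );  // { r2[0], r3[0], r2[1], r3[1] }
    const vec_uint4 t2 = spu_shuffle( *r0, *r1, IHI );  // { r0[2], r1[2], r0[3], r1[3] }
    const vec_uint4 t3 = spu_shuffle( *r2, *r3, IHI );  // { r2[2], r3[2], r2[3], r3[3] }
    *r0 = spu_shuffle( t0, t1, DLO );   // { r0[0], r1[0], r2[0], r3[0] }
    *r1 = spu_shuffle( t0, t1, DHI );   // { r0[1], r1[1], r2[1], r3[1] }
    *r2 = spu_shuffle( t2, t3, DLO );   // { r0[2], r1[2], r2[2], r3[2] }
    *r3 = spu_shuffle( t2, t3, DHI );   // { r0[3], r1[3], r2[3], r3[3] }
}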

I took some performance statistics with the decrementer using a few of my data sets and attached the results below:


Time (ms)    # Values
0.036667         1254
0.008571          171
0.017419          684
0.131654         1902
0.019900          786
0.009424          360
0.109599          954
0.347531         2415
0.371779         2454


Pretty decent considering the original O(N^2) implementation was taking 3 ms for 2k+ values.