The paper linked above is relatively straightforward, but it only provides pseudo-code, so there's still a bit of effort required to get something usable. True to the authors' goals, it's easily written as highly vectorized, branch-free code, and it was rather fun to get working.

The authors were working with eight 16-bit values packed into a qword, so I had to make some simple changes to get everything working with vec_uint4s.

First, my bitonic_sort() routine takes care of step one of their in-core algorithm by sorting the 4 uint32_t elements in a qword:

```c
inline vec_uint4 bitonic_sort( vec_uint4 Vec_ )
{
    static const vec_uchar16 BADC = {
        0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
        0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
    };
    static const vec_uchar16 CDAB = {
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
    };
    static const vec_uint4 mask1 = { 0x00000000, 0xffffffff, 0xffffffff, 0x00000000 };
    static const vec_uint4 mask2 = { 0x00000000, 0x00000000, 0xffffffff, 0xffffffff };
    static const vec_uint4 mask3 = { 0x00000000, 0xffffffff, 0x00000000, 0xffffffff };

    vec_uint4 temp, cmp, result;

    temp   = spu_shuffle( Vec_, Vec_, BADC );     // Compare A to B and C to D
    cmp    = spu_cmpgt( Vec_, temp );
    cmp    = spu_xor( cmp, mask1 );               // Order A/B increasing, C/D decreasing
    result = spu_sel( Vec_, temp, cmp );

    temp   = spu_shuffle( result, result, CDAB ); // Compare AB to CD
    cmp    = spu_cmpgt( result, temp );
    cmp    = spu_xor( cmp, mask2 );               // Order AB/CD increasing
    result = spu_sel( result, temp, cmp );

    temp   = spu_shuffle( result, result, BADC ); // Compare previous A to B and C to D
    cmp    = spu_cmpgt( result, temp );
    cmp    = spu_xor( cmp, mask3 );               // Order A/B and C/D increasing
    result = spu_sel( result, temp, cmp );

    return result;
}
```
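A quick sanity check on made-up values shows the three stages at work:

```c
// Hypothetical input, traced through the three compare/select stages.
vec_uint4 v = { 42u, 7u, 99u, 1u };
v = bitonic_sort( v );
// After stage 1: { 7, 42, 99, 1 }   A/B increasing, C/D decreasing
// After stage 2: { 7, 1, 99, 42 }   halves merged
// After stage 3: { 1, 7, 42, 99 }   fully sorted
```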

Next come cmpswap() and cmpswap_skew() from the paper:

```c
inline vec_uint4 vector_cmpswap( vec_uint4* A_, vec_uint4* B_ )
{
    const vec_uint4 a   = *A_;
    const vec_uint4 b   = *B_;
    const vec_uint4 cmp = spu_cmpgt( a, b );
    *A_ = spu_sel( a, b, cmp );  // elementwise minimums
    *B_ = spu_sel( b, a, cmp );  // elementwise maximums
    return cmp;
}
```
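That one is just four independent compare-and-swaps, one per word slot. A quick illustration on made-up values:

```c
// Hypothetical values, just to show the elementwise min/max behavior.
vec_uint4 a = { 5u, 2u, 9u, 1u };
vec_uint4 b = { 3u, 4u, 8u, 7u };
vector_cmpswap( &a, &b );
// a = { 3, 2, 8, 1 }  (elementwise minimums)
// b = { 5, 4, 9, 7 }  (elementwise maximums)
```

And the skewed variant: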

```c
inline vec_uint4 vector_cmpswap_skew( vec_uint4* A_, vec_uint4* B_ )
{
    const vec_uint4 a = *A_;
    const vec_uint4 b = *B_;

    // Shift b's words down one slot; the 0xc0 shuffle bytes produce 0xff,
    // so the last word compared is 0xffffffff and that part of the mask is
    // always 0000.
    static const vec_uchar16 BCDx = {
        0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
        0x0c, 0x0d, 0x0e, 0x0f, 0xc0, 0xc0, 0xc0, 0xc0
    };
    const vec_uint4 bShift = spu_shuffle( b, b, BCDx );
    const vec_uint4 cmp    = spu_cmpgt( a, bShift );
    const vec_uint4 swap   = spu_sel( a, bShift, cmp );
    const vec_uint4 bSwap  = spu_sel( bShift, a, cmp );

    // Merge the swapped words back, leaving a's last word and b's first
    // word untouched.
    static const vec_uchar16 abcD = {
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f
    };
    static const vec_uchar16 Aabc = {
        0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
        0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b
    };
    *A_ = spu_shuffle( a, swap,  abcD );
    *B_ = spu_shuffle( b, bSwap, Aabc );
    return cmp;
}
```
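Here the shuffle pairs *a*'s words 0..2 with *b*'s words 1..3, while *a*'s last word and *b*'s first word pass through unchanged. On made-up values:

```c
// Hypothetical values: a's words 0..2 are compared against b's words 1..3.
vec_uint4 a = { 5u, 9u, 2u, 6u };
vec_uint4 b = { 4u, 3u, 8u, 7u };
vector_cmpswap_skew( &a, &b );
// a = { 3, 8, 2, 6 }  min(5,3), min(9,8), min(2,7), a3 passed through
// b = { 4, 5, 9, 7 }  b0 passed through, max(5,3), max(9,8), max(2,7)
```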

In both functions I return the comparison mask because it's needed to implement step 2 of AA-sort's in-core algorithm, the modified combsort:

```c
static const float INV_SHRINK_FACTOR = 1.f / 1.3f;

uint32_t gap = uint32_t( float(edgeCount) * INV_SHRINK_FACTOR );
while ( gap > 1 )
{
    // Straight comparisons
    const uint32_t maxSwapIndex = edgeCount - gap;
    for ( uint32_t i = 0; i < maxSwapIndex; ++i )
    {
        vector_cmpswap( (vec_uint4*)&work[i], (vec_uint4*)&work[i + gap] );
    }

    // Skewed comparisons for when i + gap exceeds N/4
    for ( uint32_t i = maxSwapIndex; i < edgeCount; ++i )
    {
        vector_cmpswap_skew( (vec_uint4*)&work[i], (vec_uint4*)&work[i + gap - edgeCount] );
    }

    gap = uint32_t( float(gap) * INV_SHRINK_FACTOR );
}

vec_uint4 not_sorted;
do
{
    not_sorted = spu_splats( (uint32_t)0 );
    for ( uint32_t i = 0; i < edgeCount - 1; ++i )
    {
        not_sorted = spu_or( not_sorted,
                             vector_cmpswap( (vec_uint4*)&work[i], (vec_uint4*)&work[i + 1] ) );
    }
    not_sorted = spu_or( not_sorted,
                         vector_cmpswap_skew( (vec_uint4*)&work[edgeCount - 1], (vec_uint4*)&work[0] ) );
    not_sorted = spu_orx( not_sorted );
} while ( spu_extract( not_sorted, 0 ) );
```

I use a shrink factor of 1.3, as suggested by the authors as well as the combsort page at Wikipedia. Standard combsort stops looping once *gap* is less than or equal to 1 and no swaps have been performed in the current iteration. AA-sort splits the *gap* and "swap" loops, so I use the *not_sorted* value to keep track of when swaps stop occurring (which is why I return the *cmp* value from each of the cmpswap functions).
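For comparison, plain scalar combsort folds the shrink and the swap tracking into a single loop. A minimal sketch of that reference algorithm (not the SPU code):

```c
#include <stdint.h>

// Reference scalar combsort: shrink the gap by ~1.3 each pass,
// then keep bubbling at gap 1 until a pass makes no swaps.
void combsort( uint32_t* a, uint32_t n )
{
    uint32_t gap = n;
    int swapped = 1;
    while ( gap > 1 || swapped )
    {
        gap = (uint32_t)( (float)gap * ( 1.f / 1.3f ) );
        if ( gap < 1 ) gap = 1;
        swapped = 0;
        for ( uint32_t i = 0; i + gap < n; ++i )
        {
            if ( a[i] > a[i + gap] )
            {
                const uint32_t t = a[i];
                a[i] = a[i + gap];
                a[i + gap] = t;
                swapped = 1;
            }
        }
    }
}
```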

You end up with nicely sorted (albeit transposed) data like:

```
[0] = 00000003 03910392 04a504a6 05140515
[1] = 0000002b 03910393 04a604a7 05140516
[2] = 0000002b 03910393 04a7049f 05150514
[3] = 0000002c 03910394 04a704a8 05150516
[4] = 0000002c 03920393 04a704a8 05150516
[5] = 0000002d 03930394 04a804a9 05160515
[6] = 0003002b 03b903ba 04a9049f 05170516
[7] = 00200021 03b903bb 04a904aa 05170518
[8] = 00200022 03b903bb 04a904aa 05180517
[9] = 00210022 03b903bc 04aa04ab 05190518
...
```

AA-sort normally calls for a final pass that transposes the elements across the width of each vector rather than through each vector element, but it wasn't necessary in my particular situation.
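If you do need the values back in plain order, the dump above implies the mapping: logical index *k* lives in word *k / edgeCount* of vector *k % edgeCount*. A small accessor sketch (my names, not from the paper):

```c
// Sketch: fetch the k-th smallest value from the transposed output.
// 'work' and 'edgeCount' are assumed to match the sort code above.
static inline uint32_t sorted_at( const vec_uint4* work, uint32_t edgeCount, uint32_t k )
{
    const uint32_t* words = (const uint32_t*)&work[ k % edgeCount ];
    return words[ k / edgeCount ];
}
```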

I took some performance statistics with the decrementer using a few of my data sets and attached the results below:

| Time (ms) | # Values |
|---|---|
| 0.036667 | 1254 |
| 0.008571 | 171 |
| 0.017419 | 684 |
| 0.131654 | 1902 |
| 0.019900 | 786 |
| 0.009424 | 360 |
| 0.109599 | 954 |
| 0.347531 | 2415 |
| 0.371779 | 2454 |

Pretty decent considering the original O(N^2) implementation was taking 3 ms for 2k+ values.
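For reference, the decrementer measurement looks roughly like this sketch (TIMEBASE_HZ is a placeholder of mine; the SPU decrementer ticks at the platform's timebase frequency, which varies by hardware):

```c
#include <stdint.h>
#include <spu_mfcio.h>

// Decrementer timing sketch. TIMEBASE_HZ is an assumed value;
// substitute your platform's actual timebase frequency.
static const float TIMEBASE_HZ = 79800000.f;

float time_sort_ms( void )
{
    spu_write_decrementer( 0xffffffffu );      // decrementer counts down
    const uint32_t t0 = spu_read_decrementer();

    // ... run the sort here ...

    const uint32_t t1 = spu_read_decrementer();
    return (float)( t0 - t1 ) * ( 1000.f / TIMEBASE_HZ );
}
```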