The intention of this appendix is to show how to factor a mathematical calculation into architecture-independent and architecture-specific parts. Using matrix multiplication as an example, you’ll see how to write a function that works for both the PowerPC and the x86 architectures with a minimum of architecture-specific coding. You can then apply this approach to other, more complex mathematical calculations.
The following basic operations are available on both architectures:
Vector loads and stores
Multiplication
Addition
An instruction to splat a float across a vector
For other types of calculations, you may need to write separate versions of code. Because of the differences in the number of registers and the pipeline depths between the two architectures, it is often advantageous to provide separate versions.
Note: There is a function for 4x4 matrix multiplication in the Accelerate framework (vecLib
) that is tuned for both architectures. You can also call sgemm
from Basic Linear Algebra Subprograms (BLAS) (also available in the Accelerate framework) to operate on larger matrices.
Architecture-Specific Code
Architecture-Independent Matrix Multiplication
Listing B-1 shows the architecture-specific code you need to support matrix multiplication. The code calls the architecture-independent function MyMatrixMultiply
, which is shown in Listing B-2. The code shown in Listing B-1 works properly for both instruction set architectures only if you build the code as a universal binary. For more information, see “Building a Universal Binary.”
Note: The sample code makes use of a GCC extension to return a result from a code block ({}). The code may not compile correctly on other compilers. The extension is necessary because you cannot pass immediate values to an inline function, meaning that you must use a macro.
Listing B-1 Architecture-specific code needed to support matrix multiplication
#include <stdlib.h> |
#include <stdio.h> |
#include <math.h> |
// For each vector architecture... |
#if defined( __VEC__ ) |
// AltiVec |
// Set up a vector type for a float[4] array for each vector type |
typedef vector float vFloat; |
// Define some macros to map a virtual SIMD language to |
// each actual SIMD language. For matrix multiplication, the tasks |
// you need to perform are essentially the same between the two |
// instruction set architectures (ISA). |
#define vSplat( v, i ) ({ vFloat z = vec_splat( v, i ); |
/* return */ z; }) |
#define vMADD vec_madd |
#define vLoad( ptr ) vec_ld( 0, ptr ) |
#define vStore( v, ptr ) vec_st( v, 0, ptr ) |
#define vZero() (vector float) vec_splat_u32(0) |
#elif defined( __SSE__ ) |
// SSE |
// The header file xmmintrin.h defines C functions for using |
// SSE and SSE2 according to the Intel C programming interface |
#include <xmmintrin.h> |
// Set up a vector type for a float[4] array for each vector type |
typedef __m128 vFloat; |
// Also define some macros to map a virtual SIMD language to |
// each actual SIMD language. |
// Note that because i MUST be an immediate, it is incorrect here |
// to alias i to a stack based copy and replicate that 4 times. |
#define vSplat( v, i )({ __m128 a = v; a = _mm_shuffle_ps( a, a, \ |
_MM_SHUFFLE(i,i,i,i) ); /* return */ a; }) |
inline __m128 vMADD( __m128 a, __m128 b, __m128 c ) |
{ |
return _mm_add_ps( c, _mm_mul_ps( a, b ) ); |
} |
#define vLoad( ptr ) _mm_load_ps( (float*) (ptr) ) |
#define vStore( v, ptr ) _mm_store_ps( (float*) (ptr), v ) |
#define vZero() _mm_setzero_ps() |
#else |
// Scalar |
#warning To compile vector code, you must specify -faltivec, |
-msse, or both- faltivec and -msse |
#warning Compiling for scalar code. |
// Some scalar equivalents to show what the above vector |
// versions accomplish |
// A vector, declared as a struct with 4 scalars |
typedef struct |
{ |
float a; |
float b; |
float c; |
float d; |
}vFloat; |
// Splat element i across the whole vector and return it |
#define vSplat( v, i ) ({ vFloat z; z.a = z.b = z.c = z.d = ((float*) |
&v)[i]; /* return */ z; }) |
// Perform a fused-multiply-add operation on architectures that support it |
// result = X * Y + Z |
inline vFloat vMADD( vFloat X, vFloat Y, vFloat Z ) |
{ |
vFloat result; |
result.a = X.a * Y.a + Z.a; |
result.b = X.b * Y.b + Z.b; |
result.c = X.c * Y.c + Z.c; |
result.d = X.d * Y.d + Z.d; |
return result; |
} |
// Return a vector that starts at the given address |
#define vLoad( ptr ) ( (vFloat*) ptr )[0] |
// Write a vector to the given address |
#define vStore( v, ptr ) ( (vFloat*) ptr )[0] = v |
// Return a vector full of zeros |
#define vZero() ({ vFloat z; z.a = z.b = z.c = z. |
d = 0.0f; /* return */ z; }) |
#endif |
// Prototype for a vector matrix multiply function |
void MyMatrixMultiply( vFloat A[4], vFloat B[4], vFloat C[4] ); |
int main( void ) |
{ |
// The vFloat type (defined previously) is a vector or scalar array |
// that contains 4 floats |
// Thus each one of these is a 4x4 matrix, stored in the C storage order. |
vFloat A[4]; |
vFloat B[4]; |
vFloat C1[4]; |
vFloat C2[4]; |
int i, j, k; |
// Pointers to the elements in A, B, C1 and C2 |
float *a = (float*) &A; |
float *b = (float*) &B; |
float *c1 = (float*) &C1; |
float *c2 = (float*) &C2; |
// Initialize the data |
for( i = 0; i < 16; i++ ) |
{ |
a[i] = (double) (rand() - RAND_MAX/2) / (double) (RAND_MAX ); |
b[i] = (double) (rand() - RAND_MAX/2) / (double) (RAND_MAX ); |
c1[i] = c2[i] = 0.0; |
} |
// Perform the brute-force version of matrix multiplication |
// and use this later to check for correctness |
printf( "Doing simple matrix multiply...\n" ); |
for( i = 0; i < 4; i++ ) |
for( j = 0; j < 4; j++ ) |
{ |
float result = 0.0f; |
for( k = 0; k < 4; k++ ) |
result += a[ i * 4 + k] * b[ k * 4 + j ]; |
c1[ i * 4 + j ] = result; |
} |
// The vector version |
printf( "Doing vector matrix multiply...\n" ); |
MyMatrixMultiply( A, B, C2 ); |
// Make sure that the results are correct |
// Allow for some rounding error here |
printf( "Verifying results..." ); |
for( i = 0 ; i < 16; i++ ) |
if( fabs( c1[i] - c2[i] ) > 1e-6 ) |
printf( "failed at %i,%i: %8.17g %8.17g\n", i/4, |
i&3, c1[i], c2[i] ); |
printf( "done.\n" ); |
return 0; |
} |
The 4x4 matrix multiplication algorithm shown in Listing B-2 is a simple matrix multiplication algorithm performed with four columns in parallel. The basic calculation is as follows:
C[i][j] = sum( A[i][k] * B[k][j], k = 0... width of A )
It can be rewritten in mathematical vector notation for rows of C as the following:
C[i][] = sum( A[i][k] * B[k][], k = 0... width of A )
Where:
C[i][]
is the ith row of C
A[i][k]
is the element of A at row i and column k
B[k][]
is the kth row of B
An example calculation for C[0][]
is as follows:
C[0][] = A[0][0] * B[0][] + A[0][1] * B[1][] + A[0][2] * B[2][] + A[0][3] * B[3][]
This calculation is simply a multiplication of a scalar times a vector, followed by addition of similar elements between two vectors, repeated four times, to get a vector that contains four sums of products. Performing the calculation in this way saves you from transposing B to obtain the B columns, and also saves you from adding across vectors, which is inefficient. All operations occur between similar elements of two different vectors.
Listing B-2 shows architecture-independent vector code that performs matrix multiplication. This code compiles as scalar if you do not set up the appropriate compiler flags for PowerPC (-faltivec
) or x86 (-msse
), or if AltiVec is unavailable on the PowerPC. The matrices used in the MyMatrixMultply
function assume the C storage order for 2D arrays, not the FORTRAN storage order.
Listing B-2 Architecture-independent code that performs matrix multiplication
void MyMatrixMultiply( vFloat A[4], vFloat B[4], vFloat C[4] ) |
{ |
vFloat A1 = vLoad( A ); //Row 1 of A |
vFloat A2 = vLoad( A + 1 ); //Row 2 of A |
vFloat A3 = vLoad( A + 2 ); //Row 3 of A |
vFloat A4 = vLoad( A + 3); //Row 4 of A |
vFloat C1 = vZero(); //Row 1 of C, initialized to zero |
vFloat C2 = vZero(); //Row 2 of C, initialized to zero |
vFloat C3 = vZero(); //Row 3 of C, initialized to zero |
vFloat C4 = vZero(); //Row 4 of C, initialized to zero |
vFloat B1 = vLoad( B ); //Row 1 of B |
vFloat B2 = vLoad( B + 1 ); //Row 2 of B |
vFloat B3 = vLoad( B + 2 ); //Row 3 of B |
vFloat B4 = vLoad( B + 3); //Row 4 of B |
//Multiply the first row of B by the first column of A (do not sum across) |
C1 = vMADD( vSplat( A1, 0 ), B1, C1 ); |
C2 = vMADD( vSplat( A2, 0 ), B1, C2 ); |
C3 = vMADD( vSplat( A3, 0 ), B1, C3 ); |
C4 = vMADD( vSplat( A4, 0 ), B1, C4 ); |
// Multiply the second row of B by the second column of A and |
// add to the previous result (do not sum across) |
C1 = vMADD( vSplat( A1, 1 ), B2, C1 ); |
C2 = vMADD( vSplat( A2, 1 ), B2, C2 ); |
C3 = vMADD( vSplat( A3, 1 ), B2, C3 ); |
C4 = vMADD( vSplat( A4, 1 ), B2, C4 ); |
// Multiply the third row of B by the third column of A and |
// add to the previous result (do not sum across) |
C1 = vMADD( vSplat( A1, 2 ), B3, C1 ); |
C2 = vMADD( vSplat( A2, 2 ), B3, C2 ); |
C3 = vMADD( vSplat( A3, 2 ), B3, C3 ); |
C4 = vMADD( vSplat( A4, 2 ), B3, C4 ); |
// Multiply the fourth row of B by the fourth column of A and |
// add to the previous result (do not sum across) |
C1 = vMADD( vSplat( A1, 3 ), B4, C1 ); |
C2 = vMADD( vSplat( A2, 3 ), B4, C2 ); |
C3 = vMADD( vSplat( A3, 3 ), B4, C3 ); |
C4 = vMADD( vSplat( A4, 3 ), B4, C4 ); |
// Write out the result to the destination |
vStore( C1, C ); |
vStore( C2, C + 1 ); |
vStore( C3, C + 2 ); |
vStore( C4, C + 3 ); |
} |
© 2005, 2009 Apple Inc. All Rights Reserved. (Last updated: 2009-02-04)