Accelerating Compute Intensive Functions Using C by Joe Hanson Listing One void fir(short *X, short *H, short *Y, int N, int T) { int n, t, acc; short *x, *h; /* Filter Input */ for (n = 0; n < N; n++) { x = X; h = H; acc = (*x--) * (*h++); for(t = 1; t < T; t++) { acc += (*x--) * (*h++); } *Y = acc >> 14; X++; Y++; } } Listing Two #include static se_sint<32> acc; /* Performs 8 parallel MAC */ SE_FUNC void firFunc(SE_INST FIR_MUL, SE_INST FIR_MAC, WR X, WR H, WR *Y) { se_sint<16> x, h se_sint<32> sum ; int i ; sum = 0; for(i = 0; i < 128; i += 16) { h = H(i + 15, i); x = X(127-i, 112-i); sum += x * h ; } acc = FIR_MUL ? sum : se_sint<32>(sum + acc) ; *Y = acc >> 14 ; } Listing Three #include "fir8.h" #define ST_DECR 1 #define ST_INCR 0 void fir(short *X, short *H, short *Y, short N, short T) { int n, t, t8; WR x, h, y; t8 = T/8; WRPUTINIT(ST_INCR, Y) ; for (n = 0; n < N; n++) { WRGET0INIT(ST_INCR, H) ; X++ ; WRGET1INIT(ST_DECR, X) ; WRGET0I( &h, 16 ); WRGET1I( &x, 16); FIR_MUL(x, h, &y); for (t = 1; t < t8; t++) { WRGET0I(&h, 16); WRGET1I(&x, 16); FIR_MAC(x, h, &y); } WRPUTI(y, 2) ; } WRPUTFLUSH0() ; WRPUTFLUSH1() ; } Listing Four /* Include the Stretch Instruction Specific Header */ #include "fir8.h" #define ST_DECR 1 /* Decrement Indicator */ #define ST_INCR 0 /* Increment Indicator */ /* define macro for the FIR ISEF instruction invocations */ #define FIR(H, X, h, x, t8, y) \ { \ int t8m1 = (t8)-1; \ WRGET0INIT(ST_INCR, (H)) ; \ (X)++ ; \ WRGET1INIT(ST_DECR, (X)) ; \ WRGET0I( &(h), 8 * sizeof(short) ); \ WRGET1I( &(x), 8 * sizeof(short) ); \ FIR_MUL( (x), (h), &(y) ); \ \ for (t = 1; t < (t8m1); t++) \ { \ WRGET0I( &(h), 16 ); \ WRGET1I( &(x), 16 ); \ FIR_MAC( (x), (h), &(y) ); \ } \ WRGET0I( &(h), 16 ); \ WRGET1I( &(x), 16 ); \ FIR_MAC( (x), (h), &(y) ); \ } /* * - FIR using 8 multipliers in ISEF * - Loop optimized */ void fir(short *X, short *H, short *Y, short N, short T) { int n, t, t8 ; WR x, h, y1, y2, y3, y4; t8 = T/8 ; WRPUTINIT(ST_INCR, Y) ; /* init output stream */ FIR (H, X, h, x, t8, y1) ; /* x * h + y => y1 */ /* loop ((N/2)-1) times */ n = 0; do { FIR (H, X, h, x, t8, y2) ; /* x * h + y => y2 */ WRPUTI(y1, 2) ; /* put (y1) result */ FIR (H, X, h, x, t8, y1) ; /* x * h + y => y1 */ WRPUTI(y2, 2) ; /* put (y2) result */ } while ( ++n < ((N>>1)-1) ); FIR (H, X, h, x, t8, y2) ; /* x * h + y => y2 */ WRPUTI(y1, 2) ; /* put (y1) result */ WRPUTI(y2, 2) ; /* put (y2) result */ WRPUTFLUSH0() ; /* flush output stream */ WRPUTFLUSH1() ; /* flush output stream */ } Listing Five /* Include the Stretch Instruction Specific Header */ #include "fir8.h" #define ST_DECR 1 /* Decrement Indicator */ #define ST_INCR 0 /* Increment Indicator */ #define FIR(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, X) \ { \ WRGET0I( &(h1), 8 * sizeof(short) ); \ WRGET1I( &(x1), 16 ); \ X++ ; \ WRGET0I( &(h2), 16 ); \ WRGET1I( &(x2), 16 ); \ FIR_MUL( (x1), (h1), &(y1) ); \ \ WRGET0I( &(h3), 16 ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x2), (h2), &(y1) ); \ WRGET0I( &(h4), 16 ); \ WRGET1I( &(x2), 16 ); \ FIR_MAC( (x1), (h3), &(y1) ); \ WRGET0I( &(h5), 16 ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x2), (h4), &(y1) ); \ WRGET0I( &(h6), 16 ); \ WRGET1I( &(x2), 16 ); \ FIR_MAC( (x1), (h5), &(y1) ); \ WRGET0I( &(h7), 16 ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x2), (h6), &(y1) ); \ WRGET0I( &(h8), 16 ); \ WRGET1I( &(x2), 16 ); \ FIR_MAC( (x1), (h7), &(y1) ); \ WRGET1INIT(ST_DECR, X); \ FIR_MAC( (x2), (h8), &(y1) ); \ } #define FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) \ { \ WRGET1I( &(x1), 16 ); \ FIR_MUL( (x1), (h1), &(y2) ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x1), (h2), &(y2) ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x1), (h3), &(y2) ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x1), (h4), &(y2) ); \ WRGET1I( &(x1), 16 ); \ X++ ; \ FIR_MAC( (x1), (h5), &(y2) ); \ WRGET1I( &(x1), 16 ); \ WRGET1I( &(x2), 16 ); \ FIR_MAC( (x1), (h6), &(y2) ); \ WRGET1I( &(x1), 16 ); \ WRGET1INIT0(ST_DECR, X); \ FIR_MAC( (x2), (h7), &(y2) ); \ WRGET1INIT1(); \ WRPUTI(y1, 2); \ FIR_MAC( (x1), (h8), &(y2) ); \ } #define FIR2(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) \ { \ WRGET1I( &(x1), 16 ); \ FIR_MUL( (x1), (h1), &(y1) ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x1), (h2), &(y1) ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x1), (h3), &(y1) ); \ WRGET1I( &(x1), 16 ); \ FIR_MAC( (x1), (h4), &(y1) ); \ WRGET1I( &(x1), 16 ); \ X++ ; \ FIR_MAC( (x1), (h5), &(y1) ); \ WRGET1I( &(x1), 16 ); \ WRGET1I( &(x2), 16 ); \ FIR_MAC( (x1), (h6), &(y1) ); \ WRGET1I( &(x1), 16 ); \ WRGET1INIT0(ST_DECR, X) ; \ FIR_MAC( (x2), (h7), &(y1) ); \ WRGET1INIT1(); \ WRPUTI(y2, 2); \ FIR_MAC( (x1), (h8), &(y1) ); \ } /* * - FIR using 8 multipliers in ISEF * - Loop optimized / Hand unrolled */ void fir(short *X, short *H, short *Y, short N, short T) { int n, t, t8 ; WR h1, h2, h3, h4, h5, h6, h7, h8 ; WR x1, x2; WR y1; WR y2; // (these alternative "register" declarations make no difference:) // register WR y1 SE_REG("wra1") ; // register WR y2 SE_REG("wra2") ; WRPUTINIT(ST_INCR, Y); /* init output stream */ WRGET0INIT(ST_INCR, H); /* init coefficient stream */ X++ ; WRGET1INIT(ST_DECR, X); /* init input stream */ /* compute Y[0] in y1 */ FIR(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, X) ; /* loop ((N/2)-1) times */ for (n = 0; n < ((N>>1)-1); n++) { /* FIR1 writes previous output (y1) and computes current output (y2) */ FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) ; /* FIR1 writes previous output (y2) and computes current output (y1) */ FIR2(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) ; } /* compute Y[N-1] in y2 and write Y[N-2] from y1 */ FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) ; WRPUTI(y2, 2) ; /* write U[N-1] */ WRPUTFLUSH0() ; /* flush output stream */ WRPUTFLUSH1() ; /* flush output stream */ } 1