Home | CV | Publications | PhD | PostDoc | SIMD |
Android | Glass | Astronomy | Retro | Games | Lego |
This page contains supplemental material for the following book (since Intel Press is no longer in operation, I am making the vectorization book available in Kindle Edition on Amazon).
/* Code Samples from the Software Vectorization Handbook * By Aart J.C. Bik, (C) 2004. * https://www.aartbik.com/ * * This software is *not* in the public domain. However, it is freely available * without any fee for education, research, and non-profit purposes. */ // shorthand type definitions typedef unsigned char u8; typedef signed char s8; typedef unsigned short u16; typedef signed short s16; typedef unsigned int u32; typedef signed int s32; typedef unsigned long long u64; typedef signed long long s64; typedef float f32; typedef double f64;
// saturation, page 14 #define N 128 static u16 x[N], y[N], z[N]; void sat(int n) { int i; for (i = 0; i < n; i++) { int t = x[i] + y[i]; z[i] = (t <= 65535) ? t : 65535; } }
// floating-point precision, pages 75-76 #define N 128 float a[N]; void addSP() { int i; for (i = 0; i < 128; i++) { a[i] = a[i] + 3.14159f; } } void addDP() { int i; for (i = 0; i < 128; i++) { a[i] = a[i] + 3.14159; } }
// framework, page 78 #define N 200 u8 x[N]; void setx(s32 n) { s32 i; for (i = 0; i < n; i++) x[i] = 0; } void setx_where_n_is_16() { s32 i; for (i = 0; i < 16; i++) x[i] = 0; }
// mixed-type loop, page 79 #define N 200 s8 x[N]; f64 a[N], b[N]; void mixed() { int i; for (i = 0; i < N; i++) { a[i] += 4.0; x[i] &= 0x0f; b[i] *= 2.0; } }
// aligned and unaligned access patterns, page 81 u16 x[65]; void alignment() { int i; for (i = 0; i < 64; i++) { x[i] = x[i+1]; } }
// rotating read-only memory references, pages 81-82 u16 a[64]; static const u16 primes[] = { 2, 3, 5, 7, 11, 13, 17, 19 }; void rotate1() { int i; for (i = 0; i < 64; i++) { a[i] = primes[ (i+2) % 4 ]; } } void rotate2() { int i; for (i = 0; i < 64; i++) { a[i] = primes[ i % 8 ]; } }
// induction (T = u32, u8), pages 86-87 #define N 128 #define T u32 T v[N]; void induction(int n) { int i; u8 x = 253; for (i = 0; i < N; i++) { v[i] = (T) x; x++; } }
// wide induction variable, page 87 s64 w[100]; void wide_induction(s64 x) { int i; for (i = 0; i < 100; i++) { w[i] = x; x += 4; } }
// floating-point induction, page 88 f64 a[100], b; void inducDP(int n) { int i; b = 3.0; for (i = 0; i < 100; i++) { a[i] = b; b += 1.5; } }
// mixed-type induction, page 89 f32 c[100]; void mixed_induction() { int i; s32 g = 0x7fffff00; for (i = 0; i < 100; i++) { c[i] = (f32) g; g += 1; } }
// direct floating-point induction, page 89 f64 a[100]; void direct_inducFP() { int i; for (i = 0; i < 100; i++) { a[i] = (f64) i; } }
// bitwise-AND reduction, page 92 u32 msk[256]; u32 x; void mask() { int i; for (i = 0; i < 256; i++) { x &= msk[i]; } }
// scalar variable, pages 94-95 s32 x[64], y[64], z[64], s; void scalar() { int i; for (i = 0; i < 64; i++) { s = x[i]; y[i] = s - 1; z[i] = s + 17; } }
// multiplication, page 98 s16 u[256], v[256], w[256]; void mult() { int i; for (i = 0; i < 256; i++) { u[i] = v[i] * w[i]; } }
// division by 32, page 98 u64 x[256]; void div32() { int i; for (i = 0; i < 256; i++) { x[i] /= 32; } }
// min recognition, page 101 u8 u[256]; u8 min() { int i; u8 x = 0xff; for (i = 0; i < 256; i++) { if (u[i] < x) x = u[i]; } return x; }
// type conversions, page 102 s32 u[256], v[256]; u32 w[256]; void shifts() { int i; for (i = 0; i < 256; i++) { u[i] = (v[i] >> 2) + (w[i] >> 2); } }
// contrived type conversions that should vectorize, page 103 #define N 128 u8 x[N], y[N]; void contrived() { int i; for (i = 0; i < N; i++) { x[i] = ((u8) ((s32)((s8)((s32) y[i]))) ); } }
// type conversion, page 103 s16 x[128]; void conversion() { int i; for (i = 0; i < 128; i++) { x[i] = ((u8) x[i]); } }
// floating-point type conversion, page 104 f32 a[128]; f64 b[128]; f64 c; void conversionFP() { int i; for (i = 0; i < 128; i++) { a[i] += b[i] * c; } }
// float2double accumulation, page 105 f32 a[128]; f64 accumulate(void) { int i; f64 acc = 0; for (i = 0; i < 128; i++) { acc += a[i]; } return acc; }
// mixed-type conversion, page 106 f32 a[100], b[100]; s32 x[100]; void mixed_mult(int p) { int i; for (i = 0; i < 100; i++) { a[i] = b[i] * x[i]; } }
// math vectorization with SVML, page 108 f64 a[256], b[256], c[256]; void svml() { int i; for (i = 0; i < 256; i++) { a[i] = pow(a[i], b[i]) + cos(c[i]); } }
// math vectorization with SVML, page 108 f32 a[256], b[256]; void sin_and_add() { int i; for (i = 0; i < 256; i++) { a[i] = sin( b[i] ) + 1; } }
// bit-mask vectorization, page 111 // (pragma may be required to override efficiency heuristics) #define N 100 s32 x[N], y[N]; s32 z[N]; void masked_if1() { int i; #pragma vector always for (i = 0; i < N; i++) { if (x[i] == y[i]) z[i] = 0; } }
// bit-mask vectorization, page 113 s16 u[800]; void masked_if2() { int i; for (i = 0; i < 800; i++) { if (u[i] < 10) u[i] = 0; else u[i] = 1; } }
// guarded error conditions, page 114 #define N1 99 #define N2 100 #define N 300 s32 x[N1], y[N2], *p = NULL; void guarded_error(int flag) { int i; if (flag) { p = malloc(sizeof(s32) * N); } for (i = 0; i < N; i++) { if (i < N1) x[i] = 0; if (i < N2) y[i] = 0; if (flag) p[i] = 0; } }
// conditional scalar assignment, page 115 f64 a[128]; f64 acc = 0; void cond_scalar() { int i; acc = 0; for (i = 0; i < 128; i++) { if (a[i] < -1) { f64 t = a[i] + 1; acc += t * t - t; } } }
// 2-way conditional, page 116 f64 a[128], b[128]; void twoway_cond() { int i; for (i = 0; i < 128; i++) { f64 t; if (a[i] < 0) { t = -1.0; } else { t = 1.0; } b[i] = t; } }
// performance example, page 117 #define N 512 f32 a[N], b[N], c[N]; void divabc(s32 n) { s32 i; #pragma vector always for (i = 0; i < n; i++) { if (a[i] > 0) { a[i] = b[i] / c[i]; } } }
// cache line split optimizations, page 123 f32 a[40], b[41], c[42], d[43]; void cache_line_split() { int i; for (i = 0; i < 40; i++) { a[i] = b[i+1] * c[i+2] * d[i+3]; } }
// cache line split performance, page 126 #define N 256 s32 x[N], y[N+1]; void alignment_perf(int n) { int i; for (i = 0; i < n; i++) { x[i] = (x[i] + y[i+1]) >> 1; } }
// DDOT and SDOT, pages 136-137 f64 ddot(f64 *dx, f64 *dy, s32 n) { s32 i; f64 d = 0; for (i = 0; i < n; i++) { d += dx[i] * dy[i]; } return d; } f32 sdot(f32 *sx, f32 *sy, s32 n) { s32 i; f32 s = 0; for (i = 0; i < n; i++) { s += sx[i] * sy[i]; } return s; }
// DAXPY and SAXPY, page 142 void daxpy(f64 *dx, f64 *dy, f64 s, s32 n) { s32 i; for (i = 0; i < n; i++) { dy[i] += s * dx[i]; } } void saxpy(f32 *sx, f32 *sy, f32 s, s32 n) { s32 i; for (i = 0; i < n; i++) { sy[i] += s * sx[i]; } }
// conversion idiom, page 146 #define N 100 u8 x[N]; s16 y[N]; void conversion_idiom1(int n) { int i; for (i = 0; i < n; i++) { y[i] += x[i]; } }
// conversion idiom, page 147 #define N 100 f32 a[N]; s8 z[N]; void conversion_idiom2(int n) { int i; for (i = 0; i < n; i++) { a[i] = z[i]; } }
// arithmetic idiom, page 147 #define N 100 s16 x[N], y[N], z[N]; void arith_idiom1(int n) { int i; for (i = 0; i < n; i++) { x[i] = (y[i] * z[i]) >> 18; } }
// arithmetic idiom, page 148 s16 u[256]; s32 v[256]; void arith_idiom2(int n) { int i; for (i = 0; i < n; i++) { v[i] = u[i] * u[i]; } }
// arithmetic idiom, page 149 #define N 100 u8 x[N], y[N]; void arith_idiom3(int n) { int i; for (i = 0; i < n; i++) { x[i] = (x[i] + y[i] + 1) >> 1; } }
// reduction idiom, pages 149-150 #define N 100 u8 x[N], y[N]; s32 red; void reduction_idiom(int n) { int i; red = 0; for (i = 0; i < n; i++) { s32 temp = x[i] - y[i]; if (temp < 0) temp = -temp; red += temp; } }
// saturation idiom, pages 151 and 156 #define N 100 u16 x[N], y[N], z[N]; void sat_idiom1(int n) { int i; for (i = 0; i < n; i++) { s32 t = x[i] + y[i]; z[i] = (t <= 65535) ? t : 65535; } }
// saturation idiom, page 156 u8 x[256]; s16 y[256]; void sat_idiom2(int n) { int i; for (i = 0; i < 256; i++) { s32 t = y[i]; if (t < 0) t = 0; if (t > 255) t = 255; x[i] = t; } }
// saturation idiom, page 157 #define N 100 u8 u[N], v[N]; void sat_idiom3(int n) { int i; for (i = 0; i < n; i++) { s32 x = (u[i] < 200) ? u[i]+55:255; if (x > v[i]) v[i] = x; } }
// gzip flavored loop, page 159 #define N 256 u16 head[N]; void sat_idiom4(int n) { int i; for (i = 0; i < n; i++) { u32 m = head[i]; head[i] = (m >= 32768) ? m-32768 : 0; } }
// search loop, page 161 #define N 32 float a[N+1]; int search(float x) { int i; for (i = 0; i < N; i++) { if (a[i] == x) { return i; } } return -1; /* not found */ }
// loop materialization, page 177 f32 a[4]; f32 b[4]; f32 c[4]; void ordered() { a[0] = b[0] * c[0]; a[1] = b[1] * c[1]; a[2] = b[2] * c[2]; a[3] = b[3] * c[3]; } void unordered() { a[0] = b[0] * c[0]; a[3] = b[3] * c[3]; a[2] = b[2] * c[2]; a[1] = b[1] * c[1]; }
// loop materialization and collapsing, page 181 struct { f32 x, y; } a[100]; void reset_structs() { int i; for (i = 0; i < 100; i++) { a[i].x = 0; a[i].y = 0; } }
// loop materialization on local variables, page 183 s32 z[4]; void f(void) { s32 i, j, k, l; z[0] = i+8; z[2] = j+4; z[1] = k+9; z[3] = l+2; }
// loop materialization and collapsing, page 184 typedef struct { f32 x, y; } pair; typedef struct { pair e[2][2]; } matrix; void add(matrix *a, f32 s) { s32 i, j; for (i=0;i<2;i++) { for (j=0;j<2;j++){ a->e[i][j].x += s; a->e[i][j].y += s; } } }
// loop materialization and collapsing, page 189 void m4(f32 *dest, f32 *src, s32 n) { s32 i; for (i = 0; i < n; i++) { *(dest) = 0.5f * *(src); *(dest+1) = 0.2f * *(src+1); *(dest+2) = 0.1f * *(src+2); *(dest+3) = 0.4f * *(src+3); src += 4; dest += 4; } }
// This example was used at page 195 to demonstrate compiler diagnostics. // Make sure comments match the line numbers in the file. /* 01 */ #define N 32 /* 02 */ float a[N], b[N], c[N], d[N]; /* 03 */ /* 04 */ doit() { /* 05 */ int i; /* 06 */ for (i = 0; i < N/2; i++) a[i] = i; /* 07 */ for (i = 1; i < N; i++) { /* 08 */ b[i] = b[i-1] + 2; /* data dependence cycle */ /* 09 */ c[i] = 1; /* 10 */ } /* 11 */ d[0] = 10; /* 12 */ d[1] = 10; /* 13 */ d[2] = 10; /* 14 */ d[3] = 10; /* 15 */}