Optimizing for SIMD Floating-point Applications
5
5-25
Example 5-12 Division of Two Pair of Single-precision Complex Number
// Division of (ak + i bk ) / (ck + i dk )
movshdup xmm0, Src1; load imaginary parts into the
; destination, b1, b1, b0, b0
movaps xmm1, src2; load the 2nd pair of complex values,
; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, b1d1, b1c1, b0d0,
; b0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary
; parts, c1, d1, c0, d0
movsldup xmm2, Src1; load the real parts into the
; destination, a1, a1, a0, a0
mulps xmm2, xmm1; temp results, a1c1, a1d1, a0c0, a0d0
addsubps xmm0, xmm2; a1c1+b1d1, b1c1-a1d1, a0c0+b0d0,
; b0c0-a0d0
mulps xmm1, xmm1; c1c1, d1d1, c0c0, d0d0
movps xmm2, xmm1; c1c1, d1d1, c0c0, d0d0
shufps xmm2, xmm2, b1; d1d1, c1c1, d0d0, c0c0
addps xmm2, xmm1; c1c1+d1d1, c1c1+d1d1, c0c0+d0d0,
; c0c0+d0d0
divps xmm0, xmm2
shufps xmm0, xmm0, b1 ; (b1c1-a1d1)/(c1c1+d1d1),
; (a1c1+b1d1)/(c1c1+d1d1),
; (b0c0-a0d0)/( c0c0+d0d0),
; (a0c0+b0d0)/( c0c0+d0d0)
Summary of Contents for ARCHITECTURE IA-32
Page 1: ...IA 32 Intel Architecture Optimization Reference Manual Order Number 248966 013US April 2006...
Page 220: ...IA 32 Intel Architecture Optimization 3 40...
Page 434: ...IA 32 Intel Architecture Optimization 9 20...
Page 514: ...IA 32 Intel Architecture Optimization B 60...
Page 536: ...IA 32 Intel Architecture Optimization C 22...