Marvee Amasi
Marvee Amasi
DIIDevHeads IoT Integration Server
Created by Marvee Amasi on 7/30/2024 in #middleware-and-os
How to optimize SIMD instructions for double precision floating point operations on Intel Core i7
I want to optimize a computationally intensive loop using SIMD instructions on an Intel Core i7-12700K processor with 32 GB of DDR4-3200 memory, to boost the performance of a double-precision floating-point vector addition operation within a larger scientific computation.
section .data
; Input/output buffer for my_function: 64-bit (dq) floating-point values.
; NOTE(review): the "..." in the initializer looks like an elision in the
; listing rather than assembler syntax — the full value list is not shown.
; NOTE(review): no align directive is visible before the label, so 16-byte
; alignment of data_array should not be assumed — confirm before using
; aligned loads/stores on it.
data_array: dq 1.0, 2.0, 3.0, 4.0, ..., 1000000.0 ; Array of 1 million double-precision values

section .text
global my_function

;-----------------------------------------------------------------------
; my_function
; Walks data_array in 32-byte groups of four doubles and, for each
; group, adds the upper pair into the lower pair in place:
;       a[4i]   += a[4i+2]
;       a[4i+1] += a[4i+3]
; The upper pair of each group is left unmodified.
;
; In:    none (operates directly on the data_array symbol)
; Out:   none
; Clobb: rcx, rsi, xmm0, xmm1, flags
; NOTE(review): the target ABI is not stated; rsi is callee-saved under
;               the Microsoft x64 convention — confirm against callers.
;-----------------------------------------------------------------------
my_function:
        mov     ecx, 1000000 / 4        ; 32-byte groups to process (4 doubles each)
        lea     rsi, [rel data_array]   ; rsi = current group; RIP-relative, PIE-safe

.next_group:
        movupd  xmm0, [rsi]             ; doubles 0..1 of the group (no alignment assumed)
        movupd  xmm1, [rsi + 16]        ; doubles 2..3 of the group
        addpd   xmm0, xmm1              ; packed DOUBLE add; addps would add the bits as floats
        movupd  [rsi], xmm0             ; write the sums back over doubles 0..1
        add     rsi, 32                 ; advance to the next group
        dec     ecx                     ; ecx = groups remaining
        jnz     .next_group
        ret
section .data
; Input/output buffer for my_function: 64-bit (dq) floating-point values.
; NOTE(review): the "..." in the initializer looks like an elision in the
; listing rather than assembler syntax — the full value list is not shown.
; NOTE(review): no align directive is visible before the label, so 16-byte
; alignment of data_array should not be assumed — confirm before using
; aligned loads/stores on it.
data_array: dq 1.0, 2.0, 3.0, 4.0, ..., 1000000.0 ; Array of 1 million double-precision values

section .text
global my_function

;-----------------------------------------------------------------------
; my_function
; Walks data_array in 32-byte groups of four doubles and, for each
; group, adds the upper pair into the lower pair in place:
;       a[4i]   += a[4i+2]
;       a[4i+1] += a[4i+3]
; The upper pair of each group is left unmodified.
;
; In:    none (operates directly on the data_array symbol)
; Out:   none
; Clobb: rcx, rsi, xmm0, xmm1, flags
; NOTE(review): the target ABI is not stated; rsi is callee-saved under
;               the Microsoft x64 convention — confirm against callers.
;-----------------------------------------------------------------------
my_function:
        mov     ecx, 1000000 / 4        ; 32-byte groups to process (4 doubles each)
        lea     rsi, [rel data_array]   ; rsi = current group; RIP-relative, PIE-safe

.next_group:
        movupd  xmm0, [rsi]             ; doubles 0..1 of the group (no alignment assumed)
        movupd  xmm1, [rsi + 16]        ; doubles 2..3 of the group
        addpd   xmm0, xmm1              ; packed DOUBLE add; addps would add the bits as floats
        movupd  [rsi], xmm0             ; write the sums back over doubles 0..1
        add     rsi, 32                 ; advance to the next group
        dec     ecx                     ; ecx = groups remaining
        jnz     .next_group
        ret
2 replies