M
Modular3mo ago
Mihai

Simd size greater than max supported by processor

What happens if I create a SIMD value with a large size — something like 2^10 or larger? Will there be multiple instructions with the max size permitted by the processor? Wondering what the best practice is when working with large arrays but still wanting SIMD behaviour.
3 Replies
Helehex
Helehex3mo ago
It will be very slow. The general practice when you want something like a SIMD, but with arbitrary size, is to make something like a heap-allocated struct Vector[type: DType] with arithmetic semantics. You can use vectorize to implement the operations.
vectorize | Modular Docs
vectorize[func, simd_width: Int, /, *, unroll_factor: Int = 1](size: Int)
Helehex
Helehex3mo ago
Or, you can use one of the libraries that already implements a similar type.
Darkmatter
Darkmatter3mo ago
fn sum[T: DType, width: Int](a: SIMD[T, width], b: SIMD[T, width]) -> SIMD[T, width]:
    return a + b
If I take the following function and compile it with T = DType.float32 and width = 256, I get the following assembly on my AVX-512 processor:
; Function Preamble
pushq %rbp
movq %rsp, %rbp
andq $-64, %rsp
subq $64, %rsp
; Move stack-allocated values into registers
vmovaps 464(%rbp), %zmm8
vmovaps 400(%rbp), %zmm9
vmovaps 336(%rbp), %zmm10
vmovaps 272(%rbp), %zmm11
vmovaps 208(%rbp), %zmm12
vmovaps 144(%rbp), %zmm13
vmovaps 80(%rbp), %zmm14
vmovaps 16(%rbp), %zmm15
; Do the adds
vaddps 528(%rbp), %zmm0, %zmm0
vaddps 592(%rbp), %zmm1, %zmm1
vaddps 656(%rbp), %zmm2, %zmm2
vaddps 720(%rbp), %zmm3, %zmm3
vaddps 784(%rbp), %zmm4, %zmm4
vaddps 848(%rbp), %zmm5, %zmm5
vaddps 912(%rbp), %zmm6, %zmm6
vaddps 976(%rbp), %zmm7, %zmm7
; This instruction is pipelined alongside the others, so it can happen whenever during the adds
movq %rdi, %rax
vaddps 1488(%rbp), %zmm8, %zmm8
vaddps 1040(%rbp), %zmm15, %zmm15
vaddps 1104(%rbp), %zmm14, %zmm14
vaddps 1168(%rbp), %zmm13, %zmm13
vaddps 1232(%rbp), %zmm12, %zmm12
vaddps 1296(%rbp), %zmm11, %zmm11
vaddps 1360(%rbp), %zmm10, %zmm10
vaddps 1424(%rbp), %zmm9, %zmm9
; Return the required values to the stack (thanks calling convention)
vmovaps %zmm8, 960(%rdi)
vmovaps %zmm9, 896(%rdi)
vmovaps %zmm10, 832(%rdi)
vmovaps %zmm11, 768(%rdi)
vmovaps %zmm12, 704(%rdi)
vmovaps %zmm13, 640(%rdi)
vmovaps %zmm14, 576(%rdi)
vmovaps %zmm15, 512(%rdi)
vmovaps %zmm7, 448(%rdi)
vmovaps %zmm6, 384(%rdi)
vmovaps %zmm5, 320(%rdi)
vmovaps %zmm4, 256(%rdi)
vmovaps %zmm3, 192(%rdi)
vmovaps %zmm2, 128(%rdi)
vmovaps %zmm1, 64(%rdi)
vmovaps %zmm0, (%rdi)
; Function exit code
movq %rbp, %rsp
popq %rbp
vzeroupper
retq
I'm actually not sure why this is using the stack, I'm going to start up a discussion in #performance-and-benchmarks because that feels like a bug.
Want results from more Discord servers?
Add your server