I didn't have time to do everything that I wanted, but I did verify that, for 64-bit code compiled with compiler Version 1.05.0 (01-31-2016) built for win64 (64-bit), naked functions conform to the 64-bit calling convention.
Edit: Added code to do RIP-relative access to shared variables.
Edit2: Sorry, the above was a last-minute change, and the shared variables are actually accessed as direct memory operands, where the address of the variable is encoded into the accessing instruction as an absolute displacement. There are examples of RIP-relative addressing elsewhere in the assembly output of the compiler. IIRC, RIP-relative addressing is preferred because its encoding is smaller.
Code: Select all
''----------------------------------------------------------------------
'' The first four integer/floating-point arguments, taken in left to
'' right order*, should be passed in RCX/XMM0L**, RDX/XMM1L**,
'' R8/XMM2L**, and R9/XMM3L**, with any further arguments, taken in
'' right to left order*, passed on the stack.
''
'' * As they are listed in the function definition or prototype.
''
'' ** The choice of register is determined by the operand type, with
'' the register that does not match the type ignored.
''
'' Scalar values that fit in 64 bits are returned in RAX.
''
'' Floating-point values are returned in XMM0.
''
''----------------------------------------------------------------------
''----------------------------------------------------------------------
'' On entry to our functions, the stack layout is:
'' rsp+48 arg6
'' rsp+40 arg5
'' rsp+32 arg4 spill
'' rsp+24 arg3 spill
'' rsp+16 arg2 spill
'' rsp+8 arg1 spill
'' rsp return address
''----------------------------------------------------------------------
'' Test1: returns the sum of its six integer arguments.
'' Win64 ABI: arg1..arg4 arrive in rcx, rdx, r8, r9; arg5 and arg6 are
'' passed on the stack above the 32-byte shadow space, so with only the
'' return address pushed they sit at [rsp+40] and [rsp+48] (see the
'' stack-layout comment above).
'' naked: the compiler emits no prologue/epilogue, so the asm body is
'' responsible for conforming to the calling convention itself.
function Test1 naked ( arg1 as integer, _
arg2 as integer, _
arg3 as integer, _
arg4 as integer, _
arg5 as integer, _
arg6 as integer ) as integer
asm
xor rax, rax        '' rax = 0 (accumulator; rax is also the integer return register)
add rax, rcx        '' + arg1
add rax, rdx        '' + arg2
add rax, r8         '' + arg3
add rax, r9         '' + arg4
add rax, [rsp+40]   '' + arg5 (first stack argument, above the shadow space)
add rax, [rsp+48]   '' + arg6
ret                 '' scalar result returned in rax
end asm
end function
''----------------------------------------------------------------------
'' Test2: returns the sum of its six double arguments.
'' Win64 ABI: arg1..arg4 arrive in xmm0..xmm3 (slot-positional); arg5
'' and arg6 are passed on the stack at [rsp+40] and [rsp+48], exactly
'' as in the integer case. Floating-point result is returned in xmm0,
'' so accumulating into xmm0 doubles as setting the return value.
function Test2 naked ( arg1 as double, _
arg2 as double, _
arg3 as double, _
arg4 as double, _
arg5 as double, _
arg6 as double ) as double
asm
addsd xmm0, xmm1        '' arg1 + arg2
addsd xmm0, xmm2        '' + arg3
addsd xmm0, xmm3        '' + arg4
addsd xmm0, [rsp+40]    '' + arg5 (first stack argument)
addsd xmm0, [rsp+48]    '' + arg6
ret                     '' result returned in xmm0
end asm
end function
''----------------------------------------------------------------------
'' Module-level shared variables summed by Test3 below (1 + 2 + 3 = 6).
dim shared as integer a = 1, b = 2, c = 3
'' Test3: returns a + b + c, demonstrating access to shared variables
'' from a naked function's inline asm.
'' NOTE(review): a/b/c are referenced as bare-symbol memory operands, so
'' the assembler encodes absolute displacements rather than the smaller
'' RIP-relative form (see "add rax, A$" in the compiler output below) —
'' the mangled names (A$) would make hand-written RIP-relative operands
'' fragile here; confirm before changing.
function Test3 naked ( ) as integer
asm
xor rax, rax    '' result = 0
add rax, a      '' + a (memory operand: loads the variable's value, not its address)
add rax, b      '' + b
add rax, c      '' + c
ret             '' return in rax
end asm
end function
''----------------------------------------------------------------------
'' Driver: exercise all three naked functions.
'' Expected output: 21 (Test1), 21 (Test2), 6 (Test3).
print Test1(1,2,3,4,5,6)
print Test2(1,2,3,4,5,6)
print Test3()
sleep   '' wait for a keypress so the console window stays open
Code: Select all
# Compiler-generated GAS output (Intel syntax) for the program above.
.file "Test.c"
.intel_syntax noprefix
# --- Initialized shared variables; FreeBASIC mangles a/b/c to A$/B$/C$ ---
.data
.align 8
A$:
.quad 1
.align 8
B$:
.quad 2
.align 8
C$:
.quad 3
# /APP .. /NO_APP bracket the user's inline (naked) asm, passed through verbatim.
/APP
.text
.globl TEST1
# TEST1: integer sum — rcx/rdx/r8/r9 plus two stack args, result in rax.
TEST1:
xor rax, rax
add rax, rcx
add rax, rdx
add rax, r8
add rax, r9
add rax, [rsp+40]   # arg5: return address (8) + shadow space (32) above rsp
add rax, [rsp+48]   # arg6
ret
.text
.globl TEST2
# TEST2: double sum — xmm0..xmm3 plus two stack args, result in xmm0.
TEST2:
addsd xmm0, xmm1
addsd xmm0, xmm2
addsd xmm0, xmm3
addsd xmm0, [rsp+40]
addsd xmm0, [rsp+48]
ret
.text
.globl TEST3
# TEST3: sums the shared variables. The bare-symbol operands are encoded
# as absolute displacements, not the smaller RIP-relative form.
TEST3:
xor rax, rax
add rax, A$
add rax, B$
add rax, C$
ret
.def __main; .scl 2; .type 32; .endef
/NO_APP
# --- main: compiler-generated driver (unoptimized, frame pointer kept) ---
.text
.globl main
.def main; .scl 2; .type 32; .endef
main:
push rbp
mov rbp, rsp
sub rsp, 80                     # locals + 32-byte shadow space; rsp stays 16-aligned
mov DWORD PTR 16[rbp], ecx      # spill first arg (argc) to home slot
mov QWORD PTR 24[rbp], rdx      # spill second arg (argv) to home slot
call __main
mov DWORD PTR -28[rbp], 0       # main's return value = 0
mov rax, QWORD PTR 24[rbp]
mov r8d, 0
mov rdx, rax
mov ecx, DWORD PTR 16[rbp]
call fb_Init                    # FreeBASIC runtime startup
.L2:
# Test1(1,2,3,4,5,6): args 5 and 6 stored above the 32-byte shadow space.
mov QWORD PTR 40[rsp], 6
mov QWORD PTR 32[rsp], 5
mov r9d, 4
mov r8d, 3
mov edx, 2
mov ecx, 1
call TEST1
mov QWORD PTR -8[rbp], rax
mov rax, QWORD PTR -8[rbp]
mov r8d, 1
mov rdx, rax
mov ecx, 0
call fb_PrintLongint
# Test2(1.0 .. 6.0): double constants loaded RIP-relative from .rdata.
movsd xmm3, QWORD PTR .LC0[rip]   # 4.0 -> arg4
movsd xmm2, QWORD PTR .LC1[rip]   # 3.0 -> arg3
movsd xmm1, QWORD PTR .LC2[rip]   # 2.0 -> arg2
movsd xmm0, QWORD PTR .LC3[rip]   # 6.0 -> stack arg6
movsd QWORD PTR 40[rsp], xmm0
movsd xmm0, QWORD PTR .LC4[rip]   # 5.0 -> stack arg5
movsd QWORD PTR 32[rsp], xmm0
movsd xmm0, QWORD PTR .LC5[rip]   # 1.0 -> arg1
call TEST2
movq rax, xmm0
mov QWORD PTR -16[rbp], rax
movsd xmm0, QWORD PTR -16[rbp]
mov r8d, 1
movapd xmm1, xmm0
mov ecx, 0
call fb_PrintDouble
call TEST3
mov QWORD PTR -24[rbp], rax
mov rax, QWORD PTR -24[rbp]
mov r8d, 1
mov rdx, rax
mov ecx, 0
call fb_PrintLongint
mov ecx, -1                     # SLEEP: -1 = wait indefinitely for a key
call fb_Sleep
.L3:
mov ecx, 0
call fb_End                     # runtime shutdown
mov eax, DWORD PTR -28[rbp]
leave
ret
# --- IEEE-754 double constants (low dword first, then high dword) ---
.section .rdata,"dr"
.align 8
.LC0:                           # 0x40100000_00000000 = 4.0
.long 0
.long 1074790400
.align 8
.LC1:                           # 0x40080000_00000000 = 3.0
.long 0
.long 1074266112
.align 8
.LC2:                           # 0x40000000_00000000 = 2.0
.long 0
.long 1073741824
.align 8
.LC3:                           # 0x40180000_00000000 = 6.0
.long 0
.long 1075314688
.align 8
.LC4:                           # 0x40140000_00000000 = 5.0
.long 0
.long 1075052544
.align 8
.LC5:                           # 0x3FF00000_00000000 = 1.0
.long 0
.long 1072693248
.ident "GCC: (x86_64-win32-sjlj-rev0, Built by MinGW-W64 project) 5.2.0"
.def fb_Init; .scl 2; .type 32; .endef
.def TEST1; .scl 2; .type 32; .endef
.def fb_PrintLongint; .scl 2; .type 32; .endef
.def TEST2; .scl 2; .type 32; .endef
.def fb_PrintDouble; .scl 2; .type 32; .endef
.def TEST3; .scl 2; .type 32; .endef
.def fb_Sleep; .scl 2; .type 32; .endef
.def fb_End; .scl 2; .type 32; .endef
Regarding the problem with code that runs OK with no compiler optimization but fails with optimization: in my experience the cause is usually a failure to follow the calling convention. For example, I recently created a set of 64-bit clock-cycle-count macros for GCC that use inline assembly. As is the norm for cycle-count code, the macros use CPUID as a "serializing" instruction. One unfortunate side effect of CPUID is that it modifies the EBX component of the callee-save register RBX. Since preserving RBX around the CPUID instruction would place a POP RBX instruction after the CPUID instruction, "polluting" the cycle count somewhat, I avoided preserving RBX. The code worked fine with no compiler optimizations, but with any level of optimization it would trigger exceptions, apparently because the optimized code depended on RBX being preserved, as per the calling convention. While compiling with no optimization would correct the immediate problem, it is not very practical, because code compiled with no optimization is effectively optimized for debugging and generally executes much, much slower than optimized code.
There is a Microsoft calling-convention reference here, and a more compact one here.