Page under construction
General statements:
Compiler: GNU GCC 8.1
Flags: -O1 --save-temps and -funroll-loops for FORTRAN
The C++ Point2D Type is simply a struct inherit from a SIMDvalarray to use the benefits of SSE, AVX.
One can not use something like a Point2D type in FORTRAN. One can program it, but not use. Because it is not possible to inline small function from other modules in FORTRAN. To pay a function call every time is not an option. Hence I use a variable size array for the two dimensions of Point2D and give the dimension size explicit to the compiler everytime a point2D is used. With this it is possible to use loop unrolling, since the loop size is now know at compile time. This results in a fair comparision of the assembler code to the C++ version.
Yes, using optimization level 2 can make some code better. But not in this trivial examples. No, level 3 dosen't give any better results for this examples.
Element Center
C++
as member function
"mesh" and "IE" are const and can't be modified.
auto Mesh::elementCenter(const int IE) const { return (XY[INE[IE][0]] + XY[INE[IE][1]] + XY[INE[IE][2]]) / 3; }
Julia
as free function
constness? performance?
Pay attention to the dot by the division.
function elementCenter(mesh::Mesh, IE::Int32) Elem = mesh.INE[:,IE] tmp = (mesh.XY[Elem[1]] + mesh.XY[Elem[2]] + mesh.XY[Elem[3]]) ./ 3.0 return tmp end
FORTRAN
as member function (and with a few small changes as free function)
"this" the mesh and "IE" are const and can't be modified.
Don't use sum() for performance reasons.
The first dimension of array XY and INE is not known at compile time
function elementCenter(this, IE) result(center) implicit none class(Mesh_t), intent(in) :: this integer, intent(in) :: IE real(8) :: center(2) center = (this%XY(1:2,this%INE(1,IE)) + this%XY(1:2,this%INE(2,IE)) + this%XY(1:2,this%INE(3,IE))) / 3 end function
Rust
Its my first Rust code. Could be done better.
pub struct Point2D { xy : [f64;2] } pub struct Mesh { xy : Vec<Point2D>, INE : Vec<[usize;3]> } impl Mesh { pub fn element_center(&self, IE : usize) -> Point2D { return Point2D {xy:[ (self.xy[self.INE[IE][0]].xy[0] + self.xy[self.INE[IE][1]].xy[0] + self.xy[self.INE[IE][2]].xy[0]) / 3.0 , (self.xy[self.INE[IE][0]].xy[1] + self.xy[self.INE[IE][1]].xy[1] + self.xy[self.INE[IE][2]].xy[1]) / 3.0 ]}; } }
The assembly output show the differences. There is a lot more assembler instructions for FORTRAN since it don't use packed SSE instruction, and do some extra offset calculation I guess.
C++
Mesh::elementCenter(int) const: movslq %esi, %rsi leaq (%rsi,%rsi,2), %rdx movq 24(%rdi), %rax leaq (%rax,%rdx,4), %rax movq (%rdi), %rdx # calc INE offsets movslq (%rax), %rsi salq $4, %rsi movslq 4(%rax), %rcx salq $4, %rcx movapd (%rdx,%rsi), %xmm0 # xmm0 = p0 addpd (%rdx,%rcx), %xmm0 # xmm0 += p1 movslq 8(%rax), %rax salq $4, %rax addpd (%rdx,%rax), %xmm0 # xmm0 += p2 divpd .LC0(%rip), %xmm0 # xmm0 /= 3 ret # return xmm0 as a new Point2D .LC0: .long 0 .long 1074266112 .long 0 .long 1074266112 |
FORTRAN
_testmodule_MOD_elementcenter: pushq %rbx movq 24(%rdi), %rax testq %rax, %rax movl $1, %r8d cmovne %rax, %r8 movq (%rdi), %r9 movq (%rsi), %rcx movq 8(%rsi), %r11 movslq (%rdx), %rdx imulq 120(%rsi), %rdx addq 80(%rsi), %rdx movq 72(%rsi), %rbx leaq (%rbx,%rdx,4), %r10 # calc INE offsets. lot of stuff movq 24(%rsi), %rax movq 48(%rsi), %rdi movslq 4(%r10), %rbx imulq %rdi, %rbx addq %r11, %rbx movslq 8(%r10), %rdx imulq %rdi, %rdx addq %r11, %rdx movslq 12(%r10), %rsi imulq %rsi, %rdi leaq (%rdi,%r11), %r11 leaq (%rax,%rbx), %r10 leaq (%rax,%rdx), %rdi movsd (%rcx,%r10,8), %xmm0 # xmm0 = p0.x addsd (%rcx,%rdi,8), %xmm0 # xmm0 += p1.x leaq (%rax,%r11), %rsi addsd (%rcx,%rsi,8), %xmm0 # xmm0 += p2.x movsd .LC0(%rip), %xmm1 # xmm1 = 3 divsd %xmm1, %xmm0 # xmm0 /= xmm1 movsd %xmm0, (%r9) # write x result back to center array addq %rax, %rax leaq (%rbx,%rax), %rbx # calc offset addq %rax, %rdx movsd (%rcx,%rbx,8), %xmm2 # xmm2 = p0.y addsd (%rcx,%rdx,8), %xmm2 # xmm2 += p1.y addq %r11, %rax addsd (%rcx,%rax,8), %xmm2 # xmm2 += p2.y divsd %xmm1, %xmm2 # xmm2 /= xmm1 movsd %xmm2, (%r9,%r8,8) # write y result back to center array popq %rbx ret .LC0: .long 0 .long 1074266112 |
Rust
example::Mesh::element_center: pushq %rax movq %rsi, %rcx movq 40(%rdi), %rsi cmpq %rcx, %rsi jbe .LBB0_1 movq %rdi, %r8 movq 16(%rdi), %rsi movq 24(%rdi), %rax leaq (%rcx,%rcx,2), %rdx movq (%rax,%rdx,8), %rcx cmpq %rcx, %rsi jbe .LBB0_4 movq 8(%rax,%rdx,8), %rdi cmpq %rdi, %rsi jbe .LBB0_8 movq 16(%rax,%rdx,8), %rax cmpq %rax, %rsi jbe .LBB0_9 movq (%r8), %rdx shlq $4, %rdi shlq $4, %rcx movsd (%rdx,%rcx), %xmm0 movsd 8(%rdx,%rcx), %xmm1 addsd (%rdx,%rdi), %xmm0 shlq $4, %rax addsd (%rdx,%rax), %xmm0 movsd .LCPI0_0(%rip), %xmm2 addsd 8(%rdx,%rdi), %xmm1 addsd 8(%rdx,%rax), %xmm1 divsd %xmm2, %xmm0 divsd %xmm2, %xmm1 movq %xmm0, %rax movq %xmm1, %rdx popq %rcx retq .LBB0_1: leaq .L__unnamed_1(%rip), %rdx jmp .LBB0_2 .LBB0_4: leaq .L__unnamed_2(%rip), %rdx .LBB0_2: movq %rcx, %rdi callq *core::panicking::panic_bounds_check@GOTPCREL(%rip) ud2 .LBB0_8: leaq .L__unnamed_3(%rip), %rdx callq *core::panicking::panic_bounds_check@GOTPCREL(%rip) ud2 .LBB0_9: leaq .L__unnamed_4(%rip), %rdx movq %rax, %rdi callq *core::panicking::panic_bounds_check@GOTPCREL(%rip) ud2 |
Element Area
C++
as member function
"mesh" and "IE" are const and can't be modified.
The long version.
auto Mesh::elementArea(const int IE) const { return 0.5*((XY[INE[IE][0]].x()-XY[INE[IE][2]].x()) * (XY[INE[IE][1]].y()-XY[INE[IE][0]].y()) + (XY[INE[IE][1]].x()-XY[INE[IE][0]].x()) * (XY[INE[IE][2]].y()-XY[INE[IE][0]].y()) ); }
Shorter version with zero extra costs
auto Mesh::elementArea(const int IE) const { const auto& p0 = XY[INE[IE][0]]; const auto& p1 = XY[INE[IE][1]]; const auto& p2 = XY[INE[IE][2]]; return 0.5*((p0.x()-p2.x()) * (p1.y()-p0.y()) + (p1.x()-p0.x()) * (p2.y()-p0.y()) ); }
Julia
as free function
constness? performance?
function elementArea(mesh::Mesh, IE::Int32) Elem = mesh.INE[:,IE] return 0.5*( (mesh.XY[Elem[1]].x - mesh.XY[Elem[3]].x) * (mesh.XY[Elem[2]].y - mesh.XY[Elem[1]].y) + (mesh.XY[Elem[2]].x - mesh.XY[Elem[1]].x) * (mesh.XY[Elem[3]].y - mesh.XY[Elem[1]].y) ) end
FORTRAN
as member function (and with a few small changes as free function)
"this" the mesh and "IE" are const and can't be modified.
The first dimension of array XY and INE is not known at compile time
The long version
function elementArea(this, IE) result(area) implicit none type(Mesh_t), intent(in) :: this integer, intent(in) :: IE real(8) :: area area = 0.5*( (this%XY(1,this%INE(1,IE))-this%XY(1,this%INE(3,IE))) * (this%XY(2,this%INE(2,IE))-this%XY(2,this%INE(1,IE))) + (this%XY(1,this%INE(2,IE))-this%XY(1,this%INE(1,IE))) * (this%XY(2,this%INE(3,IE))- this%XY(2,this%INE(1,IE))) ) end function
Shorter version
One have to work with pointers to prevent copys. Now the bound checking don't work any longer
function elementArea(this, IE) result(area) implicit none type(Mesh_t), intent(in) :: this integer, intent(in) :: IE real(8) :: area real(8), pointer :: p1(:), p2(:), p3(:) p1 => this%XY(1:2,this%INE(1,IE)) p2 => this%XY(1:2,this%INE(2,IE)) p3 => this%XY(1:2,this%INE(3,IE)) area = 0.5*( (p1(1)-p3(1)) * (p2(2)-p1(2)) + (p2(1)-p1(1)) * (p3(2)- p1(2)) ) end function
The assembly output show the differences. First the two C++ versions. As you can see, the versions are identical except of the ordering of one instruction. The short version of FORTRAN has two more assembler instruction. So there are no zero costs for syntax sugar.
C++ Long version
Mesh::elementArea(int) const: movslq %esi, %rsi leaq (%rsi,%rsi,2), %rdx movq 24(%rdi), %rax leaq (%rax,%rdx,4), %rsi movq (%rdi), %rdx movslq (%rsi), %rcx salq $4, %rcx addq %rdx, %rcx movsd (%rcx), %xmm2 movslq 8(%rsi), %rax salq $4, %rax addq %rdx, %rax movslq 4(%rsi), %rsi salq $4, %rsi addq %rsi, %rdx movsd 8(%rcx), %xmm3 movapd %xmm2, %xmm0 subsd (%rax), %xmm0 movsd 8(%rdx), %xmm1 subsd %xmm3, %xmm1 mulsd %xmm1, %xmm0 movsd (%rdx), %xmm1 subsd %xmm2, %xmm1 movsd 8(%rax), %xmm2 subsd %xmm3, %xmm2 mulsd %xmm2, %xmm1 addsd %xmm1, %xmm0 mulsd .LC0(%rip), %xmm0 ret .LC0: .long 0 .long 1071644672 mov 12 |
C++ Short version
Mesh::elementArea(int) const: movslq %esi, %rsi leaq (%rsi,%rsi,2), %rdx movq 24(%rdi), %rax leaq (%rax,%rdx,4), %rsi movq (%rdi), %rax movslq (%rsi), %rcx salq $4, %rcx addq %rax, %rcx movslq 4(%rsi), %rdx salq $4, %rdx addq %rax, %rdx movslq 8(%rsi), %rsi salq $4, %rsi addq %rsi, %rax movsd (%rcx), %xmm2 movsd 8(%rcx), %xmm3 movapd %xmm2, %xmm0 subsd (%rax), %xmm0 movsd 8(%rdx), %xmm1 subsd %xmm3, %xmm1 mulsd %xmm1, %xmm0 movsd (%rdx), %xmm1 subsd %xmm2, %xmm1 movsd 8(%rax), %xmm2 subsd %xmm3, %xmm2 mulsd %xmm2, %xmm1 addsd %xmm1, %xmm0 mulsd .LC0(%rip), %xmm0 ret .LC0: .long 0 .long 1071644672 mov 12 |
FORTRAN long version
__testmodule_MOD_elementarea: movq (%rdi), %r8 movq 8(%rdi), %r9 movslq (%rsi), %rax imulq 120(%rdi), %rax addq 80(%rdi), %rax movq 72(%rdi), %rdx leaq (%rdx,%rax,4), %r10 movq 48(%rdi), %rsi movq 24(%rdi), %rcx movslq 4(%r10), %r11 imulq %rsi, %r11 addq %r9, %r11 addq %rcx, %r11 movsd (%r8,%r11,8), %xmm1 movslq 12(%r10), %rdx imulq %rsi, %rdx addq %r9, %rdx addq %rcx, %rdx movslq 8(%r10), %rdi imulq %rdi, %rsi addq %rsi, %r9 leaq (%r9,%rcx,2), %r9 addq %rcx, %r11 movsd (%r8,%r11,8), %xmm3 movq %r9, %rax subq %rcx, %rax movsd (%r8,%rax,8), %xmm0 subsd %xmm1, %xmm0 addq %rdx, %rcx movsd (%r8,%rcx,8), %xmm2 subsd %xmm3, %xmm2 mulsd %xmm2, %xmm0 movapd %xmm1, %xmm4 subsd (%r8,%rdx,8), %xmm4 movsd (%r8,%r9,8), %xmm5 subsd %xmm3, %xmm5 mulsd %xmm4, %xmm5 addsd %xmm5, %xmm0 mulsd .LC0(%rip), %xmm0 ret .LC0: .long 0 .long 1071644672 mov 16 |
FORTRAN short version
__testmodule_MOD_elementarea: movslq (%rsi), %rax imulq 120(%rdi), %rax addq 80(%rdi), %rax movq 72(%rdi), %rdx leaq (%rdx,%rax,4), %r10 movq 24(%rdi), %rsi movq (%rdi), %r8 movl $1, %eax subq 32(%rdi), %rax imulq %rsi, %rax movq 56(%rdi), %r9 movq 48(%rdi), %rcx movslq 4(%r10), %rdi subq %r9, %rdi imulq %rcx, %rdi addq %rax, %rdi leaq (%r8,%rdi,8), %rdi movslq 8(%r10), %r11 subq %r9, %r11 imulq %rcx, %r11 addq %rax, %r11 leaq (%r8,%r11,8), %r11 movslq 12(%r10), %rdx subq %r9, %rdx imulq %rdx, %rcx addq %rcx, %rax leaq (%r8,%rax,8), %r10 movsd (%rdi), %xmm1 movsd (%rdi,%rsi,8), %xmm3 movapd %xmm1, %xmm4 subsd (%r10), %xmm4 movsd (%r11,%rsi,8), %xmm2 subsd %xmm3, %xmm2 mulsd %xmm2, %xmm4 movsd (%r11), %xmm0 subsd %xmm1, %xmm0 movsd (%r10,%rsi,8), %xmm6 subsd %xmm3, %xmm6 mulsd %xmm6, %xmm0 addsd %xmm4, %xmm0 mulsd .LC0(%rip), %xmm0 ret .LC0: .long 0 .long 1071644672 mov 16 |