C++Guns – RoboBlog blogging the bot

29.06.2018

C++ Julia FORTRAN RUST

Filed under: — Thomas @ 13:06

Page under construction

General statements:
Compiler: GNU GCC 8.1
Flags: -O1 --save-temps and -funroll-loops for FORTRAN

The C++ Point2D Type is simply a struct inherit from a SIMDvalarray to use the benefits of SSE, AVX.

One can not use something like a Point2D type in FORTRAN. One can program it, but not use. Because it is not possible to inline small function from other modules in FORTRAN. To pay a function call every time is not an option. Hence I use a variable size array for the two dimensions of Point2D and give the dimension size explicit to the compiler everytime a point2D is used. With this it is possible to use loop unrolling, since the loop size is now know at compile time. This results in a fair comparision of the assembler code to the C++ version.

Yes, using optimization level 2 can make some code better. But not in this trivial examples. No, level 3 dosen't give any better results for this examples.

Element Center

C++
as member function
"mesh" and "IE" are const and can't be modified.

auto Mesh::elementCenter(const int IE) const {
  return (XY[INE[IE][0]] + XY[INE[IE][1]] + XY[INE[IE][2]]) / 3;
}

Julia
as free function
constness? performance?
Pay attention to the dot by the division.

function elementCenter(mesh::Mesh, IE::Int32)
  Elem = mesh.INE[:,IE]
  tmp =  (mesh.XY[Elem[1]] + mesh.XY[Elem[2]] + mesh.XY[Elem[3]]) ./ 3.0
  return tmp
end

FORTRAN
as member function (and with a few small changes as free function)
"this" the mesh and "IE" are const and can't be modified.
Don't use sum() for performance reasons.
The first dimension of array XY and INE is not known at compile time

function elementCenter(this, IE) result(center)
  implicit none
  class(Mesh_t), intent(in) :: this
  integer, intent(in) :: IE
  real(8) :: center(2)
  center = (this%XY(1:2,this%INE(1,IE)) + this%XY(1:2,this%INE(2,IE)) + this%XY(1:2,this%INE(3,IE))) / 3
end function

Rust
Its my first Rust code. Could be done better.

pub struct Point2D {
    xy : [f64;2]
}

pub struct Mesh {
    xy : Vec<Point2D>,
    INE : Vec<[usize;3]>
}

impl Mesh {
    pub fn element_center(&self, IE : usize) -> Point2D {
      return Point2D {xy:[ (self.xy[self.INE[IE][0]].xy[0] + self.xy[self.INE[IE][1]].xy[0] + self.xy[self.INE[IE][2]].xy[0]) / 3.0 , (self.xy[self.INE[IE][0]].xy[1] + self.xy[self.INE[IE][1]].xy[1] + self.xy[self.INE[IE][2]].xy[1]) / 3.0  ]};
    }
}

The assembly output show the differences. There is a lot more assembler instructions for FORTRAN since it don't use packed SSE instruction, and do some extra offset calculation I guess.

C++

Mesh::elementCenter(int) const:
  movslq %esi, %rsi
  leaq (%rsi,%rsi,2), %rdx
  movq 24(%rdi), %rax
  leaq (%rax,%rdx,4), %rax
  movq (%rdi), %rdx          # calc INE offsets
  movslq (%rax), %rsi
  salq $4, %rsi
  movslq 4(%rax), %rcx
  salq $4, %rcx
  movapd (%rdx,%rsi), %xmm0  # xmm0 = p0
  addpd (%rdx,%rcx), %xmm0   # xmm0 += p1
  movslq 8(%rax), %rax
  salq $4, %rax
  addpd (%rdx,%rax), %xmm0   # xmm0 += p2
  divpd .LC0(%rip), %xmm0    # xmm0 /= 3
  ret                        # return xmm0 as a new Point2D 
.LC0:
  .long 0
  .long 1074266112
  .long 0
  .long 1074266112
FORTRAN

_testmodule_MOD_elementcenter:
	pushq	%rbx
	movq	24(%rdi), %rax
	testq	%rax, %rax
	movl	$1, %r8d
	cmovne	%rax, %r8
	movq	(%rdi), %r9
	movq	(%rsi), %rcx
	movq	8(%rsi), %r11
	movslq	(%rdx), %rdx
	imulq	120(%rsi), %rdx
	addq	80(%rsi), %rdx
	movq	72(%rsi), %rbx
	leaq	(%rbx,%rdx,4), %r10   # calc INE offsets. lot of stuff
	movq	24(%rsi), %rax
	movq	48(%rsi), %rdi
	movslq	4(%r10), %rbx
	imulq	%rdi, %rbx
	addq	%r11, %rbx
	movslq	8(%r10), %rdx
	imulq	%rdi, %rdx
	addq	%r11, %rdx
	movslq	12(%r10), %rsi
	imulq	%rsi, %rdi
	leaq	(%rdi,%r11), %r11
	leaq	(%rax,%rbx), %r10
	leaq	(%rax,%rdx), %rdi
	movsd	(%rcx,%r10,8), %xmm0   # xmm0 = p0.x
	addsd	(%rcx,%rdi,8), %xmm0   # xmm0 += p1.x
	leaq	(%rax,%r11), %rsi
	addsd	(%rcx,%rsi,8), %xmm0   # xmm0 += p2.x
	movsd	.LC0(%rip), %xmm1      # xmm1 = 3
	divsd	%xmm1, %xmm0           # xmm0 /= xmm1
	movsd	%xmm0, (%r9)           # write x result back to center array
	addq	%rax, %rax
	leaq	(%rbx,%rax), %rbx      # calc offset
	addq	%rax, %rdx
	movsd	(%rcx,%rbx,8), %xmm2   # xmm2 = p0.y
	addsd	(%rcx,%rdx,8), %xmm2   # xmm2 += p1.y
	addq	%r11, %rax
	addsd	(%rcx,%rax,8), %xmm2   # xmm2 += p2.y
	divsd	%xmm1, %xmm2           # xmm2 /= xmm1
	movsd	%xmm2, (%r9,%r8,8)     # write y result back to center array
	popq	%rbx
	ret
.LC0:
	.long	0
	.long	1074266112
Rust

example::Mesh::element_center:
        pushq   %rax
        movq    %rsi, %rcx
        movq    40(%rdi), %rsi
        cmpq    %rcx, %rsi
        jbe     .LBB0_1
        movq    %rdi, %r8
        movq    16(%rdi), %rsi
        movq    24(%rdi), %rax
        leaq    (%rcx,%rcx,2), %rdx
        movq    (%rax,%rdx,8), %rcx
        cmpq    %rcx, %rsi
        jbe     .LBB0_4
        movq    8(%rax,%rdx,8), %rdi
        cmpq    %rdi, %rsi
        jbe     .LBB0_8
        movq    16(%rax,%rdx,8), %rax
        cmpq    %rax, %rsi
        jbe     .LBB0_9
        movq    (%r8), %rdx
        shlq    $4, %rdi
        shlq    $4, %rcx
        movsd   (%rdx,%rcx), %xmm0
        movsd   8(%rdx,%rcx), %xmm1
        addsd   (%rdx,%rdi), %xmm0
        shlq    $4, %rax
        addsd   (%rdx,%rax), %xmm0
        movsd   .LCPI0_0(%rip), %xmm2
        addsd   8(%rdx,%rdi), %xmm1
        addsd   8(%rdx,%rax), %xmm1
        divsd   %xmm2, %xmm0
        divsd   %xmm2, %xmm1
        movq    %xmm0, %rax
        movq    %xmm1, %rdx
        popq    %rcx
        retq
.LBB0_1:
        leaq    .L__unnamed_1(%rip), %rdx
        jmp     .LBB0_2
.LBB0_4:
        leaq    .L__unnamed_2(%rip), %rdx
.LBB0_2:
        movq    %rcx, %rdi
        callq   *core::panicking::panic_bounds_check@GOTPCREL(%rip)
        ud2
.LBB0_8:
        leaq    .L__unnamed_3(%rip), %rdx
        callq   *core::panicking::panic_bounds_check@GOTPCREL(%rip)
        ud2
.LBB0_9:
        leaq    .L__unnamed_4(%rip), %rdx
        movq    %rax, %rdi
        callq   *core::panicking::panic_bounds_check@GOTPCREL(%rip)
        ud2

Element Area

C++
as member function
"mesh" and "IE" are const and can't be modified.
The long version.

auto Mesh::elementArea(const int IE) const {
  return 0.5*((XY[INE[IE][0]].x()-XY[INE[IE][2]].x()) * (XY[INE[IE][1]].y()-XY[INE[IE][0]].y()) + (XY[INE[IE][1]].x()-XY[INE[IE][0]].x()) * (XY[INE[IE][2]].y()-XY[INE[IE][0]].y()) );
}

Shorter version with zero extra costs

auto Mesh::elementArea(const int IE) const {
  const auto& p0 = XY[INE[IE][0]];
  const auto& p1 = XY[INE[IE][1]];
  const auto& p2 = XY[INE[IE][2]];
  return 0.5*((p0.x()-p2.x()) * (p1.y()-p0.y()) + (p1.x()-p0.x()) * (p2.y()-p0.y()) );
}

Julia
as free function
constness? performance?

function elementArea(mesh::Mesh, IE::Int32)
  Elem = mesh.INE[:,IE]
  return  0.5*( (mesh.XY[Elem[1]].x - mesh.XY[Elem[3]].x) * (mesh.XY[Elem[2]].y - mesh.XY[Elem[1]].y) + (mesh.XY[Elem[2]].x - mesh.XY[Elem[1]].x) * (mesh.XY[Elem[3]].y - mesh.XY[Elem[1]].y) )
end

FORTRAN
as member function (and with a few small changes as free function)
"this" the mesh and "IE" are const and can't be modified.
The first dimension of array XY and INE is not known at compile time
The long version

function elementArea(this, IE) result(area)
  implicit none
  type(Mesh_t), intent(in) :: this
  integer, intent(in) :: IE
  real(8) :: area
  area = 0.5*( (this%XY(1,this%INE(1,IE))-this%XY(1,this%INE(3,IE))) * (this%XY(2,this%INE(2,IE))-this%XY(2,this%INE(1,IE))) + (this%XY(1,this%INE(2,IE))-this%XY(1,this%INE(1,IE))) * (this%XY(2,this%INE(3,IE))- this%XY(2,this%INE(1,IE))) )
end function

Shorter version
One have to work with pointers to prevent copys. Now the bound checking don't work any longer

function elementArea(this, IE) result(area)
  implicit none
  type(Mesh_t), intent(in) :: this
  integer, intent(in) :: IE
  real(8) :: area
  real(8), pointer :: p1(:), p2(:), p3(:)
  p1 => this%XY(1:2,this%INE(1,IE))
  p2 => this%XY(1:2,this%INE(2,IE))
  p3 => this%XY(1:2,this%INE(3,IE))
  area = 0.5*( (p1(1)-p3(1)) * (p2(2)-p1(2)) + (p2(1)-p1(1)) * (p3(2)- p1(2)) )
end function

The assembly output show the differences. First the two C++ versions. As you can see, the versions are identical except of the ordering of one instruction. The short version of FORTRAN has two more assembler instruction. So there are no zero costs for syntax sugar.

C++ Long version

Mesh::elementArea(int) const:
  movslq %esi, %rsi
  leaq (%rsi,%rsi,2), %rdx
  movq 24(%rdi), %rax
  leaq (%rax,%rdx,4), %rsi
  movq (%rdi), %rdx
  movslq (%rsi), %rcx
  salq $4, %rcx
  addq %rdx, %rcx
  movsd (%rcx), %xmm2
  movslq 8(%rsi), %rax
  salq $4, %rax
  addq %rdx, %rax
  movslq 4(%rsi), %rsi
  salq $4, %rsi
  addq %rsi, %rdx
  movsd 8(%rcx), %xmm3
  movapd %xmm2, %xmm0
  subsd (%rax), %xmm0
  movsd 8(%rdx), %xmm1
  subsd %xmm3, %xmm1
  mulsd %xmm1, %xmm0
  movsd (%rdx), %xmm1
  subsd %xmm2, %xmm1
  movsd 8(%rax), %xmm2
  subsd %xmm3, %xmm2
  mulsd %xmm2, %xmm1
  addsd %xmm1, %xmm0
  mulsd .LC0(%rip), %xmm0
  ret
.LC0:
  .long 0
  .long 1071644672

mov 12
lea 2
sal 3
add 4
sub 4
mul 3

C++ Short version

Mesh::elementArea(int) const:
  movslq %esi, %rsi
  leaq (%rsi,%rsi,2), %rdx
  movq 24(%rdi), %rax
  leaq (%rax,%rdx,4), %rsi
  movq (%rdi), %rax
  movslq (%rsi), %rcx
  salq $4, %rcx
  addq %rax, %rcx
  movslq 4(%rsi), %rdx
  salq $4, %rdx
  addq %rax, %rdx
  movslq 8(%rsi), %rsi
  salq $4, %rsi
  addq %rsi, %rax
  movsd (%rcx), %xmm2     
  movsd 8(%rcx), %xmm3
  movapd %xmm2, %xmm0
  subsd (%rax), %xmm0
  movsd 8(%rdx), %xmm1
  subsd %xmm3, %xmm1
  mulsd %xmm1, %xmm0
  movsd (%rdx), %xmm1
  subsd %xmm2, %xmm1
  movsd 8(%rax), %xmm2
  subsd %xmm3, %xmm2
  mulsd %xmm2, %xmm1
  addsd %xmm1, %xmm0
  mulsd .LC0(%rip), %xmm0
  ret
.LC0:
  .long 0
  .long 1071644672

mov 12
lea 2
sal 3
add 4
sub 4
mul 3

FORTRAN long version

__testmodule_MOD_elementarea:
	movq	(%rdi), %r8
	movq	8(%rdi), %r9
	movslq	(%rsi), %rax
	imulq	120(%rdi), %rax
	addq	80(%rdi), %rax
	movq	72(%rdi), %rdx
	leaq	(%rdx,%rax,4), %r10
	movq	48(%rdi), %rsi
	movq	24(%rdi), %rcx
	movslq	4(%r10), %r11
	imulq	%rsi, %r11
	addq	%r9, %r11
	addq	%rcx, %r11
	movsd	(%r8,%r11,8), %xmm1
	movslq	12(%r10), %rdx
	imulq	%rsi, %rdx
	addq	%r9, %rdx
	addq	%rcx, %rdx
	movslq	8(%r10), %rdi
	imulq	%rdi, %rsi
	addq	%rsi, %r9
	leaq	(%r9,%rcx,2), %r9
	addq	%rcx, %r11
	movsd	(%r8,%r11,8), %xmm3
	movq	%r9, %rax
	subq	%rcx, %rax
	movsd	(%r8,%rax,8), %xmm0
	subsd	%xmm1, %xmm0
	addq	%rdx, %rcx
	movsd	(%r8,%rcx,8), %xmm2
	subsd	%xmm3, %xmm2
	mulsd	%xmm2, %xmm0
	movapd	%xmm1, %xmm4
	subsd	(%r8,%rdx,8), %xmm4
	movsd	(%r8,%r9,8), %xmm5
	subsd	%xmm3, %xmm5
	mulsd	%xmm4, %xmm5
	addsd	%xmm5, %xmm0
	mulsd	.LC0(%rip), %xmm0
	ret
.LC0:
	.long	0
	.long	1071644672

mov 16
lea 2
add 9
sub 5
mul 7

FORTRAN short version

__testmodule_MOD_elementarea:
	movslq	(%rsi), %rax
	imulq	120(%rdi), %rax
	addq	80(%rdi), %rax
	movq	72(%rdi), %rdx
	leaq	(%rdx,%rax,4), %r10
	movq	24(%rdi), %rsi
	movq	(%rdi), %r8
	movl	$1, %eax
	subq	32(%rdi), %rax
	imulq	%rsi, %rax
	movq	56(%rdi), %r9
	movq	48(%rdi), %rcx
	movslq	4(%r10), %rdi
	subq	%r9, %rdi
	imulq	%rcx, %rdi
	addq	%rax, %rdi
	leaq	(%r8,%rdi,8), %rdi
	movslq	8(%r10), %r11
	subq	%r9, %r11
	imulq	%rcx, %r11
	addq	%rax, %r11
	leaq	(%r8,%r11,8), %r11
	movslq	12(%r10), %rdx
	subq	%r9, %rdx
	imulq	%rdx, %rcx
	addq	%rcx, %rax
	leaq	(%r8,%rax,8), %r10
	movsd	(%rdi), %xmm1
	movsd	(%rdi,%rsi,8), %xmm3
	movapd	%xmm1, %xmm4
	subsd	(%r10), %xmm4
	movsd	(%r11,%rsi,8), %xmm2
	subsd	%xmm3, %xmm2
	mulsd	%xmm2, %xmm4
	movsd	(%r11), %xmm0
	subsd	%xmm1, %xmm0
	movsd	(%r10,%rsi,8), %xmm6
	subsd	%xmm3, %xmm6
	mulsd	%xmm6, %xmm0
	addsd	%xmm4, %xmm0
	mulsd	.LC0(%rip), %xmm0
	ret
.LC0:
	.long	0
	.long	1071644672

mov 16
lea 4
add 5
sub 8
mul 8

No Comments

No comments yet.

RSS feed for comments on this post.

Sorry, the comment form is closed at this time.

Powered by WordPress