Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91790467
gradient_avx512f.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Nov 14, 12:18
Size
12 KB
Mime Type
text/x-c
Expires
Sat, Nov 16, 12:18 (1 d, 21 h)
Engine
blob
Format
Raw Data
Handle
22298464
Attached To
R1448 Lenstool-HPC
gradient_avx512f.cpp
View Options
/**
* @Author Christoph Schaefer, EPFL (christophernstrerne.schaefer@epfl.ch), Gilles Fourestey (gilles.fourestey@epfl.ch)
* @date July 2017
* @version 0,1
*
*/
#include <iostream>
#include <string.h>
//#include <cuda_runtime.h>
#include <math.h>
#include <sys/time.h>
#include <fstream>
#include <immintrin.h>
//
#include "structure_hpc.hpp"
#include "simd_math_avx512f.h"
#include "gradient.hpp"
//#include "iacaMarks.h"
//
//
//
#ifdef __AVX_512F__
struct
point
module_potentialDerivatives_totalGradient_8_SOA_AVX512
(
const
struct
point
*
pImage
,
const
struct
Potential_SOA
*
lens
,
const
int
nhalos
)
{
struct
point
grad
,
clumpgrad
;
grad
.
x
=
0
;
grad
.
y
=
0
;
//
// smearing the image coordinates on registers
__m512d
image_x
=
_mm512_set1_pd
(
pImage
->
x
);
__m512d
image_y
=
_mm512_set1_pd
(
pImage
->
y
);
//
__m512d
__grad_x
=
_mm512_set1_pd
(
0.
);
__m512d
__grad_y
=
_mm512_set1_pd
(
0.
);
//
int
i
;
#pragma unroll
for
(
i
=
0
;
i
<
nhalos
-
nhalos
%
8
;
i
=
i
+
8
)
{
//IACA_START;
//
__m512d
two
=
_mm512_set1_pd
(
2.
);
__m512d
one
=
_mm512_set1_pd
(
1.
);
__m512d
zero
=
_mm512_set1_pd
(
0.
);
__m512d
half
=
_mm512_set1_pd
(
0.5
);
__m512d
mhalf
=
_mm512_set1_pd
(
-
0.5
);
//
// 2 loads
__m512d
true_coord_x
=
SUB
(
image_x
,
_mm512_loadu_pd
(
&
lens
->
position_x
[
i
]));
__m512d
true_coord_y
=
SUB
(
image_y
,
_mm512_loadu_pd
(
&
lens
->
position_y
[
i
]));
// 2 loafs
__m512d
rc
=
_mm512_loadu_pd
(
&
lens
->
rcore
[
i
]);
__m512d
b0
=
_mm512_loadu_pd
(
&
lens
->
b0
[
i
]);
// 2 loads
__m512d
eps
=
_mm512_loadu_pd
(
&
lens
->
ellipticity_potential
[
i
]);
//
__m512d
one_minus_eps
=
SUB
(
one
,
eps
);
__m512d
one_plus_eps
=
ADD
(
one
,
eps
);
__m512d
one_plus_eps_rcp
=
__INV
(
one_plus_eps
);
// 1 load
__m512d
theta
=
_mm512_loadu_pd
(
&
lens
->
ellipticity_angle
[
i
]);
/*positionning at the potential center*/
__m512d
cos_theta
=
_mm512_cos_pd
(
theta
);
__m512d
sin_theta
=
_mm512_sin_pd
(
theta
);
// rotation: 6 ops
__m512d
x
=
ADD
(
_mm512_mul_pd
(
true_coord_x
,
cos_theta
),
_mm512_mul_pd
(
true_coord_y
,
sin_theta
));
__m512d
y
=
SUB
(
_mm512_mul_pd
(
true_coord_y
,
cos_theta
),
_mm512_mul_pd
(
true_coord_x
,
sin_theta
));
//
__m512d
sqe
=
__SQRT
(
eps
);
//
__m512d
cx1
=
_mm512_mul_pd
(
one_minus_eps
,
one_plus_eps_rcp
);
// (1. - eps)/(1. + eps); 3 ops
//__m512d cx1 = one_minus_eps*one_plus_eps_rcp; // (1. - eps)/(1. + eps); 3 ops
__m512d
cxro
=
MUL
(
one_plus_eps
,
one_plus_eps
);
// (1. + eps)*(1. + eps); 3 ops
__m512d
cyro
=
MUL
(
one_minus_eps
,
one_minus_eps
);
// (1. - eps)*(1. - eps); 3 ops
__m512d
rem2
=
ADD
(
MUL
(
MUL
(
x
,
x
),
__INV
(
cxro
)),
MUL
(
MUL
(
y
,
y
),
__INV
(
cyro
)));
// x*x/(cxro) + y*y/(cyro); ~5 ops
//
__m512d
zci_re
=
zero
;
__m512d
zci_im
=
MUL
(
mhalf
,
MUL
(
SUB
(
one
,
MUL
(
eps
,
eps
)),
__INV
(
sqe
)));
// ~4 ops
// 2.*sqe*sqrt(rc*rc + rem2) - y/cx1, 7 ops
__m512d
znum_re
=
MUL
(
cx1
,
x
);
//__m512d znum_im = two*sqe*__SQRT(rc*rc + rem2) - y*__INV(cx1); // ~4 ops
__m512d
znum_im
=
SUB
(
MUL
(
two
,
MUL
(
sqe
,
__SQRT
(
ADD
(
MUL
(
rc
,
rc
),
rem2
)))),
MUL
(
y
,
__INV
(
cx1
)));
// ~4 ops
//
__m512d
zden_re
=
x
;
__m512d
zden_im
=
_mm512_mul_pd
(
_mm512_set1_pd
(
2.
),
_mm512_mul_pd
(
rc
,
sqe
));
//
//
zden_im
=
_mm512_sub_pd
(
zden_im
,
y
);
//norm = (zden.re*zden.re + zden.im*zden.im); 3 ops
__m512d
norm
=
ADD
(
MUL
(
zden_re
,
zden_re
),
MUL
(
zden_im
,
zden_im
));
__m512d
zis_re
=
MUL
(
ADD
(
MUL
(
znum_re
,
zden_re
),
MUL
(
znum_im
,
zden_im
)),
__INV
(
norm
));
// 3 ops
__m512d
zis_im
=
MUL
(
SUB
(
MUL
(
znum_im
,
zden_re
),
MUL
(
znum_re
,
zden_im
)),
__INV
(
norm
));
// 3 ops
//
norm
=
zis_re
;
//
zis_re
=
_mm512_log_pd
(
__SQRT
(
ADD
(
MUL
(
norm
,
norm
),
MUL
(
zis_im
,
zis_im
))));
// 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis)
zis_im
=
_mm512_atan2_pd
(
zis_im
,
norm
);
//
//
__m512d
zres_re
=
SUB
(
MUL
(
zci_re
,
zis_re
),
MUL
(
zci_im
,
zis_im
));
// Re( zci*ln(zis) ) 3 ops
__m512d
zres_im
=
ADD
(
MUL
(
zci_im
,
zis_re
),
MUL
(
zis_im
,
zci_re
));
// Im( zci*ln(zis) ) 3 ops
//
zis_re
=
MUL
(
b0
,
zres_re
);
zis_im
=
MUL
(
b0
,
zres_im
);
//
cos_theta
=
_mm512_cos_pd
(
SUB
(
zero
,
theta
));
sin_theta
=
_mm512_sin_pd
(
SUB
(
zero
,
theta
));
// rotation: 6 ops
__grad_x
=
ADD
(
__grad_x
,
_mm512_add_pd
(
_mm512_mul_pd
(
zis_re
,
cos_theta
),
_mm512_mul_pd
(
zis_im
,
sin_theta
)));
__grad_y
=
ADD
(
__grad_y
,
_mm512_sub_pd
(
_mm512_mul_pd
(
zis_im
,
cos_theta
),
_mm512_mul_pd
(
zis_re
,
sin_theta
)));
//
}
//
grad
.
x
=
((
double
*
)
&
__grad_x
)[
0
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
1
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
2
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
3
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
4
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
5
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
6
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
7
];
//
grad
.
y
=
((
double
*
)
&
__grad_y
)[
0
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
1
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
2
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
3
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
4
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
5
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
6
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
7
];
//
// end of peeling
//
if
(
nhalos
%
8
>
0
)
{
struct
point
grad_peel
;
grad_peel
=
module_potentialDerivatives_totalGradient_8_SOA
(
pImage
,
lens
,
i
,
nhalos
%
8
);
//
grad
.
x
+=
grad_peel
.
x
;
grad
.
y
+=
grad_peel
.
y
;
}
//
//IACA_END;
//
return
(
grad
);
}
struct
point
module_potentialDerivatives_totalGradient_81_SOA_AVX512
(
const
struct
point
*
pImage
,
const
struct
Potential_SOA
*
lens
,
const
int
nhalos
)
{
//_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
struct
point
grad
,
clumpgrad
;
grad
.
x
=
0
;
grad
.
y
=
0
;
//
// smearing the image coordinates on registers
__m512d
image_x
=
_mm512_set1_pd
(
pImage
->
x
);
__m512d
image_y
=
_mm512_set1_pd
(
pImage
->
y
);
//
__m512d
__grad_x
=
_mm512_set1_pd
(
0.
);
__m512d
__grad_y
=
_mm512_set1_pd
(
0.
);
//
int
i
;
#pragma unroll
for
(
i
=
0
;
i
<
nhalos
-
nhalos
%
8
;
i
=
i
+
8
)
{
//IACA_START;
//
__m512d
two
=
_mm512_set1_pd
(
2.
);
__m512d
one
=
_mm512_set1_pd
(
1.
);
__m512d
zero
=
_mm512_set1_pd
(
0.
);
__m512d
half
=
_mm512_set1_pd
(
0.5
);
__m512d
mhalf
=
_mm512_set1_pd
(
-
0.5
);
//
// 2 loads
__m512d
true_coord_x
=
_mm512_sub_pd
(
image_x
,
_mm512_loadu_pd
(
&
lens
->
position_x
[
i
]));
__m512d
true_coord_y
=
_mm512_sub_pd
(
image_y
,
_mm512_loadu_pd
(
&
lens
->
position_y
[
i
]));
// 3 loads
__m512d
rc
=
_mm512_loadu_pd
(
&
lens
->
rcore
[
i
]);
__m512d
rcut
=
_mm512_loadu_pd
(
&
lens
->
rcut
[
i
]);
__m512d
b0
=
_mm512_loadu_pd
(
&
lens
->
b0
[
i
]);
__m512d
t05
=
MUL
(
MUL
(
b0
,
rcut
),
__INV
(
SUB
(
rcut
,
rc
)));
// 1 loads
__m512d
eps
=
_mm512_loadu_pd
(
&
lens
->
ellipticity_potential
[
i
]);
//
__m512d
one_minus_eps
=
_mm512_sub_pd
(
one
,
eps
);
__m512d
one_plus_eps
=
_mm512_add_pd
(
one
,
eps
);
__m512d
one_plus_eps_rcp
=
__INV
(
one_plus_eps
);
// 1 load
__m512d
theta
=
_mm512_loadu_pd
(
&
lens
->
ellipticity_angle
[
i
]);
/*positionning at the potential center*/
//__m512d cos_theta = _mm512_cos_pd(theta);
//__m512d sin_theta = _mm512_sin_pd(theta);
__m512d
cos_theta
;
__m512d
sin_theta
=
_mm512_sincos_pd
(
&
cos_theta
,
theta
);
// rotation: 6 ops
__m512d
x
=
_mm512_add_pd
(
_mm512_mul_pd
(
true_coord_x
,
cos_theta
),
_mm512_mul_pd
(
true_coord_y
,
sin_theta
));
__m512d
y
=
_mm512_sub_pd
(
_mm512_mul_pd
(
true_coord_y
,
cos_theta
),
_mm512_mul_pd
(
true_coord_x
,
sin_theta
));
//
__m512d
sqe
=
__SQRT
(
eps
);
//
__m512d
cx1
=
_mm512_mul_pd
(
one_minus_eps
,
one_plus_eps_rcp
);
// (1. - eps)/(1. + eps); 3 ops
//__m512d cx1 = one_minus_eps*one_plus_eps_rcp; // (1. - eps)/(1. + eps); 3 ops
__m512d
cxro
=
MUL
(
one_plus_eps
,
one_plus_eps
);
// (1. + eps)*(1. + eps); 3 ops
__m512d
cyro
=
MUL
(
one_minus_eps
,
one_minus_eps
);
// (1. - eps)*(1. - eps); 3 ops
__m512d
rem2
=
ADD
(
MUL
(
x
,
MUL
(
x
,
__INV
(
cxro
))),
MUL
(
y
,
MUL
(
y
,
__INV
(
cyro
))));
// x*x/(cxro) + y*y/(cyro); ~5 ops
//
__m512d
zci_re
=
zero
;
__m512d
zci_im
=
MUL
(
mhalf
,
MUL
(
SUB
(
one
,
MUL
(
eps
,
eps
)),
__INV
(
sqe
)));
// ~4 ops
//
// 2.*sqe*sqrt(rc*rc + rem2) - y/cx1, 7 ops
//
__m512d
znum_re
=
zero
,
znum_im
=
zero
;
__m512d
zden_re
=
zero
,
zden_im
=
zero
;
__m512d
norm
;
//
__m512d
zis_re
=
zero
,
zis_im
=
zero
;
__m512d
zres_rc_re
=
zero
,
zres_rc_im
=
zero
;
__m512d
zres_rcut_re
=
zero
,
zres_rcut_im
=
zero
;
//
// part 1
//
{
znum_re
=
MUL
(
cx1
,
x
);
znum_im
=
SUB
(
MUL
(
two
,
MUL
(
sqe
,
__SQRT
(
ADD
(
MUL
(
rc
,
rc
),
rem2
)))),
MUL
(
y
,
__INV
(
cx1
)));
// ~4 ops
//
zden_re
=
x
;
zden_im
=
_mm512_mul_pd
(
_mm512_set1_pd
(
2.
),
_mm512_mul_pd
(
rc
,
sqe
));
zden_im
=
_mm512_sub_pd
(
zden_im
,
y
);
//norm = (zden.re*zden.re + zden.im*zden.im); 3 ops
norm
=
ADD
(
MUL
(
zden_re
,
zden_re
),
MUL
(
zden_im
,
zden_im
));
zis_re
=
MUL
(
ADD
(
MUL
(
znum_re
,
zden_re
),
MUL
(
znum_im
,
zden_im
)),
__INV
(
norm
));
// 3 ops
zis_im
=
MUL
(
SUB
(
MUL
(
znum_im
,
zden_re
),
MUL
(
znum_re
,
zden_im
)),
__INV
(
norm
));
// 3 ops
//
norm
=
zis_re
;
//
zis_re
=
_mm512_log_pd
(
__SQRT
(
ADD
(
MUL
(
norm
,
norm
),
MUL
(
zis_im
,
zis_im
))));
// 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis)
zis_im
=
_mm512_atan2_pd
(
zis_im
,
norm
);
//
//
zres_rc_re
=
SUB
(
MUL
(
zci_re
,
zis_re
),
MUL
(
zci_im
,
zis_im
));
// Re( zci*ln(zis) ) 3 ops
zres_rc_im
=
ADD
(
MUL
(
zci_im
,
zis_re
),
MUL
(
zis_im
,
zci_re
));
// Im( zci*ln(zis) ) 3 ops
}
//
// part 2
//
{
znum_re
=
MUL
(
cx1
,
x
);
znum_im
=
SUB
(
MUL
(
two
,
MUL
(
sqe
,
__SQRT
(
ADD
(
MUL
(
rcut
,
rcut
),
rem2
)))),
MUL
(
y
,
__INV
(
cx1
)));
// ~4 ops
//
zden_re
=
x
;
zden_im
=
_mm512_mul_pd
(
_mm512_set1_pd
(
2.
),
_mm512_mul_pd
(
rcut
,
sqe
));
zden_im
=
_mm512_sub_pd
(
zden_im
,
y
);
//norm = (zden.re*zden.re + zden.im*zden.im); 3 ops
//
//norm = (zden_re*zden_re + zden_im*zden_im);
norm
=
ADD
(
MUL
(
zden_re
,
zden_re
),
MUL
(
zden_im
,
zden_im
));
//
zis_re
=
MUL
(
ADD
(
MUL
(
znum_re
,
zden_re
),
MUL
(
znum_im
,
zden_im
)),
__INV
(
norm
));
// 3 ops
zis_im
=
MUL
(
SUB
(
MUL
(
znum_im
,
zden_re
),
MUL
(
znum_re
,
zden_im
)),
__INV
(
norm
));
// 3 ops
//zis_re = (znum_re*zden_re + znum_im*zden_im)*__INV(norm); // 3 ops
//zis_im = (znum_im*zden_re - znum_re*zden_im)*__INV(norm); // 3 ops
//
norm
=
zis_re
;
//
//zis_re = _mm512_log_pd(__SQRT(norm*norm + zis_im*zis_im)); // 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis)
zis_re
=
_mm512_log_pd
(
__SQRT
(
ADD
(
MUL
(
norm
,
norm
),
MUL
(
zis_im
,
zis_im
))));
// 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis)
zis_im
=
_mm512_atan2_pd
(
zis_im
,
norm
);
//
//
//zres_rcut_re = (zci_re*zis_re - zci_im*zis_im); // Re( zci*ln(zis) ) 3 ops
//zres_rcut_im = (zci_im*zis_re + zis_im*zci_re); // Im( zci*ln(zis) ) 3 ops
zres_rcut_re
=
SUB
(
MUL
(
zci_re
,
zis_re
),
MUL
(
zci_im
,
zis_im
));
// Re( zci*ln(zis) ) 3 ops
zres_rcut_im
=
ADD
(
MUL
(
zci_im
,
zis_re
),
MUL
(
zis_im
,
zci_re
));
// Im( zci*ln(zis) ) 3 ops
}
//
//
//
zis_re
=
MUL
(
t05
,
SUB
(
zres_rc_re
,
zres_rcut_re
));
zis_im
=
MUL
(
t05
,
SUB
(
zres_rc_im
,
zres_rcut_im
));
//
//cos_theta = _mm512_cos_pd(zero - theta);
//sin_theta = _mm512_sin_pd(zero - theta);
cos_theta
;
sin_theta
=
_mm512_sincos_pd
(
&
cos_theta
,
SUB
(
zero
,
theta
));
// rotation: 6 ops
__grad_x
=
ADD
(
__grad_x
,
_mm512_add_pd
(
_mm512_mul_pd
(
zis_re
,
cos_theta
),
_mm512_mul_pd
(
zis_im
,
sin_theta
)));
__grad_y
=
ADD
(
__grad_y
,
_mm512_sub_pd
(
_mm512_mul_pd
(
zis_im
,
cos_theta
),
_mm512_mul_pd
(
zis_re
,
sin_theta
)));
//
}
//
grad
.
x
=
((
double
*
)
&
__grad_x
)[
0
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
1
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
2
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
3
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
4
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
5
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
6
];
grad
.
x
+=
((
double
*
)
&
__grad_x
)[
7
];
//
grad
.
y
=
((
double
*
)
&
__grad_y
)[
0
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
1
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
2
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
3
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
4
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
5
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
6
];
grad
.
y
+=
((
double
*
)
&
__grad_y
)[
7
];
//
// end of peeling
//
for
(;
i
<
nhalos
;
++
i
)
{
struct
point
true_coord
;
true_coord
.
x
=
pImage
->
x
-
lens
->
position_x
[
i
];
true_coord
.
y
=
pImage
->
y
-
lens
->
position_y
[
i
];
//printf("x, y = %f, %f\n", lens->position.x, lens->position.y);
struct
point
true_coord_rot
=
rotateCoordinateSystem
(
true_coord
,
lens
->
ellipticity_angle
[
i
]);
//
complex
zis
,
zis_cut
;
double
t05
=
lens
->
b0
[
i
]
*
lens
->
rcut
[
i
]
/
(
lens
->
rcut
[
i
]
-
lens
->
rcore
[
i
]);
zis
=
piemd_1derivatives_ci05
(
true_coord_rot
.
x
,
true_coord_rot
.
y
,
lens
->
ellipticity_potential
[
i
],
lens
->
rcore
[
i
]);
//
zis_cut
=
piemd_1derivatives_ci05
(
true_coord_rot
.
x
,
true_coord_rot
.
y
,
lens
->
ellipticity_potential
[
i
],
lens
->
rcut
[
i
]);
struct
point
clumpgrad
;
clumpgrad
.
x
=
t05
*
(
zis
.
re
-
zis_cut
.
re
);
clumpgrad
.
y
=
t05
*
(
zis
.
im
-
zis_cut
.
im
);
clumpgrad
=
rotateCoordinateSystem
(
clumpgrad
,
-
lens
->
ellipticity_angle
[
i
]);
//
grad
.
x
+=
clumpgrad
.
x
;
grad
.
y
+=
clumpgrad
.
y
;
}
//
//IACA_END;
//
return
(
grad
);
}
#endif
Event Timeline
Log In to Comment