conv.cpp
/*
Copyright (C) 2014 Jerome Revaud
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
*/
#include "conv.h"
#include "std.h"
#include "omp.h"
#include "maxfilter.h"
extern "C" {
#include <immintrin.h>
#define integer int
#define real float
extern int saxpy_(integer* n, real* sa, real* sx, integer* incx, real* sy, integer* incy);
extern int sscal_(integer* n, real* sa, real* sx, integer* incx);
}
static inline void fast_set_val(float* __restrict__ a, long d, const float val) {
  if (val) {
    int j;
    for (j = 0; j < d; j++)
      a[j] = val;
  } else
    memset(a, 0, d * sizeof(float));
}
static inline void fast_add_val(float* __restrict__ a, long d, const float val) {
  int j;
  for (j = 0; j < d; j++)
    a[j] += val;
}
static inline void fast_set_vec(float* __restrict__ dest, const float* __restrict__ src,
                                int d, const float mul) {
  if (mul == 1)
    memcpy(dest, src, d * sizeof(float));
  else {
    int j;
    for (j = 0; j < d; j++)
      dest[j] = mul * src[j];
  }
}
static inline void fast_add_vec(float* __restrict__ dest, const float* __restrict__ add,
                                int d, float mul) {
  if (d <= 4) {
    int j;
    for (j = 0; j < d; j++)
      dest[j] += mul * add[j];
  } else {
    int inc = 1;
    saxpy_(&d, &mul, (float*)add, &inc, (float*)dest, &inc);
  }
}
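/* Minimal usage sketch for the helpers above (editor's illustration; the
   CONV_EXAMPLES guard is hypothetical and never defined by the build).
   fast_add_vec sends short vectors to a scalar loop and longer ones to the
   Fortran BLAS saxpy_, which computes sy := sa*sx + sy with unit strides. */
#ifdef CONV_EXAMPLES
static void example_fast_vec_ops() {
  float acc[8], src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  fast_set_val(acc, 8, 1.0f);       // acc[j] = 1
  fast_add_vec(acc, src, 8, 0.5f);  // acc[j] += 0.5f*src[j], via saxpy_ since 8 > 4
  fast_set_vec(acc, src, 8, 1.0f);  // mul == 1: plain memcpy
}
#endif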
static inline void fast_div(float* __restrict__ a, long d, const float div) {
  const float divi = 1 / div;
  // assert( ((long)a & 15) == 0 && (d & 3) == 0 );
  // const float _divi4[] = {divi,divi,divi,divi};
  // __v4sf *a4 = (__v4sf*)a;
  // __v4sf *divi4 = (__v4sf*)_divi4;
  // int e = d>>2;
  // while(e--) *a4++ *= (*divi4);
  int j;
  for (j = 0; j < d; j++)
    a[j] *= divi;
}
/* Write into <dest> ((tx+ex) x (ty+ex)) a translated, scaled copy of <src> (tx x ty):
   dest(x,y) = mul*src(x+dx, y+dy) where defined, and mul*def outside the source.
   Returns the pointer past the end of the written block. */
static inline float* fast_set_trans(float* dest, const float* src, const float mul,
                                    int dx, int dy, const int tx, const int ty,
                                    const int ex, const float def) {
  if (mul == 0) {
    memset(dest, 0, sizeof(float) * (tx + ex) * (ty + ex));
    return dest + (tx + ex) * (ty + ex);
  }
  // beyond these values, nothing happens anyway
  if (dx > tx) dx = tx;
  if (dy > ty) dy = ty;
  if (-dx > tx) dx = -tx;
  if (-dy > ty) dy = -ty;
#define add_default(n) {fast_set_val(dest,(n),mul*def); dest+=(n);}
  float* _dest = dest;
  // paste -v default rows
  if (dy < 0)
    add_default(-dy * (tx + ex));
  src += MAX(0, dx);
  const int row_len = MIN(tx, tx + dx + ex) - MAX(0, dx);
  int j;
  for (j = MAX(0, dy); j < MIN(ty, ty + dy + ex); j++) {
    // paste -u default cols
    if (dx < 0)
      add_default(-dx);
    // paste image row
    fast_set_vec(dest, src + j * tx, row_len, mul);
    dest += row_len;
    // paste +u default cols
    if (dx >= 0) {
      add_default(dx)
      if (ex) add_default(ex)
    }
  }
  // paste +v default rows
  if (dy >= 0) {
    add_default(dy * (tx + ex))
    if (ex) add_default(ex * (tx + ex))
  }
#undef add_default
  assert(dest - _dest == (tx + ex) * (ty + ex));
  return dest;
}
/* Same as fast_set_trans, but accumulates: dest(x,y) += mul*src(x+dx, y+dy),
   or += mul*def outside the source image. */
static inline float* fast_add_trans(float* dest, const float* src, const float mul,
                                    int dx, int dy, const int tx, const int ty,
                                    const int ex, const float def) {
  if (mul == 0)
    return dest + (tx + ex) * (ty + ex);
  // beyond these values, nothing happens anyway
  if (dx > tx) dx = tx;
  if (dy > ty) dy = ty;
  if (-dx > tx) dx = -tx;
  if (-dy > ty) dy = -ty;
#define add_default(n) {fast_add_val(dest,n,def*mul); dest+=n;}
  float* _dest = dest;
  // paste -v default rows
  if (dy < 0)
    add_default(-dy * (tx + ex));
  src += MAX(0, dx);
  const int row_len = MIN(tx, tx + dx + ex) - MAX(0, dx);
  int j;
  for (j = MAX(0, dy); j < MIN(ty, ty + dy + ex); j++) {
    // paste -u default cols
    if (dx < 0)
      add_default(-dx);
    // paste image row
    fast_add_vec(dest, src + j * tx, row_len, mul);
    dest += row_len;
    // paste +u default cols
    if (dx >= 0) {
      add_default(dx)
      if (ex) add_default(ex)
    }
  }
  // paste +v default rows
  if (dy >= 0) {
    add_default(dy * (tx + ex))
    if (ex) add_default(ex * (tx + ex))
  }
#undef add_default
  assert(dest - _dest == (tx + ex) * (ty + ex));
  return dest;
}
static inline void norm_norm(float* norms, int nb, float mode) {
  int i;
  if (mode < 0)
    assert(!"error: unknown norm mode");
  else if (mode == 0.5) {
    for (i = 0; i < nb; i++)
      norms[i] = sqrt(sqrt(norms[i]));
  } else if (mode < 1) {
    mode *= 0.5;  // cumulate with initial 1/sqrt(.)
    for (i = 0; i < nb; i++)
      norms[i] = pow(norms[i], mode);
  } else if (mode == 1) {
    for (i = 0; i < nb; i++)
      norms[i] = sqrt(norms[i]);
  } else if (mode > 1)
    assert(!"error: unknown norm mode");
}
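/* Worked example of norm_norm's modes, assuming (as all callers in this file
   do) that norms[] holds *squared* L2 norms on entry:
     mode == 1    ->  sqrt(||x||^2)        = ||x||
     mode == 0.5  ->  sqrt(sqrt(||x||^2))  = ||x||^0.5
     0 < mode < 1 ->  (||x||^2)^(mode/2)   = ||x||^mode
   e.g. with ||x||^2 = 16 and mode = 0.5, the result is sqrt(sqrt(16)) = 2. */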
/* normalize each pixel of a multi-layer image
   norm = {0: nothing, 1: L2-normalization, 0-1: normalization by (L2-norm)**<norm>}
*/
void norm_layers(float_layers* res, float norm, int n_thread) {
  if (norm == 0) return;
  const int layer_size = res->tx * res->ty;
  const int n_layers = res->tz;
  float* norms = NEWAC(float, layer_size);
  long l;
  for (l = 0; l < n_layers; l++) {
    float* r = res->pixels + l * layer_size;
    int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
    for (i = 0; i < layer_size; i++)
      norms[i] += r[i] * r[i];
  }
  norm_norm(norms, layer_size, norm);
  for (l = 0; l < n_layers; l++) {
    float* r = res->pixels + l * layer_size;
    int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
    for (i = 0; i < layer_size; i++)
      r[i] /= norms[i] + 1e-8;
  }
  free(norms);
}
/* Return the vectorized dimension of a HOG patch
*/
int get_patch_desc_dim(float_layers* hog, int patch_size) {
  return patch_size * patch_size * hog->tz;  // number of dimensions of an atomic patch descriptor
}
/* Sample a set of patches from a HOG image.
   grid: array of (x,y) positions of the patches
   size: size of the patches, i.e. [x,x+size[ x [y,y+size[
   res: result array, n_patches x desc_dim
   desc_dim = n_layers * size**2
   norms: result, n_patches x 1, norm of each patch
*/
void _sample_patches(float_layers* hog, float_layers* color, int_image* grid, int size,
                     float norm, float_image* res, float_array* norms, int n_thread) {
  const int tx = hog->tx;
  const long npix = tx * hog->ty;
  assert(grid->tx == 2);
  const int n_patches = grid->ty;
  assert(res->ty == n_patches);
  const int n_layers = hog->tz;
  const int n_colors = (color ? color->tz : 0);
  const int color_npix = (color ? color->tx * color->ty : 0);
  const int desc_size = size * size * n_layers + (color ? color->tz : 0);
  assert(res->tx == desc_size);
  int n;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
  for (n = 0; n < n_patches; n++) {
    float* r = res->pixels + desc_size * n;
    int* p = grid->pixels + 2 * n;
    // copy hog
    int x = p[0], y = p[1];
    assert(0 <= x && x + size <= tx);
    assert(0 <= y && y + size <= hog->ty);
    int l, j;
    for (l = 0; l < n_layers; l++) {
      float* h = hog->pixels + l * npix + y * tx + x;
      for (j = 0; j < size; j++) {
        memcpy(r, h, size * sizeof(float));
        h += tx;
        r += size;
      }
    }
    if (!color) continue;
    // copy color
    float* c = color->pixels + (y + size / 2) * color->ty + (x + size / 2);
    for (l = 0; l < n_colors; l++)
      *r++ = c[l * color_npix];
  }
  if (norm) {
    float* normp = norms ? norms->pixels : NEWAC(float, n_patches);
    if (norms) {
      assert(norms->tx == n_patches);
      memset(normp, 0, n_patches * sizeof(float));
    }
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
    for (n = 0; n < n_patches; n++) {
      float* r = res->pixels + desc_size * n;
      int l;
      for (l = 0; l < desc_size; l++)
        normp[n] += r[l] * r[l];
    }
    norm_norm(normp, n_patches, norm);
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
    for (n = 0; n < n_patches; n++) {
      float* r = res->pixels + desc_size * n;
      int l;
      float nn = normp[n] + 1e-8;
      for (l = 0; l < desc_size; l++)
        r[l] /= nn;
    }
    if (!norms) free(normp);
  }
}
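/* Call-shape sketch for _sample_patches (editor's note; array types come from
   the image wrappers declared in the headers above):
     grid  : n_patches x 2 ints, top-left (x,y) corner of each patch
     res   : n_patches x desc_size floats,
             desc_size = size*size*hog->tz (+ color->tz if color is given)
     norms : optional n_patches floats receiving each patch norm
   Patches must lie fully inside the image, as the asserts above enforce. */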
static inline int retrieve_children(const int x, const int y, const int_cube* child_grid) {
  const int size0_div2 = child_grid->pixels[0];
  const int step0 = child_grid->tx == 1 && child_grid->ty == 1 ? 1 :
      MAX(child_grid->pixels[2] - child_grid->pixels[0],
          child_grid->pixels[1 + 2 * child_grid->tx] - child_grid->pixels[1]);
  int i = (x - size0_div2) / step0;
  int j = (y - size0_div2) / step0;
  assert(x == (i * step0 + size0_div2) || !"error: child_grid does not match current grid");
  assert(y == (j * step0 + size0_div2) || !"error: child_grid does not match current grid");
  if (i < 0 || i >= child_grid->tx) return -1;
  if (j < 0 || j >= child_grid->ty) return -1;
  return i + j * child_grid->tx;
}
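/* Worked example for retrieve_children: if the child grid was generated with
   offset size0_div2 = 4 and step0 = 8, its cell centers lie at 4, 12, 20, ...
   A query at (x,y) = (20,4) yields i = (20-4)/8 = 2 and j = 0, so the index
   2 + 0*child_grid->tx is returned; a query off the lattice, e.g. x = 21,
   trips the "child_grid does not match current grid" assertion. */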
/* Prepare a grid of cell positions in the first image for a given scale. Big cells inherit the cells of the previous scale.
   size = size of cells at current scale
   offset, step = grid generator: (offset + i*step, offset + j*step)
   child_grid = grid of the previous layer (or None if first layer)
   child_norms = image containing the norms of the patches at the previous level
   grid = result center positions of cells in current scale
   children = index of cells in previous scale used to construct big cells
   norms = norms of the cells of this level
*/
void _prepare_big_cells(int size, int offset, int step,
                        int_cube* child_grid, float_image* child_norms,
                        int_cube* grid, int_cube* children, float_image* norms) {
  assert(grid->tz == 2);
  const int ntx = grid->tx;  // should be == 1+(tx-size)/step so that patches do not pass the border
  const int nty = grid->ty;  // should be == 1+(ty-size)/step so that patches do not pass the border
  /* grid[i,j] = ( offset + i*step, offset + j*step )
     connection between two scales:
     x cell position in lower scale == x position of children in upper scale
     child_offset + child_i*child_step = offset + i*step + (2*u/(nc-1)-1)*size/4
  */
  int i, j, u, v;
  int* r = grid->pixels;
  if (!child_grid) {
    // this is the first scale:
    // we just return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
    for (j = 0; j < nty; j++)
      for (i = 0; i < ntx; i++) {
        *r++ = offset + i * step;
        *r++ = offset + j * step;
      }
  } else {
    assert(child_grid->tz == 2);
    ASSERT_SAME_SIZE(child_grid, child_norms);
    assert(children);
    const int nc = sqrt(children->tz);  // number of children per row or col
    assert(children->tz == pow2(nc));
    ASSERT_SAME_SIZE(grid, children);
    ASSERT_SAME_SIZE(grid, norms);
    // this is at least the second scale:
    // we return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
    const int quarter = size / 4;
    assert(4 * quarter == size);
    int* c = children->pixels;
    float* n = norms->pixels;
    memset(n, 0, ntx * nty * sizeof(float));
    for (j = 0; j < nty; j++)
      for (i = 0; i < ntx; i++) {
        int x = offset + i * step;
        int y = offset + j * step;
        *r++ = x;
        *r++ = y;
        // accumulate norms from 2x2 or 3x3 neighbors
        for (v = 0; v < nc; v++)
          for (u = 0; u < nc; u++, c++) {
            // we want to index the children at position:
            // ( center_x + (2*u/(nc-1)-1)*size/4, center_y + (2*v/(nc-1)-1)*size/4 )
            *c = retrieve_children(x + (2 * u / (nc - 1) - 1) * quarter,
                                   y + (2 * v / (nc - 1) - 1) * quarter, child_grid);
            if (*c >= 0)
              *n += child_norms->pixels[*c];
          }
        n++;
      }
  }
}
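/* Geometry example for the children layout above: with nc = 2, the four
   children of a cell centered at (x,y) sit at (x +/- size/4, y +/- size/4);
   with nc = 3, the factor 2*u/(nc-1)-1 takes the values -1, 0, 1 (integer
   division, exact since u ranges over 0..nc-1), giving a 3x3 layout that also
   includes the cell center itself. */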
/* Prepare image for dotprod : dot(patches, res)
where patches is n_patches x patch_dim
set outside of the image to be equal to (0,...,ninth_val)
*/
void _prepare_dotprod_convolution(float_layers* img, int patch_size, float ninth_val,
                                  int extend, float_layers* res, int n_thread) {
  assert(img->tx + extend == res->tx);
  assert(img->ty + extend == res->ty);
  const int n_layers = img->tz;
  const int tx = img->tx;
  const int ty = img->ty;
  const int npix = tx * ty;
  const int npixex = (tx + extend) * (ty + extend);
  assert(res->tz == patch_size * patch_size * img->tz);
  long l;
  const int first_half = patch_size / 2;  // half-size
  const int second_half = patch_size - first_half;
  const int layer_size = patch_size * patch_size * npixex;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
  for (l = 0; l < n_layers; l++) {
    float* img_pix = img->pixels + l * npix;
    float* r = res->pixels + l * layer_size;
    int u, v;
    // copy translated versions of the image into res
    for (v = -first_half; v < second_half; v++)
      for (u = -first_half; u < second_half; u++)
        r = fast_set_trans(r, img_pix, 1, u, v, tx, ty, extend,
                           l + 1 < n_layers ? 0 : ninth_val);
  }
}
float_layers* prepare_dotprod_convolution(float_layers* hog, int patch_size,
                                          int extend, float norm, int nt) {
  assert(0 <= extend and extend <= 1);
  const int nh = get_patch_desc_dim(hog, patch_size);
  const int etx = hog->tx + extend;  // extend the image a bit
  const int ety = hog->ty + extend;
  float_layers* res = NEW(float_layers);
  *res = empty_layers(float, etx, ety, nh);
  float ninth_val = 0;
  _prepare_dotprod_convolution(hog, patch_size, ninth_val, extend, res, nt);
  if (norm) norm_layers(res, norm, nt);
  return res;
}
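/* Layout note (follows from the loops above): for layer l and translation
   (u,v) with u,v in [-patch_size/2, patch_size - patch_size/2), res stores a
   shifted copy of the input layer, so correlating a vectorized patch
   descriptor against all positions reduces to one matrix product over the
   nh = patch_size^2 * tz stacked layers (see _dotprod below). */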
inline float sum_array_f(const float* a, int n) {
  int i = n;
  double res = 0;
  while (i--) res += a[i];
  return (float)res;
}
extern "C" {
int sgemm_(char* transa, char* transb, integer* m, integer* n, integer* k,
           float* alpha, float* a, integer* lda, float* b, integer* ldb,
           float* beta, float* c, integer* ldc);
}
/* matrix-matrix multiplication with several SGEMM (each is single-threaded)
     res    =  dot( patches, convolved_hog )
   P x npix      P x nh      nh x npix
*/
void _dotprod(float_image* patches, float_layers* convolved_hog, float_layers* res,
              int n_thread) {
  int nh = patches->tx;
  assert(nh == convolved_hog->tz);
  ASSERT_SAME_IMG_SIZE(convolved_hog, res);
  int P = patches->ty;
  assert(res->tz == P);
  int threadP = 1 + (P - 1) / n_thread;  // how many patches per thread
  int npix = (int)IMG_SIZE(convolved_hog);
  int l;
#if (defined(USE_OPENMP) && !defined(MULTITHREADED_BLAS))
#pragma omp parallel for num_threads(n_thread)
#else
  n_thread = 1;  // BLAS is already multithreaded
  threadP = P;
#endif
  for (l = 0; l < n_thread; l++) {
    // we do dotprod( patches[l*threadP : (l+1)*threadP], convolved_hog )
    long start = l * threadP;
    long end = MIN(P, (l + 1) * threadP);
    int np = int(end - start);
    float* p = patches->pixels + nh * start;
    float* r = res->pixels + npix * start;
    // blas fast matrix-matrix product
    char T = 'n';
    float alpha = 1, beta = 0;
    sgemm_(&T, &T, &npix, &np, &nh, &alpha, convolved_hog->pixels, &npix,
           p, &nh, &beta, r, &npix);
  }
}
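/* Dimension check for the sgemm_ call above (column-major BLAS convention):
   with transa = transb = 'n' it computes C(m x n) = A(m x k) * B(k x n) with
   m = npix, n = np, k = nh; A is convolved_hog->pixels (lda = npix), B is the
   block of np patch descriptors (ldb = nh) and C the corresponding block of
   response maps (ldc = npix). Read back in row-major terms, this is exactly
   res[start:end] = dot(patches[start:end], convolved_hog). */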
inline void transpose_scalar_block(const float* A, float* B, const int lda, const int ldb,
                                   const int block_row, const int block_col) {
  for (int i = 0; i < block_row; i++)
    for (int j = 0; j < block_col; j++)
      B[j * ldb + i] = A[i * lda + j];
}
// Transpose A (N rows by M cols) into B (M by N)
void transpose_matrix(const float_image* A, float_image* B, int nt) {
  const int n = A->ty, m = A->tx;
  assert(n == B->tx && m == B->ty);
  const int block_size = 16;
  const float* pA = A->pixels;
  float* pB = B->pixels;
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(nt)
#endif
  for (int i = 0; i < n; i += block_size)
    for (int j = 0; j < m; j += block_size)
      transpose_scalar_block(&pA[i * m + j], &pB[j * n + i], m, n,
                             MIN(block_size, n - i), MIN(block_size, m - j));
}
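/* Design note: the 16x16 blocking keeps source rows and destination columns
   in cache, which is why the transpose is worthwhile before the many sgemv_
   calls in _dotprod_ngh_rad_T below. A minimal sanity check follows (editor's
   sketch; the CONV_EXAMPLES guard is hypothetical and never defined, and
   empty_image is assumed to allocate as in its other uses in this file). */
#ifdef CONV_EXAMPLES
static void example_transpose() {
  float_image A = empty_image(float, 3, 2);  // 2 rows x 3 cols
  float_image B = empty_image(float, 2, 3);  // transposed shape
  for (int k = 0; k < 6; k++) A.pixels[k] = (float)k;  // rows: {0,1,2}, {3,4,5}
  transpose_matrix(&A, &B, 1);
  assert(B.pixels[1] == 3);  // B[0][1] == A[1][0]
  free(A.pixels);
  free(B.pixels);
}
#endif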
extern "C" {
int sgemv_(char* transa, integer* m, integer* n, float* alpha, float* a, integer* lda,
           float* b, integer* ldb, float* beta, float* c, integer* ldc);
}
/* convolution of each patch within a local neighborhood
   ngh_rad = max translation
   neighborhood has size 2*ngh_rad
   patch at (x,y) is compared to patches in [y-ngh_rad : y+ngh_rad,
                                             x-ngh_rad : x+ngh_rad]
*/
void _dotprod_ngh_rad_T(int_cube* grid, float_image* patches, int ngh_rad,
                        float_cube* convolved_hog, float_layers* res_out,
                        int_image* offsets, int n_thread) {
  int nh = patches->tx;
  assert(nh == convolved_hog->tz);
  const int P = patches->ty;
  assert(IMG_SIZE(grid) == P && grid->tz == 2);
  const int tx = convolved_hog->tx;
  const int ty = convolved_hog->ty;
  // neighborhood size
  int res_tx = MIN(tx, 2 * ngh_rad);
  int res_ty = MIN(ty, 2 * ngh_rad);
  assert(res_tx < tx - 1 || res_ty < ty - 1 ||
         !"ngh_rad is too large and results in loss of perf. Set ngh_rad=0 instead.");
  int res_npix = res_tx * res_ty;
  // allocate result
  *res_out = empty_layers(float, res_tx, res_ty, P);
  assert(res_out->pixels || !"error: ran out of memory before sgemm");
  *offsets = empty_image(int, 2, P);
  char T = 't';
  float alpha = 1, beta = 0;
  int one = 1;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
  for (int j = 0; j < res_ty; ++j) {
    // By organizing loops this way, we exploit overlap between patches.
    for (int l = 0; l < P; l++) {
      float* p = patches->pixels + l * nh;
      float* r = res_out->pixels + l * res_npix;
      int left = MAX(0, MIN(grid->pixels[2 * l + 0] - ngh_rad, tx - 2 * ngh_rad));
      int top  = MAX(0, MIN(grid->pixels[2 * l + 1] - ngh_rad, ty - 2 * ngh_rad));
      if (j == 0) {
        offsets->pixels[2 * l + 0] = left;
        offsets->pixels[2 * l + 1] = top;
      }
      float* c = convolved_hog->pixels + (left + top * tx) * nh;
      // blas fast matrix-vector product
      sgemv_(&T, &nh, &res_tx, &alpha, c + j * tx * nh, &nh,
             p, &one, &beta, r + j * res_tx, &one);
    }
  }
}
/* correct the convolution on the boundaries of the image
ttx, tty: true shape of the res_map (in case of using offsets)
*/
void rectify_conv(int patch_size, int nori, float_image* patches, int_image* offsets,
                  const int ttx, const int tty, int extend, float_layers* res, int n_thread) {
  const int n_patches = patches->ty;
  assert(n_patches == res->tz);
  //const int nori = patches->tx/pow2(patch_size);
  assert(patches->tx >= nori * pow2(patch_size));
  const int tx = res->tx;  // real true shape, because it has been extended
  const int ty = res->ty;
  const int first_half = patch_size / 2;
  const int second_half = patch_size - first_half;  // in case patch_size is odd
  assert(offsets || (ttx == tx && tty == ty));
  assert(!offsets || (ttx >= tx && tty >= ty));
  assert(!offsets || (offsets->ty == res->tz && offsets->tx == 2));
  const long npix = IMG_SIZE(res);
  int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
  for (l = 0; l < n_patches; l++) {
    // load offsets
    const int offi = offsets ? offsets->pixels[2 * l + 0] : 0;
    const int offj = offsets ? offsets->pixels[2 * l + 1] : 0;
    float sums[8];  // temporary norm of columns or rows
    assert(patch_size <= (int)(sizeof(sums) / sizeof(sums[0])));
    int o, i, j;
    // horizontal boundaries
    memset(sums, 0, sizeof(sums));
    float* p = patches->pixels + l * patches->tx;
    for (o = 0; o < nori; o++)
      for (j = 0; j < patch_size; j++)
        for (i = 0; i < patch_size; i++)
          sums[j] += pow2(*p++);
    float old_norm = sqrt(sum_array_f(sums, patch_size));
    if (old_norm == 0) continue;
    // upper boundary
    for (j = offj; j < first_half; j++) {
      float new_norm = sqrt(sum_array_f(sums + (first_half - j), second_half + j));  // sums to patch_size
      float mul = old_norm / (new_norm + 1e-8);
      float* r = res->pixels + l * npix + (j - offj) * tx;
      for (i = 0; i < tx; i++) {
        r[i] *= mul;
        //assert(r[i]<1.1);
      }
    }
    // lower boundary
    for (j = tty - extend + 1 - second_half; j < offj + ty; j++) {
      float new_norm = sqrt(sum_array_f(sums, first_half + tty - extend - j));  // sums to patch_size
      float mul = old_norm / (new_norm + 1e-8);
      float* r = res->pixels + l * npix + (j - offj) * tx;
      for (i = 0; i < tx; i++) {
        r[i] *= mul;
        //assert(r[i]<1.1);
      }
    }
    // vertical boundaries
    memset(sums, 0, sizeof(sums));
    p = patches->pixels + l * patches->tx;
    for (o = 0; o < nori; o++)
      for (j = 0; j < patch_size; j++)
        for (i = 0; i < patch_size; i++)
          sums[i] += pow2(*p++);
    // left boundary
    for (i = offi; i < first_half; i++) {
      float new_norm = sqrt(sum_array_f(sums + (first_half - i), second_half + i));
      float mul = old_norm / (new_norm + 1e-8);
      float* r = res->pixels + l * npix + (i - offi);
      for (j = 0; j < ty; j++) {
        r[j * tx] *= mul;
        //assert(r[j*tx]<1.1);
      }
    }
    // right boundary
    for (i = ttx - extend + 1 - second_half; i < offi + tx; i++) {
      float new_norm = sqrt(sum_array_f(sums, first_half + ttx - extend - i));
      float mul = old_norm / (new_norm + 1e-8);
      float* r = res->pixels + l * npix + (i - offi);
      for (j = 0; j < ty; j++) {
        r[j * tx] *= mul;
        //assert(r[j*tx]<1.1);
      }
    }
    // because we over-estimated the rectification for the corners,
    // check that they do not overpass old_norm
    float* r = res->pixels + l * npix;
    for (j = offj; j < first_half; j++) {
      for (i = offi; i < first_half; i++)
        r[(j - offj) * tx + (i - offi)] = MIN(r[(j - offj) * tx + (i - offi)], old_norm);
      for (i = ttx - extend + 1 - second_half; i < offi + tx; i++)
        r[(j - offj) * tx + (i - offi)] = MIN(r[(j - offj) * tx + (i - offi)], old_norm);
    }
    for (j = tty - extend + 1 - second_half; j < offj + ty; j++) {
      for (i = offi; i < first_half; i++)
        r[(j - offj) * tx + (i - offi)] = MIN(r[(j - offj) * tx + (i - offi)], old_norm);
      for (i = ttx - extend + 1 - second_half; i < offi + tx; i++)
        r[(j - offj) * tx + (i - offi)] = MIN(r[(j - offj) * tx + (i - offi)], old_norm);
    }
  }
}
/* Compute the correlation of all patches with the second image (hog).
   In case of ngh_rad>0, the correlation is only computed in a small local neighborhood
   (whose size is parameterized by ngh_rad).
   if extend: width and height of output maps are extended
   if norm: correlations are normalized afterwards.
*/
void fastconv(float_image* patches, float_layers* hog, int patch_size, int ngh_rad,
              int extend, float norm, int nt, res_scale* res) {
  assert(0 <= extend and extend <= 1);
  float_layers* convolved_hog = prepare_dotprod_convolution(hog, patch_size, extend, norm, nt);
  assert(patches->tx == convolved_hog->tz);
  res->true_shape[0] = convolved_hog->tx;
  res->true_shape[1] = convolved_hog->ty;
  //hash_layers(convolved_hog)
  int_image* offsets = NULL;
  if (ngh_rad == 0) {
    // no limit on translation
    // allocate result
    res->res_map = empty_layers(float, convolved_hog->tx, convolved_hog->ty, patches->ty);
    assert(res->res_map.pixels || !"error: ran out of memory before sgemm");
    // multi-threaded fast matrix product
    _dotprod(patches, convolved_hog, &res->res_map, nt);
  } else {
    // ngh_rad>0: cropping res_map
    offsets = &res->offsets;
    // transpose hog: _dotprod is much faster this way
    float_cube convolved_hog_T = empty_cube(float, convolved_hog->tx, convolved_hog->ty,
                                            convolved_hog->tz);
    {
      float_image A = reshape_xy_z(float, convolved_hog);  // cast to 2D matrix without copy
      float_image B = reshape_z_xy(float, &convolved_hog_T);
      transpose_matrix(&A, &B, nt);
    }
    //hash_cube(&convolved_hog_T)
    // resized grid
    int_cube fgrid = cube_like(int, &res->grid);
    for (int i = 0; i < CUBE_SIZE(&fgrid); i++)
      fgrid.pixels[i] = res->grid.pixels[i] / res->f;
    //hash_cube(&fgrid)
    // multi-threaded fast matrix product
    _dotprod_ngh_rad_T(&fgrid, patches, ngh_rad, &convolved_hog_T, &res->res_map, offsets, nt);
    free(fgrid.pixels);
    free(convolved_hog_T.pixels);
    //hash_image(offsets)
  }
  free_layers(convolved_hog);
  // correct border effects on the correlation maps
  rectify_conv(patch_size, hog->tz, patches, offsets, res->true_shape[0], res->true_shape[1],
               extend, &res->res_map, nt);
}
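/* Pipeline summary for fastconv, as implemented above:
   1. prepare_dotprod_convolution stacks patch_size^2 translated copies of the
      target HOG image (optionally normalized) so correlation becomes a dot product;
   2. _dotprod (ngh_rad == 0) or transpose_matrix + _dotprod_ngh_rad_T
      (ngh_rad > 0) performs the products with BLAS;
   3. rectify_conv re-normalizes responses of patches straddling the border. */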
/* Compute: arr **= p
*/
void fastipow(float_layers* arr, const float p, int n_thread) {
  const int n_layers = arr->tz;
  const long npix = arr->tx * arr->ty;
  int l;
  // optimization: precompute some values of pow(x,p)
  const int npc = 64;
  float precom[npc + 1];
  for (l = 0; l <= npc; l++)
    precom[l] = pow(l / (float)npc, p);
  const float maxindex = npc - 0.001;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
  for (l = 0; l < n_layers; l++) {
    float* a = arr->pixels + l * npix;
    int i;
    for (i = 0; i < npix; i++) {
      // arr[i] = pow(arr[i],p);
      float v = a[i] * npc;
      assert(v >= 0 && v < npc + 1);
      if (v > maxindex) v = maxindex;
      int n = int(v);
      float w = v - n;
      a[i] = (1 - w) * precom[n] + w * precom[n + 1];  // linear interpolation in the table
    }
  }
}
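/* Worked example of the table lookup above (inputs are assumed to lie in
   [0,1]; the assert enforces it): with npc = 64 and p = 2, a value a[i] = 0.5
   maps to v = 32, n = 32, w = 0, so the result is precom[32] = (32/64)^2 =
   0.25 exactly. Between knots, the standard linear-interpolation bound gives
   an error of at most h^2*max|f''|/8 with h = 1/npc and f(x) = x^p. */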
/* Compute: arr = max(0,(arr-p)/(1-p))
*/
void fasthinge(float_layers* arr, const float p, int n_thread) {
  const int n_layers = arr->tz;
  const long npix = arr->tx * arr->ty;
  int l;
  const float f = 1 / (1 - p);
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
  for (l = 0; l < n_layers; l++) {
    float* a = arr->pixels + l * npix;
    int i;
    for (i = 0; i < npix; i++) {
      float v = a[i];
      a[i] = MAX(0, f * (v - p));
    }
  }
}
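/* The hinge above is the affine ramp arr = max(0, (arr-p)/(1-p)): scores at
   or below the threshold p are clamped to 0 and the remaining range (p,1] is
   stretched back to (0,1], so a perfect correlation of 1 stays 1. */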
inline int max_array_i(const int* a, int n) {
  int i = n;
  int res = INT_MIN;
  while (i--)
    if (a[i] > res) res = a[i];
  return res;
}
/* Normalize weights in border areas of width <gap>.
   There are 9 areas: top-left, top-middle, top-right, ..., bottom-right.
   sum_divf indicates the current weight in those areas, i.e. values in an area
   should be divided by its weight. trans_inv controls the amount of
   normalization: 0 = no normalization, 1 = full normalization.
*/
static inline void normalize_trans(const int tx, const int ty, const int gap,
                                   float* rmap, const float trans_inv, float sum_divf[9]) {
  if (trans_inv == 0) return;
  int i, j;
  for (i = 0; i < 9; i++) {
    if (sum_divf[i] > 0)
      sum_divf[i] = 1 / pow(sum_divf[i], trans_inv);  // if trans_inv==1, the pow() has no effect
  }
  for (j = 0; j < gap; j++) {
    if (sum_divf[0])
      for (i = 0; i < gap; i++) rmap[j * tx + i] *= sum_divf[0];
    if (sum_divf[1])
      for (i = gap; i < tx - gap; i++) rmap[j * tx + i] *= sum_divf[1];
    if (sum_divf[2])
      for (i = tx - gap; i < tx; i++) rmap[j * tx + i] *= sum_divf[2];
  }
  for (; j < ty - gap; j++) {
    if (sum_divf[3])
      for (i = 0; i < gap; i++) rmap[j * tx + i] *= sum_divf[3];
    if (sum_divf[5])
      for (i = tx - gap; i < tx; i++) rmap[j * tx + i] *= sum_divf[5];
  }
  for (; j < ty; j++) {
    if (sum_divf[6])
      for (i = 0; i < gap; i++) rmap[j * tx + i] *= sum_divf[6];
    if (sum_divf[7])
      for (i = gap; i < tx - gap; i++) rmap[j * tx + i] *= sum_divf[7];
    if (sum_divf[8])
      for (i = tx - gap; i < tx; i++) rmap[j * tx + i] *= sum_divf[8];
  }
}
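/* Area indexing shared by normalize_trans above and _sparse_conv below:
   sum_divf[4 + j*3 + i] with i,j in {-1,0,1} maps the border areas as
     0 1 2   (top-left,    top,    top-right)
     3 4 5   (left,      center,   right; index 4 is never rescaled)
     6 7 8   (bottom-left, bottom, bottom-right). */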
/* Compute the (sparse) convolutions specified by <children> on <map> and put the result in <res>.
   A standard order is assumed on the children:
   a response map #p is built from the children[p] at positions
     [(gap*dx, gap*dy) for dy in dys for dx in dxs]
   where dxs = [-1,1] or [-1,0,1]
         dys = [-1,1] or [-1,0,1]
   child_assign denotes the assignment of the children level, while assign is for the next level
   child_norms contains the norms of the small patches, and norms those of the big new cells
*/
int _sparse_conv(int_image* children, int_array* child_assign, int gap, float trans_inv,
                 float_layers* child_map, int_image* offsets, float_array* child_norms,
                 float_array* norms, int_array* assign, float_layers* res,
                 int_image* res_offsets, int n_thread) {
  const int nconv = children->ty;  // number of convolutions to perform
  const int nc2 = children->tx;
  const int nc = sqrt(nc2);
  assert(nc * nc == nc2);
  assert(res->tz == nconv);
  const int tx = child_map->tx;
  const int ty = child_map->ty;
  const long npix = tx * ty;
  ASSERT_SAME_SIZE(child_map, res);
  const int n_lower_conv = max_array_i(children->pixels, nconv * nc2) + 1;
  int* cass = child_assign ? child_assign->pixels : NEWA(int, n_lower_conv);
  if (!child_assign) {
    for (int i = 0; i < n_lower_conv; i++) cass[i] = i;
  }
  assert(!offsets || (offsets->pixels && offsets->tx == 2 && offsets->ty == n_lower_conv &&
                      res_offsets && res_offsets->tx == 2 && res_offsets->ty == nconv));
  if (assign) {
    assert(0);  // not supposed to happen
  } else {
    // normal case: no redundancy to exploit in the response maps
    int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
    for (l = 0; l < nconv; l++) {
      float* rmap = res->pixels + l * npix;
      int u, v, c, ncall = 0;
      const int* const child = children->pixels + l * nc2;  // children numbers
      float sum_divf[9];
      memset(sum_divf, 0, sizeof(sum_divf));
      int i, j;
      // first, choose an offset for the result rmap from the child offsets
      int offx = 0, offy = 0;
      if (offsets) {
        int sum_ox = 0, sum_oy = 0, w = 0;
        for (c = v = 0; v < nc; v++) {
          int dy = (2 * v / (nc - 1) - 1);
          for (u = 0; u < nc; u++, c++) {
            int dx = (2 * u / (nc - 1) - 1);
            if (child[c] < 0 || cass[child[c]] < 0) continue;
            sum_ox += offsets->pixels[2 * child[c] + 0] - dx * gap;
            sum_oy += offsets->pixels[2 * child[c] + 1] - dy * gap;
            w++;
          }
        }
        if (w == 0) w++;  // just in case
        offx = (int)floor(0.5 + sum_ox / float(w));
        offy = (int)floor(0.5 + sum_oy / float(w));
        // store result for later
        res_offsets->pixels[2 * l + 0] = offx;
        res_offsets->pixels[2 * l + 1] = offy;
      }
      for (c = v = 0; v < nc; v++) {
        int dy = (2 * v / (nc - 1) - 1);
        for (u = 0; u < nc; u++, c++) {
          int dx = (2 * u / (nc - 1) - 1);
          if (child[c] < 0 || cass[child[c]] < 0) continue;
          float divf = child_norms->pixels[child[c]] / norms->pixels[l];
          // difference with rmap's offset
          const int trans_x = dx * gap + (offsets ? offx - offsets->pixels[2 * child[c] + 0] : 0);
          const int trans_y = dy * gap + (offsets ? offy - offsets->pixels[2 * child[c] + 1] : 0);
          // count the sum of weights in every image area
          for (i = -1; i <= 1; i++)
            for (j = -1; j <= 1; j++)
              if (i * trans_x <= 0 && j * trans_y <= 0)
                sum_divf[4 + j * 3 + i] += divf;
          // add a translated version of map[children[c]] by (ox-dx,oy-dy)
          if (ncall++ == 0)  // first call
            fast_set_trans(rmap, child_map->pixels + cass[child[c]] * npix, divf,
                           trans_x, trans_y, tx, ty, 0, 0);
          else
            fast_add_trans(rmap, child_map->pixels + cass[child[c]] * npix, divf,
                           trans_x, trans_y, tx, ty, 0, 0);
        }
      }
      if (ncall == 0)  // default = zeros
        memset(rmap, 0, npix * sizeof(float));
      // now we are supposed to rectify the boundaries (to get a perfect convolution)
      normalize_trans(tx, ty, gap, rmap, trans_inv, sum_divf);
      //assert(min_array_f(rmap,npix)>=0 && max_array_f(rmap,npix)<=1.001);
    }
  }
  if (!child_assign) free(cass);
#define CHECK_MAPS(rmaps) assert(min_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))>=0 && \
                                 max_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))<=1.001)
  //CHECK_MAPS(res);
  return nconv;
}