conv.cpp
/*
Copyright (C) 2014 Jerome Revaud
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "conv.h"
#include "std.h"
#include "omp.h"
#include "maxfilter.h"
extern "C" {
#include <immintrin.h>
#define integer int
#define real float
extern int saxpy_(integer *n, real *sa, real *sx, integer *incx, real *sy, integer *incy);
extern int sscal_(integer *n, real *sa, real *sx, integer *incx);
}
/* Set a[0..d) to val (memset fast path when val==0). */
static inline void fast_set_val( float * __restrict__ a, long d, const float val) {
if(val) {
int j;
for(j=0; j<d; j++)
a[j] = val;
} else
memset(a,0,d*sizeof(float));
}
/* Add val to each of the d entries of a. */
static inline void fast_add_val( float * __restrict__ a, long d, const float val) {
int j;
for(j=0; j<d; j++)
a[j] += val;
}
/* Copy src into dest scaled by mul (memcpy fast path when mul==1). */
static inline void fast_set_vec( float * __restrict__ dest,
const float * __restrict__ src, int d, const float mul) {
if( mul==1)
memcpy(dest,src,d*sizeof(float));
else {
int j;
for(j=0; j<d; j++)
dest[j] = mul*src[j];
}
}
/* dest[0..d) += mul*add[0..d); uses BLAS saxpy except for tiny vectors,
   where the call overhead would dominate. */
static inline void fast_add_vec( float * __restrict__ dest,
const float * __restrict__ add, int d, float mul) {
if(d<=4) {
int j;
for(j=0; j<d; j++)
dest[j] += mul*add[j];
} else {
int inc = 1;
saxpy_( &d, &mul, (float*)add, &inc, (float*)dest, &inc );
}
}
/* Multiply each of the d entries of a by 1/div. */
static inline void fast_div( float * __restrict__ a, long d, const float div) {
const float divi = 1/div;
// assert( ((long)a & 15) == 0 && (d & 3) == 0 );
// const float _divi4[] = {divi,divi,divi,divi};
// __v4sf *a4 = (__v4sf*)a;
// __v4sf *divi4 = (__v4sf*)_divi4;
// int e = d>>2;
// while(e--) *a4++ *= (*divi4);
int j;
for(j=0; j<d; j++)
a[j] *= divi;
}
/* Paste a translated copy of src into dest: dest(x,y) = mul*src(x+dx, y+dy),
   filling uncovered cells with mul*def. dest has shape (tx+ex) x (ty+ex);
   returns the dest pointer advanced past the written block. */
static inline float* fast_set_trans( float * dest, const float * src, const float mul,
                int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
if(mul==0) {
memset(dest,0,sizeof(float)*(tx+ex)*(ty+ex));
return dest+(tx+ex)*(ty+ex);
}
if(dx>tx) dx=tx; // beyond these values, nothing happens anyway
if(dy>ty) dy=ty;
if(-dx>tx) dx=-tx;
if(-dy>ty) dy=-ty;
#define add_default(n) {fast_set_val(dest,(n),mul*def); dest+=(n);}
float* _dest = dest;
// paste -v zeros rows
if(dy<0) add_default(-dy*(tx+ex));
src += MAX(0,dx);
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
int j;
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
// paste -u zeros cols
if(dx<0) add_default(-dx);
// paste image row
fast_set_vec(dest,src+j*tx,row_len,mul);
dest += row_len;
// paste +u zeros cols
if(dx>=0) {add_default(dx)
if(ex) add_default(ex)}
}
// paste +v zeros rows
if(dy>=0){add_default(dy*(tx+ex))
if(ex) add_default(ex*(tx+ex))}
#undef add_default
assert( dest-_dest == (tx+ex)*(ty+ex) );
return dest;
}
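/* Illustrative sketch (not part of the original file): pasting a 2x2 image
   shifted by dx=1 keeps only its right column. */
static inline void example_fast_set_trans(void) {
    const float src[4] = {1,2,
                          3,4};    // 2x2 image, row-major
    float dest[4];
    fast_set_trans(dest, src, 1.f, /*dx*/1, /*dy*/0, /*tx*/2, /*ty*/2, /*ex*/0, /*def*/0.f);
    // dest is now {2,0,
    //              4,0}  since dest(x,y) = src(x+1,y) and uncovered cells get def=0
}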
/* Same as fast_set_trans, but accumulates into dest instead of overwriting. */
static inline float* fast_add_trans( float * dest, const float * src, const float mul,
                int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
if(mul==0) return dest+(tx+ex)*(ty+ex);
if(dx>tx) dx=tx; // beyond these values, nothing happens anyway
if(dy>ty) dy=ty;
if(-dx>tx) dx=-tx;
if(-dy>ty) dy=-ty;
#define add_default(n) {fast_add_val(dest,n,def*mul); dest+=n;}
float* _dest = dest;
// paste -v zeros rows
if(dy<0) add_default(-dy*(tx+ex));
src += MAX(0,dx);
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
int j;
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
// paste -u zeros cols
if(dx<0) add_default(-dx);
// paste image row
fast_add_vec(dest,src+j*tx,row_len,mul);
dest += row_len;
// paste +u zeros cols
if(dx>=0) {add_default(dx)
if(ex) add_default(ex)}
}
// paste +v zeros rows
if(dy>=0){add_default(dy*(tx+ex))
if(ex) add_default(ex*(tx+ex))}
#undef add_default
assert( dest-_dest == (tx+ex)*(ty+ex) );
return dest;
}
static inline void norm_norm( float* norms, int nb, float mode ) {
int i;
if( mode < 0 )
assert(!"error: unknown norm mode");
else if( mode == 0.5 ) {
for(i=0; i<nb; i++)
norms[i] = sqrt(sqrt(norms[i]));
} else if( mode < 1 ) {
mode *= 0.5; // cumulate with initial 1/sqrt(.)
for(i=0; i<nb; i++)
norms[i] = pow(norms[i], mode);
} else if( mode == 1 ) {
for(i=0; i<nb; i++)
norms[i] = sqrt(norms[i]);
} else if( mode > 1 )
assert(!"error: unknown norm mode");
}
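/* Illustrative sketch (not part of the original file): norm_norm maps the
   accumulated *squared* norms to the final divisor. */
static inline void example_norm_norm(void) {
    float sq[2] = {16.f, 81.f};  // sums of squares
    norm_norm(sq, 2, 1.f);       // mode 1   -> {4, 9} : plain L2 norms
    float sq2[2] = {16.f, 81.f};
    norm_norm(sq2, 2, 0.5f);     // mode 0.5 -> {2, 3} : (L2 norm)**0.5
}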
/* Normalize each pixel of a multi-layer image.
   norm = {0: nothing, 1: L2-normalization, 0<norm<1: normalization by (L2-norm)**norm}
*/
void norm_layers( float_layers* res, float norm, int n_thread ) {
if(norm==0) return;
const int layer_size = res->tx*res->ty;
const int n_layers = res->tz;
float* norms = NEWAC(float,layer_size);
long l;
for(l=0; l<n_layers; l++) {
float* r = res->pixels + l*layer_size;
int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(i=0; i<layer_size; i++)
norms[i] += r[i]*r[i];
}
norm_norm( norms, layer_size, norm );
for(l=0; l<n_layers; l++) {
float* r = res->pixels + l*layer_size;
int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(i=0; i<layer_size; i++)
r[i] /= norms[i]+1e-8;
}
free(norms);
}
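/* Minimal usage sketch (illustration only; assumes empty_layers allocates as
   used elsewhere in this file): L2-normalize each pixel across layers. */
static inline void example_norm_layers(void) {
    float_layers img = empty_layers(float, 2, 1, 2);  // 2 pixels, 2 layers
    img.pixels[0] = 3; img.pixels[1] = 0;             // layer 0
    img.pixels[2] = 4; img.pixels[3] = 2;             // layer 1
    norm_layers(&img, 1.f, /*n_thread*/1);
    // pixel 0 held (3,4) -> ~(0.6, 0.8); pixel 1 held (0,2) -> ~(0, 1)
    free(img.pixels);
}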
/* Return the vectorized dimension of a HOG patch
*/
int get_patch_desc_dim( float_layers* hog, int patch_size )
{
return patch_size*patch_size * hog->tz; // number of dimensions of an atomic patch descriptor
}
/* Sample a set of patches from a HOG image.
   grid : array of (x,y) positions of the patches
   size : size of the patches, i.e. [x,x+size[ x [y,y+size[
   res  : result array, n_patches x desc_dim
          desc_dim = n_layers * size**2
   norms: result, n_patches x 1, norm of each patch
*/
void _sample_patches( float_layers* hog, float_layers* color, int_image* grid, int size, float norm,
float_image* res, float_array* norms, int n_thread ) {
const int tx = hog->tx;
const long npix = tx*hog->ty;
assert( grid->tx == 2 );
const int n_patches = grid->ty;
assert( res->ty == n_patches );
const int n_layers = hog->tz;
const int n_colors = (color? color->tz: 0);
const int color_npix = (color? color->tx*color->ty: 0);
const int desc_size = size*size*n_layers + (color? color->tz: 0);
assert(res->tx == desc_size );
int n;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int *p = grid->pixels + 2*n;
// copy hog
int x=p[0],y=p[1];
assert(0<=x && x+size<=tx);
assert(0<=y && y+size<=hog->ty);
int l,j;
for(l=0; l<n_layers; l++) {
float* h = hog->pixels + l*npix + y*tx + x;
for(j=0; j<size; j++) {
memcpy(r, h, size*sizeof(float));
h += tx;
r += size;
}
}
if(!color) continue;
// copy color
float* c = color->pixels + (y+size/2)*color->tx + (x+size/2); // offset of the patch center; row stride is the image width
for(l=0; l<n_colors; l++)
*r++ = c[l*color_npix];
}
if(norm) {
float* normp = norms ? norms->pixels : NEWAC(float, n_patches);
if(norms) {
assert(norms->tx==n_patches);
memset(normp,0,n_patches*sizeof(float));
}
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int l;
for(l=0; l<desc_size; l++)
normp[n] += r[l]*r[l];
}
norm_norm( normp, n_patches, norm );
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int l;
float nn = normp[n]+1e-8;
for(l=0; l<desc_size; l++)
r[l] /= nn;
}
if(!norms) free(normp);
}
}
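/* Minimal usage sketch (illustration only; assumes empty_image behaves as used
   elsewhere in this file, and that hog is at least 8x4): sample two 4x4 patches. */
static inline void example_sample_patches(float_layers* hog) {
    int_image grid = empty_image(int, 2, 2);   // two (x,y) corners
    grid.pixels[0] = 0; grid.pixels[1] = 0;    // patch at (0,0)
    grid.pixels[2] = 4; grid.pixels[3] = 0;    // patch at (4,0)
    float_image res = empty_image(float, get_patch_desc_dim(hog,4), 2);
    _sample_patches(hog, NULL, &grid, 4, /*norm*/1.f, &res, NULL, /*n_thread*/1);
    free(grid.pixels); free(res.pixels);
}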
static inline int retrieve_children( const int x, const int y, const int_cube* child_grid ) {
const int size0_div2 = child_grid->pixels[0];
const int step0 = child_grid->tx==1 && child_grid->ty==1 ? 1 :
MAX( child_grid->pixels[2]-child_grid->pixels[0],
child_grid->pixels[1+2*child_grid->tx]-child_grid->pixels[1] );
int i = (x-size0_div2)/step0;
int j = (y-size0_div2)/step0;
assert( x==(i*step0+size0_div2) || !"error: child_grid does not match current grid" );
assert( y==(j*step0+size0_div2) || !"error: child_grid does not match current grid" );
if( i<0 || i>=child_grid->tx ) return -1;
if( j<0 || j>=child_grid->ty ) return -1;
return i+j*child_grid->tx;
}
/* Prepare a grid of cell positions in the first image for a given scale. Big cells are built from the cells at the previous scale.
   size = size of cells at current scale
   offset, step = grid generator: (offset + i*step, offset + j*step)
   child_grid = grid of the previous layer (or NULL if first layer)
   child_norms = image containing the norms of the patches at the previous level
   grid = resulting center positions of cells at the current scale
   children = indices of cells at the previous scale used to construct big cells
   norms = norms of the cells of this level
*/
void _prepare_big_cells( int size, int offset, int step,
int_cube* child_grid, float_image* child_norms,
int_cube* grid, int_cube* children, float_image* norms ) {
assert(grid->tz==2);
const int ntx = grid->tx; // should be == 1+(tx-size)/step so that patches do not pass the border
const int nty = grid->ty; // should be == 1+(ty-size)/step so that patches do not pass the border
/* grid[i,j] = ( offset + i*step, offset + j*step )
connection between two scales:
x cell position in lower scale == x position of children in upper scale
child_offset + child_i*child_step = offset + i*step + (2*u/(nc-1)-1)*size/4
*/
int i,j,u,v;
int* r = grid->pixels;
if( !child_grid ) {
// this is the first scale:
// we just return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
for(j=0; j<nty; j++)
for(i=0; i<ntx; i++) {
*r++ = offset + i*step;
*r++ = offset + j*step;
}
} else {
assert(child_grid->tz==2);
ASSERT_SAME_SIZE( child_grid, child_norms );
assert( children );
const int nc = sqrt(children->tz); // number of children per row or col
assert( children->tz==pow2(nc) );
ASSERT_SAME_SIZE( grid, children );
ASSERT_SAME_SIZE( grid, norms );
// this is at least second scale
// we return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
const int quarter = size/4;
assert(4*quarter==size);
int* c = children->pixels;
float *n = norms->pixels;
memset(n,0,ntx*nty*sizeof(float));
for(j=0; j<nty; j++)
for(i=0; i<ntx; i++) {
int x = offset + i*step;
int y = offset + j*step;
*r++ = x;
*r++ = y;
// accumulate norms from 2x2 or 3x3 neighbors
for(v=0; v<nc; v++)
for(u=0; u<nc; u++,c++) {
// we want to index the children at position:
// ( center_x + (2*u/(nc-1)-1)*size/4, center_y + (2*v/(nc-1)-1)*size/4 )
*c = retrieve_children( x+(2*u/(nc-1)-1)*quarter, y+(2*v/(nc-1)-1)*quarter, child_grid );
if(*c>=0) *n += child_norms->pixels[*c];
}
n++;
}
}
}
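/* Worked example (illustration only): at the first scale with offset=4 and
   step=8, grid[j][i] = (4+8*i, 4+8*j). At the next scale with size=16 and
   nc=2, the cell centered at (x,y) aggregates the 4 children found at
   (x +/- 4, y +/- 4), i.e. at offsets of +/- size/4, and sums their norms. */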
/* Prepare image for dotprod: dot(patches, res)
   where patches is n_patches x patch_dim.
   Pixels outside of the image are set to (0,...,ninth_val).
*/
void _prepare_dotprod_convolution( float_layers* img, int patch_size, float ninth_val, int extend,
float_layers* res, int n_thread ) {
assert( img->tx+extend == res->tx );
assert( img->ty+extend == res->ty );
const int n_layers = img->tz;
const int tx = img->tx;
const int ty = img->ty;
const int npix = tx*ty;
const int npixex = (tx+extend)*(ty+extend);
assert( res->tz==patch_size*patch_size*img->tz );
long l;
const int first_half = patch_size/2; // half-size
const int second_half = patch_size - first_half;
const int layer_size = patch_size*patch_size*npixex;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* img_pix = img->pixels + l*npix;
float* r = res->pixels + l*layer_size;
int u,v;
// copy translated version of the image into res
for(v=-first_half; v<second_half; v++)
for(u=-first_half; u<second_half; u++)
r = fast_set_trans( r, img_pix, 1, u, v, tx, ty, extend, l+1<n_layers? 0 : ninth_val );
}
}
float_layers* prepare_dotprod_convolution( float_layers* hog, int patch_size, int extend, float norm, int nt )
{
assert(0<=extend and extend<=1);
const int nh = get_patch_desc_dim(hog,patch_size);
const int etx = hog->tx+extend; // extend a bit the image
const int ety = hog->ty+extend;
float_layers* res = NEW(float_layers);
*res = empty_layers(float,etx,ety,nh);
float ninth_val = 0;
_prepare_dotprod_convolution( hog, patch_size, ninth_val, extend, res, nt );
if( norm ) norm_layers( res, norm, nt );
return res;
}
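/* Sketch (illustration only): layer l*patch_size^2 + (v+patch_size/2)*patch_size
   + (u+patch_size/2) of the result holds hog layer l translated by (u,v), so
   correlating every patch with every position reduces to the single matrix
   product performed in _dotprod below. Assumes the patch descriptors were
   sampled with the same patch_size from a HOG image of the same depth. */
static inline float_layers* example_prepare_dotprod(float_layers* hog) {
    // 4x4 patches, 1-pixel extension, L2-normalize the stacked translations;
    // the caller owns (and must free) the returned layers
    return prepare_dotprod_convolution(hog, 4, 1, 1.f, /*n_thread*/1);
}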
inline float sum_array_f(const float* a, int n) {
int i=n;
double res = 0;
while(i--) res+=a[i];
return (float)res;
}
extern "C" {
int sgemm_(char *transa, char *transb, integer *m, integer *
n, integer *k, float *alpha, float *a, integer *lda, float *b, integer *
ldb, float *beta, float *c, integer *ldc);
}
/* matrix-matrix multiplication with several SGEMM calls (each is single-threaded)
   res      =  dot( patches ,  convolved_hog )
   P x npix      ( P x nh )    ( nh x npix )
*/
void _dotprod( float_image* patches, float_layers* convolved_hog, float_layers* res, int n_thread ) {
int nh = patches->tx;
assert( nh == convolved_hog->tz );
ASSERT_SAME_IMG_SIZE( convolved_hog, res );
int P = patches->ty;
assert( res->tz == P );
int threadP = 1 + (P-1) / n_thread; // how many patches per thread
int npix = (int)IMG_SIZE(convolved_hog);
int l;
#if (defined(USE_OPENMP) && !defined(MULTITHREADED_BLAS))
#pragma omp parallel for num_threads(n_thread)
#else
n_thread = 1; // BLAS is already multithreaded
threadP = P;
#endif
for(l=0; l<n_thread; l++) {
// we do dotprod( patches[l*threadP : (l+1)*threadP], convolved_hog )
long start = l*threadP;
long end = MIN(P,(l+1)*threadP);
int np = int(end - start);
float* p = patches->pixels + nh*start;
float* r = res->pixels + npix*start;
// blas fast matrix-matrix product
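// C arrays are row-major while Fortran BLAS expects column-major, so the
// row-major product res = p * convolved_hog is obtained by computing its
// transpose in column-major form:
//   res^T (npix x np) = convolved_hog^T (npix x nh) * p^T (nh x np)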
char T='n'; float alpha = 1, beta = 0;
sgemm_( &T, &T, &npix, &np, &nh, &alpha,
convolved_hog->pixels, &npix,
p, &nh, &beta, r, &npix);
}
}
inline void transpose_scalar_block(const float *A, float *B, const int lda, const int ldb,
const int block_row, const int block_col) {
for(int i=0; i<block_row; i++)
for(int j=0; j<block_col; j++)
B[j*ldb + i] = A[i*lda +j];
}
// Transpose A (N rows by M cols) into B (M by N), block by block (cache-friendly)
void transpose_matrix(const float_image* A, float_image* B, int nt) {
const int n = A->ty, m = A->tx;
assert( n==B->tx && m==B->ty );
const int block_size = 16;
const float* pA = A->pixels;
float* pB = B->pixels;
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(nt)
#endif
for(int i=0; i<n; i+=block_size)
for(int j=0; j<m; j+=block_size)
transpose_scalar_block(&pA[i*m +j], &pB[j*n + i], m, n, MIN(block_size, n-i), MIN(block_size, m-j));
}
extern "C" {
int sgemv_(char *transa, integer *m, integer * n,
float *alpha, float *a, integer *lda,
float *b, integer * ldb, float *beta,
float *c, integer * ldc);
}
/* convolution of each patch within a local neighborhood
   ngh_rad = max translation
   neighborhood has size 2*ngh_rad
   patch at (x,y) is compared to patches in [y-ngh_rad : y+ngh_rad,
                                             x-ngh_rad : x+ngh_rad]
*/
void _dotprod_ngh_rad_T( int_cube* grid, float_image* patches, int ngh_rad,
float_cube* convolved_hog, float_layers* res_out,
int_image* offsets, int n_thread ) {
int nh = patches->tx;
assert( nh == convolved_hog->tz );
const int P = patches->ty;
assert( IMG_SIZE(grid)==P && grid->tz==2 );
const int tx = convolved_hog->tx;
const int ty = convolved_hog->ty;
// neighborhood size
int res_tx = MIN(tx,2*ngh_rad);
int res_ty = MIN(ty,2*ngh_rad);
assert(res_tx<tx-1 || res_ty<ty-1 || !"ngh_rad is too large and results in loss of perf. Set ngh_rad=0 instead.");
int res_npix = res_tx * res_ty;
// allocate result
*res_out = empty_layers(float, res_tx, res_ty, P);
assert(res_out->pixels || !"error: ran out of memory before sgemm");
*offsets = empty_image(int, 2, P);
char T='t'; float alpha=1, beta=0; int one=1;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(int j=0; j<res_ty; ++j) {
// By organizing loops this way,
// we exploit overlap between patches.
for(int l=0; l<P; l++) {
float* p = patches->pixels + l*nh;
float* r = res_out->pixels + l*res_npix;
int left = MAX(0, MIN(grid->pixels[2*l+0] - ngh_rad, tx-2*ngh_rad));
int top = MAX(0, MIN(grid->pixels[2*l+1] - ngh_rad, ty-2*ngh_rad));
if(j==0) {
offsets->pixels[2*l+0] = left;
offsets->pixels[2*l+1] = top;
}
float* c = convolved_hog->pixels + (left + top*tx)*nh;
// blas fast matrix-vector product: computes row j of the cropped map,
// r[j*res_tx + i] = dot(p, descriptor at pixel (left+i, top+j))
sgemv_( &T, &nh, &res_tx, &alpha, c + j*tx*nh, &nh,
p, &one, &beta, r + j*res_tx, &one);
}
}
}
/* correct the convolution on the boundaries of the image
ttx, tty: true shape of the res_map (in case of using offsets)
*/
void rectify_conv( int patch_size, int nori, float_image* patches, int_image* offsets,
const int ttx, const int tty, int extend, float_layers* res, int n_thread ) {
const int n_patches = patches->ty;
assert( n_patches == res->tz );
//const int nori = patches->tx/pow2(patch_size);
assert( patches->tx >= nori*pow2(patch_size) );
const int tx = res->tx; // real true shape because it has been extended
const int ty = res->ty;
const int first_half = patch_size/2;
const int second_half = patch_size - first_half; // in case patch_size is odd
assert( offsets || (ttx==tx && tty==ty) );
assert( !offsets || (ttx>=tx && tty>=ty) );
assert( !offsets || (offsets->ty==res->tz && offsets->tx==2) );
const long npix = IMG_SIZE(res);
int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_patches; l++) {
// load offsets
const int offi = offsets ? offsets->pixels[2*l+0] : 0;
const int offj = offsets ? offsets->pixels[2*l+1] : 0;
float sums[8]; // temporary norm of columns or rows
assert( patch_size <= (int)(sizeof(sums)/sizeof(sums[0])) );
int o,i,j;
// horizontal boundaries
memset(sums,0,sizeof(sums));
float* p = patches->pixels + l*patches->tx;
for(o=0; o<nori; o++)
for(j=0; j<patch_size; j++)
for(i=0; i<patch_size; i++)
sums[j] += pow2(*p++);
float old_norm = sqrt(sum_array_f(sums,patch_size));
if( old_norm==0 ) continue;
// upper boundary
for(j=offj; j<first_half; j++) {
float new_norm = sqrt(sum_array_f(sums+(first_half-j),second_half+j)); // sums to patch_size
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (j-offj)*tx;
for(i=0; i<tx; i++) {
r[i] *= mul;
//assert(r[i]<1.1);
}
}
// lower boundary
for(j=tty-extend+1-second_half; j<offj+ty; j++) {
float new_norm = sqrt(sum_array_f(sums,first_half+tty-extend-j)); // sums to patch_size
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (j-offj)*tx;
for(i=0; i<tx; i++) {
r[i] *= mul;
//assert(r[i]<1.1);
}
}
// vertical boundaries
memset(sums,0,sizeof(sums));
p = patches->pixels + l*patches->tx;
for(o=0; o<nori; o++)
for(j=0; j<patch_size; j++)
for(i=0; i<patch_size; i++)
sums[i] += pow2(*p++);
// left boundary
for(i=offi; i<first_half; i++) {
float new_norm = sqrt(sum_array_f(sums+(first_half-i),second_half+i));
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (i-offi);
for(j=0; j<ty; j++) {
r[j*tx] *= mul;
//assert(r[j*tx]<1.1);
}
}
// right boundary
for(i=ttx-extend+1-second_half; i<offi+tx; i++) {
float new_norm = sqrt(sum_array_f(sums,first_half+ttx-extend-i));
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (i-offi);
for(j=0; j<ty; j++) {
r[j*tx] *= mul;
//assert(r[j*tx]<1.1);
}
}
// because we over-estimated the rectification for the corners, check that they do not exceed old_norm
float* r = res->pixels + l*npix;
for(j=offj; j<first_half; j++) {
for(i=offi; i<first_half; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
for(i=ttx-extend+1-second_half; i<offi+tx; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
}
for(j=tty-extend+1-second_half; j<offj+ty; j++) {
for(i=offi; i<first_half; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
for(i=ttx-extend+1-second_half; i<offi+tx; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
}
}
}
/* Compute the correlation of all patches with the second image (hog).
   If ngh_rad>0, the correlation is only computed in a small local neighborhood
   (whose size is parameterized by ngh_rad).
   if extend: width and height of output maps are extended
   if norm: correlations are normalized afterwards
*/
void fastconv( float_image* patches, float_layers* hog, int patch_size, int ngh_rad,
int extend, float norm, int nt, res_scale* res ) {
assert(0<=extend and extend<=1);
float_layers* convolved_hog = prepare_dotprod_convolution( hog, patch_size, extend, norm, nt );
assert( patches->tx==convolved_hog->tz);
res->true_shape[0] = convolved_hog->tx;
res->true_shape[1] = convolved_hog->ty;
//hash_layers(convolved_hog)
int_image* offsets = NULL;
if( ngh_rad == 0 ) { // no limit on translation
// allocate result
res->res_map = empty_layers(float, convolved_hog->tx, convolved_hog->ty, patches->ty);
assert(res->res_map.pixels || !"error: ran out of memory before sgemm");
// multi-threaded fast matrix product
_dotprod( patches, convolved_hog, &res->res_map, nt );
} else { // ngh_rad>0: cropping res_map
offsets = &res->offsets;
// transpose hog: _dotprod is much faster this way
float_cube convolved_hog_T = empty_cube(float, convolved_hog->tx, convolved_hog->ty, convolved_hog->tz);
{ float_image A = reshape_xy_z(float, convolved_hog); // cast to 2D matrix without copy
float_image B = reshape_z_xy(float, &convolved_hog_T);
transpose_matrix( &A, &B, nt);
}
//hash_cube(&convolved_hog_T)
// resized grid
int_cube fgrid = cube_like(int, &res->grid);
for(int i=0; i<CUBE_SIZE(&fgrid); i++)
fgrid.pixels[i] = res->grid.pixels[i]/res->f;
//hash_cube(&fgrid)
// multi-threaded fast matrix product
_dotprod_ngh_rad_T( &fgrid, patches, ngh_rad, &convolved_hog_T, &res->res_map, offsets, nt );
free(fgrid.pixels);
free(convolved_hog_T.pixels);
//hash_image(offsets)
}
free_layers(convolved_hog);
// correct border effects on the correlation maps
rectify_conv( patch_size, hog->tz, patches, offsets, res->true_shape[0], res->true_shape[1],
extend, &res->res_map, nt );
}
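/* Minimal usage sketch (illustration only; res_scale fields as used above):
   correlate all patches with a target HOG image, unrestricted translation. */
static inline void example_fastconv(float_image* patches, float_layers* hog, res_scale* res) {
    // patch_size=4, ngh_rad=0 (full maps), extend=1, L2 norm, 4 threads;
    // on return, res->res_map holds one correlation map per patch
    fastconv(patches, hog, 4, /*ngh_rad*/0, /*extend*/1, /*norm*/1.f, 4, res);
}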
/* Compute: arr **= p
*/
void fastipow( float_layers* arr, const float p, int n_thread ) {
const int n_layers = arr->tz;
const long npix = arr->tx*arr->ty;
int l;
// optimization: precompute some values of pow(x,p)
const int npc = 64;
float precom[npc+1];
for(l=0; l<=npc; l++) precom[l]= pow(l/(float)npc,p);
const float maxindex = npc - 0.001;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* a = arr->pixels + l*npix;
int i;
for(i=0; i<npix; i++) {
// arr[i] = pow(arr[i],p);
float v = a[i]*npc;
assert( v>=0 && v<npc+1 );
if(v>maxindex) v=maxindex;
int n = int(v);
float w = v-n;
a[i] = (1-w)*precom[n] + w*precom[n+1];
}
}
}
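/* Sketch (illustration only): values must lie in [0,1]; the table above
   linearly interpolates pow(x,p) from 64 precomputed samples. */
static inline void example_fastipow(void) {
    float_layers a = empty_layers(float, 2, 1, 1);
    a.pixels[0] = 0.25f; a.pixels[1] = 1.0f;
    fastipow(&a, 2.f, /*n_thread*/1);   // a ~= {0.0625, 1.0}
    free(a.pixels);
}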
/* Compute: arr = max(0,(arr-p)/(1-p))
*/
void fasthinge( float_layers* arr, const float p, int n_thread ) {
const int n_layers = arr->tz;
const long npix = arr->tx*arr->ty;
int l;
const float f = 1/(1-p);
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* a = arr->pixels + l*npix;
int i;
for(i=0; i<npix; i++) {
float v = a[i];
a[i] = MAX(0,f*(v-p));
}
}
}
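/* Sketch (illustration only): hinge at p=0.5 zeroes everything below the
   threshold and rescales the rest to [0,1]. */
static inline void example_fasthinge(void) {
    float_layers a = empty_layers(float, 3, 1, 1);
    a.pixels[0] = 0.2f; a.pixels[1] = 0.5f; a.pixels[2] = 1.0f;
    fasthinge(&a, 0.5f, /*n_thread*/1);  // a = {0, 0, 1}
    free(a.pixels);
}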
inline int max_array_i(const int* a, int n) {
int i=n;
int res = INT_MIN;
while(i--) if(a[i]>res) res=a[i];
return res;
}
/* Normalize weights in border areas of width <gap>.
   There are 9 areas: top-left, top-middle, top-right, ..., bottom-right.
   sum_divf holds the current weight of each area, i.e. values in an area
   should be divided by its weight. trans_inv controls the amount of
   normalization: 0 = no normalization, 1 = full normalization.
*/
static inline void normalize_trans(const int tx, const int ty, const int gap, float* rmap,
const float trans_inv, float sum_divf[9] ) {
if( trans_inv == 0 ) return;
int i,j;
for(i=0; i<9; i++) {
if( sum_divf[i]>0 )
sum_divf[i] = 1/pow(sum_divf[i], trans_inv); // pow is identity when trans_inv==1
}
for(j=0; j<gap; j++) {
if(sum_divf[0])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[0];
if(sum_divf[1])
for(i=gap; i<tx-gap; i++)
rmap[j*tx+i] *= sum_divf[1];
if(sum_divf[2])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[2];
}
for(; j<ty-gap; j++) {
if(sum_divf[3])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[3];
if(sum_divf[5])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[5];
}
for(; j<ty; j++) {
if(sum_divf[6])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[6];
if(sum_divf[7])
for(i=gap; i<tx-gap; i++)
rmap[j*tx+i] *= sum_divf[7];
if(sum_divf[8])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[8];
}
}
/* Compute the (sparse) convolutions specified by <children> on <map> and put the result in <res>.
A standard order is assumed on the children:
a response map #p is built from the children[p] at positions
[(gap*dx,gap*dy) for dy in dys for dx in dxs]
where dxs = [-1,1] or [-1,0,1]
dys = [-1,1] or [-1,0,1]
child_assign denotes the assignment of the children level, while assign is for the next level
child_norms contains the norms of the small patches; norms contains those of the big new cells
*/
int _sparse_conv( int_image* children, int_array* child_assign, int gap, float trans_inv,
float_layers* child_map, int_image* offsets, float_array* child_norms, float_array* norms,
int_array* assign, float_layers* res, int_image* res_offsets, int n_thread ) {
const int nconv = children->ty; // number of convolutions to perform
const int nc2 = children->tx;
const int nc = sqrt(nc2);
assert( nc*nc == nc2 );
assert( res->tz == nconv );
const int tx = child_map->tx;
const int ty = child_map->ty;
const long npix = tx*ty;
ASSERT_SAME_SIZE( child_map, res );
const int n_lower_conv = max_array_i(children->pixels,nconv*nc2)+1;
int* cass = child_assign ? child_assign->pixels : NEWA(int,n_lower_conv);
if(!child_assign) {for(int i=0; i<n_lower_conv; i++) cass[i]=i;}
assert( !offsets || (offsets->pixels && offsets->tx==2 && offsets->ty==n_lower_conv &&
res_offsets && res_offsets->tx==2 && res_offsets->ty==nconv) );
if(assign) {
assert(0); // not supposed to happen
} else {
// normal case: no redundancy to exploit in response maps
int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<nconv; l++) {
float *rmap = res->pixels + l*npix;
int u,v,c,ncall=0; // children number
const int* const child = children->pixels + l*nc2;
float sum_divf[9];
memset(sum_divf,0,sizeof(sum_divf));
int i,j;
// first, choose an offset for the result rmap from the child offsets
int offx=0, offy=0;
if( offsets ) {
int sum_ox=0, sum_oy=0, w=0;
for(c=v=0; v<nc; v++) {
int dy = (2*v/(nc-1)-1);
for(u=0; u<nc; u++,c++) {
int dx = (2*u/(nc-1)-1);
if(child[c]<0 || cass[child[c]]<0) continue;
sum_ox += offsets->pixels[2*child[c]+0] - dx*gap;
sum_oy += offsets->pixels[2*child[c]+1] - dy*gap;
w++;
}
}
if(w==0) w++; // just in case
offx = (int)floor(0.5 + sum_ox/float(w));
offy = (int)floor(0.5 + sum_oy/float(w));
// store result for later
res_offsets->pixels[2*l+0] = offx;
res_offsets->pixels[2*l+1] = offy;
}
for(c=v=0; v<nc; v++) {
int dy = (2*v/(nc-1)-1);
for(u=0; u<nc; u++,c++) {
int dx = (2*u/(nc-1)-1);
if(child[c]<0 || cass[child[c]]<0) continue;
float divf = child_norms->pixels[child[c]]/norms->pixels[l];
// difference with rmap's offset
const int trans_x = dx*gap + (offsets? offx - offsets->pixels[2*child[c]+0] : 0);
const int trans_y = dy*gap + (offsets? offy - offsets->pixels[2*child[c]+1] : 0);
// count the sum of weights in every image area
for(i=-1; i<=1; i++)for(j=-1; j<=1; j++)
if(i*trans_x<=0 && j*trans_y<=0)
sum_divf[4+j*3+i] += divf;
// add a translated version of map[children[c]] by (ox-dx,oy-dy)
if(ncall++==0) // first call
fast_set_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf, trans_x,trans_y, tx,ty, 0, 0 );
else
fast_add_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf, trans_x,trans_y, tx,ty, 0, 0 );
}
}
if( ncall == 0) // default = zeros
memset(rmap, 0, npix*sizeof(float));
// now we are supposed to rectify the boundaries (to perfect convolution)
normalize_trans(tx, ty, gap, rmap, trans_inv, sum_divf );
//assert(min_array_f(rmap,npix)>=0 && max_array_f(rmap,npix)<=1.001);
}
}
if(!child_assign) free(cass);
#define CHECK_MAPS(rmaps) assert(min_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))>=0 && \
max_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))<=1.001)
//CHECK_MAPS(res);
return nconv;
}