conv.cpp

/*
Copyright (C) 2014 Jerome Revaud
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
*/
#include "conv.h"
#include "std.h"
#include "omp.h"
#include "maxfilter.h"
extern "C" {
#include <immintrin.h>
#define integer int
#define real float
extern int saxpy_(integer *n, real *sa, real *sx, integer *incx, real *sy, integer *incy);
extern int sscal_(integer *n, real *sa, real *sx, integer *incx);
}
static inline void fast_set_val( float * __restrict__ a, long d, const float val) {
if(val) {
int j;
for(j=0; j<d; j++)
a[j] = val;
} else
memset(a,0,d*sizeof(float));
}
static inline void fast_add_val( float * __restrict__ a, long d, const float val) {
int j;
for(j=0; j<d; j++)
a[j] += val;
}
static inline void fast_set_vec( float * __restrict__ dest,
const float * __restrict__ src, int d, const float mul) {
if( mul==1)
memcpy(dest,src,d*sizeof(float));
else {
int j;
for(j=0; j<d; j++)
dest[j] = mul*src[j];
}
}
static inline void fast_add_vec( float * __restrict__ dest,
const float * __restrict__ add, int d, float mul) {
if(d<=4) {
int j;
for(j=0; j<d; j++)
dest[j] += mul*add[j];
} else {
int inc = 1;
saxpy_( &d, &mul, (float*)add, &inc, (float*)dest, &inc );
}
}
static inline void fast_div( float * __restrict__ a, long d, const float div) {
const float divi = 1/div;
// assert( ((long)a & 15) == 0 && (d & 3) == 0 );
// const float _divi4[] = {divi,divi,divi,divi};
// __v4sf *a4 = (__v4sf*)a;
// __v4sf *divi4 = (__v4sf*)_divi4;
// int e = d>>2;
// while(e--) *a4++ *= (*divi4);
int j;
for(j=0; j<d; j++)
a[j] *= divi;
}
static inline float* fast_set_trans( float * dest, const float * src, const float mul,
int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
if(mul==0) {
memset(dest,0,sizeof(float)*(tx+ex)*(ty+ex));
return dest+(tx+ex)*(ty+ex);
}
if(dx>tx) dx=tx; // beyond these values, nothing happens anyway
if(dy>ty) dy=ty;
if(-dx>tx) dx=-tx;
if(-dy>ty) dy=-ty;
#define add_default(n) {fast_set_val(dest,(n),mul*def); dest+=(n);}
float* _dest = dest;
// paste default rows on the -v side
if(dy<0) add_default(-dy*(tx+ex));
src += MAX(0,dx);
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
int j;
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
// paste default cols on the -u side
if(dx<0) add_default(-dx);
// paste image
fast_set_vec(dest,src+j*tx,row_len,mul);
dest += row_len;
// paste default cols on the +u side
if(dx>=0) {
add_default(dx)
if(ex) add_default(ex)
}
}
// paste default rows on the +v side
if(dy>=0) {
add_default(dy*(tx+ex))
if(ex) add_default(ex*(tx+ex))
}
#undef add_default
assert( dest-_dest == (tx+ex)*(ty+ex) );
return dest;
}
static inline float* fast_add_trans( float * dest, const float * src, const float mul,
int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
if(mul==0) return dest+(tx+ex)*(ty+ex);
if(dx>tx) dx=tx; // beyond these values, nothing happens anyway
if(dy>ty) dy=ty;
if(-dx>tx) dx=-tx;
if(-dy>ty) dy=-ty;
#define add_default(n) {fast_add_val(dest,n,def*mul); dest+=n;}
float* _dest = dest;
// paste default rows on the -v side
if(dy<0) add_default(-dy*(tx+ex));
src += MAX(0,dx);
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
int j;
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
// paste default cols on the -u side
if(dx<0) add_default(-dx);
// paste image
fast_add_vec(dest,src+j*tx,row_len,mul);
dest += row_len;
// paste default cols on the +u side
if(dx>=0) {
add_default(dx)
if(ex) add_default(ex)
}
}
// paste default rows on the +v side
if(dy>=0) {
add_default(dy*(tx+ex))
if(ex) add_default(ex*(tx+ex))
}
#undef add_default
assert( dest-_dest == (tx+ex)*(ty+ex) );
return dest;
}
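/* Illustration of the two translation-paste helpers above:
dest(x,y) = mul * src(x+dx, y+dy), written into a (tx+ex) x (ty+ex) buffer,
with mul*def wherever (x+dx, y+dy) falls outside the tx x ty source.
For instance, with tx=ty=3, ex=0, dx=1, dy=0, mul=1 and def=0, the source rows
(a b c / d e f / g h i) become (b c 0 / e f 0 / h i 0): the source is sampled
with an offset of (+1,0) and the uncovered last column is filled with def.
*/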
static inline void norm_norm( float* norms, int nb, float mode ) {
int i;
if( mode < 0 )
assert(!"error: unknown norm mode");
else if( mode == 0.5 ) {
for(i=0; i<nb; i++)
norms[i] = sqrt(sqrt(norms[i]));
} else if( mode < 1 ) {
mode *= 0.5; // norms[] holds squared sums, so halve the exponent to get (L2-norm)**mode
for(i=0; i<nb; i++)
norms[i] = pow(norms[i], mode);
} else if( mode == 1 ) {
for(i=0; i<nb; i++)
norms[i] = sqrt(norms[i]);
} else if( mode > 1 )
assert(!"error: unknown norm mode");
}
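/* Note: norm_norm is called here on accumulated squared sums s = sum(x*x),
so the value it leaves in norms[] is a power of the L2 norm:
mode == 1    -> sqrt(s)        = ||x||        (plain L2 normalization)
mode == 0.5  -> sqrt(sqrt(s))  = ||x||^0.5
0 < mode < 1 -> pow(s, mode/2) = ||x||^mode
which matches the "(L2-norm)**<norm>" convention documented below.
*/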
/* normalize each pixel of a multi-layer image
norm = {0:nothing, 1:L2-normalization, 0-1: normalization by (L2-norm)**<norm> }
*/
void norm_layers( float_layers* res, float norm, int n_thread ) {
if(norm==0) return;
const int layer_size = res->tx*res->ty;
const int n_layers = res->tz;
float* norms = NEWAC(float,layer_size);
long l;
for(l=0; l<n_layers; l++) {
float* r = res->pixels + l*layer_size;
int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(i=0; i<layer_size; i++)
norms[i] += r[i]*r[i];
}
norm_norm( norms, layer_size, norm );
for(l=0; l<n_layers; l++) {
float* r = res->pixels + l*layer_size;
int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(i=0; i<layer_size; i++)
r[i] /= norms[i]+1e-8;
}
free(norms);
}
/* Return the vectorized dimension of a HOG patch
*/
int get_patch_desc_dim( float_layers* hog, int patch_size )
{
return patch_size*patch_size * hog->tz; // number of dimensions of an atomic patch descriptor
}
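/* For instance, with patch_size = 4 and a HOG image of hog->tz = 8 orientation
channels, an atomic patch descriptor has 4*4*8 = 128 dimensions. */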
/* Sample a set of patches from a HOG image.
grid : array of (x,y) position of the patches
size: size of the patches, i.e. [x,x+size[ x [y,y+size[
res: result array, n_patches x desc_dim
desc_dim = n_layers * size**2
norms: result, n_patches x 1, norm of each patch
*/
void _sample_patches( float_layers* hog, float_layers* color, int_image* grid, int size, float norm,
float_image* res, float_array* norms, int n_thread ) {
const int tx = hog->tx;
const long npix = tx*hog->ty;
assert( grid->tx == 2 );
const int n_patches = grid->ty;
assert( res->ty == n_patches );
const int n_layers = hog->tz;
const int n_colors = (color? color->tz: 0);
const int color_npix = (color? color->tx*color->ty: 0);
const int desc_size = size*size*n_layers + (color? color->tz: 0);
assert(res->tx == desc_size );
int n;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int *p = grid->pixels + 2*n;
// copy hog
int x=p[0],y=p[1];
assert(0<=x && x+size<=tx);
assert(0<=y && y+size<=hog->ty);
int l,j;
for(l=0; l<n_layers; l++) {
float* h = hog->pixels + l*npix + y*tx + x;
for(j=0; j<size; j++) {
memcpy(r, h, size*sizeof(float));
h += tx;
r += size;
}
}
if(!color) continue;
// copy color at the patch center (row stride is color->tx)
float* c = color->pixels + (y+size/2)*color->tx + (x+size/2);
for(l=0; l<n_colors; l++)
*r++ = c[l*color_npix];
}
if(norm) {
float* normp = norms ? norms->pixels : NEWAC(float, n_patches);
if(norms) {
assert(norms->tx==n_patches);
memset(normp,0,n_patches*sizeof(float));
}
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int l;
for(l=0; l<desc_size; l++)
normp[n] += r[l]*r[l];
}
norm_norm( normp, n_patches, norm );
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int l;
float nn = normp[n]+1e-8;
for(l=0; l<desc_size; l++)
r[l] /= nn;
}
if(!norms) free(normp);
}
}
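/* Layout of each descriptor row produced above: for every HOG layer in turn,
the size x size window is copied row by row, then (if a color image is given)
the n_colors values at the patch center are appended; when norm != 0, the row
is finally divided by a power of its L2 norm (see norm_norm). */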
static inline int retrieve_children( const int x, const int y, const int_cube* child_grid ) {
const int size0_div2 = child_grid->pixels[0];
const int step0 = child_grid->tx==1 && child_grid->ty==1 ? 1 :
MAX( child_grid->pixels[2]-child_grid->pixels[0],
child_grid->pixels[1+2*child_grid->tx]-child_grid->pixels[1] );
int i = (x-size0_div2)/step0;
int j = (y-size0_div2)/step0;
assert( x==(i*step0+size0_div2) || !"error: child_grid does not match current grid" );
assert( y==(j*step0+size0_div2) || !"error: child_grid does not match current grid" );
if( i<0 || i>=child_grid->tx ) return -1;
if( j<0 || j>=child_grid->ty ) return -1;
return i+j*child_grid->tx;
}
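/* Example: if the child grid was generated with offset size0/2 = 4 and step
step0 = 8, the child at position x = 20 has index i = (20-4)/8 = 2; coordinates
that are not exactly of the form offset + i*step trip the asserts above, and
coordinates outside the child grid return -1. */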
/* Prepare a grid of cell positions in the first image for a given scale. Big cells are built from the cells at the previous scale.
size = size of cells at current scale
offset, step = grid generator: (offset + i*step, offset + j*step)
child_grid = grid of the previous layer (or None if first layer)
child_norms = image containing the norms of the patches at the previous level
grid = resulting center positions of cells at the current scale
children = indices of the cells at the previous scale used to construct the big cells
norms = norms of the cells of this level
*/
void _prepare_big_cells( int size, int offset, int step,
int_cube* child_grid, float_image* child_norms,
int_cube* grid, int_cube* children, float_image* norms ) {
assert(grid->tz==2);
const int ntx = grid->tx; // should be == 1+(tx-size)/step so that patches do not pass the border
const int nty = grid->ty; // should be == 1+(ty-size)/step so that patches do not pass the border
/* grid[i,j] = ( offset + i*step, offset + j*step )
connection between two scales:
x cell position in lower scale == x position of children in upper scale
child_offset + child_i*child_step = offset + i*step + (2*u/(nc-1)-1)*size/4
*/
int i,j,u,v;
int* r = grid->pixels;
if( !child_grid ) {
// this is the first scale:
// we just return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
for(j=0; j<nty; j++)
for(i=0; i<ntx; i++) {
*r++ = offset + i*step;
*r++ = offset + j*step;
}
} else {
assert(child_grid->tz==2);
ASSERT_SAME_SIZE( child_grid, child_norms );
assert( children );
const int nc = sqrt(children->tz); // number of children per row or col
assert( children->tz==pow2(nc) );
ASSERT_SAME_SIZE( grid, children );
ASSERT_SAME_SIZE( grid, norms );
// this is at least second scale
// we return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
const int quarter = size/4;
assert(4*quarter==size);
int* c = children->pixels;
float *n = norms->pixels;
memset(n,0,ntx*nty*sizeof(float));
for(j=0; j<nty; j++)
for(i=0; i<ntx; i++) {
int x = offset + i*step;
int y = offset + j*step;
*r++ = x;
*r++ = y;
// accumulate norms of the 2x2 or 3x3 children
for(v=0; v<nc; v++)
for(u=0; u<nc; u++,c++) {
// we want to index the children at position:
// ( center_x + (2*u/(nc-1)-1)*size/4, center_y + (2*v/(nc-1)-1)*size/4 )
*c = retrieve_children( x+(2*u/(nc-1)-1)*quarter, y+(2*v/(nc-1)-1)*quarter, child_grid );
if(*c>=0) *n += child_norms->pixels[*c];
}
n++;
}
}
}
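/* Example: with nc = 2 children per side and a cell of size 16 (quarter = 4),
2*u/(nc-1)-1 takes the values -1 and +1, so the four children of a cell
centered at (x,y) are looked up at (x-4,y-4), (x+4,y-4), (x-4,y+4) and
(x+4,y+4); the norm of the new cell is the sum of the norms of the children
that actually exist (index >= 0). */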
/* Prepare image for dotprod : dot(patches, res)
where patches is n_patches x patch_dim
pixels outside the image are set to (0,...,ninth_val)
*/
void _prepare_dotprod_convolution( float_layers* img, int patch_size, float ninth_val, int extend,
float_layers* res, int n_thread ) {
assert( img->tx+extend == res->tx );
assert( img->ty+extend == res->ty );
const int n_layers = img->tz;
const int tx = img->tx;
const int ty = img->ty;
const int npix = tx*ty;
const int npixex = (tx+extend)*(ty+extend);
assert( res->tz==patch_size*patch_size*img->tz );
long l;
const int first_half = patch_size/2; // half-size
const int second_half = patch_size - first_half;
const int layer_size = patch_size*patch_size*npixex;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* img_pix = img->pixels + l*npix;
float* r = res->pixels + l*layer_size;
int u,v;
// copy translated version of the image into res
for(v=-first_half; v<second_half; v++)
for(u=-first_half; u<second_half; u++)
r = fast_set_trans( r, img_pix, 1, u, v, tx, ty, extend, l+1<n_layers? 0 : ninth_val );
}
}
float_layers* prepare_dotprod_convolution( float_layers* hog, int patch_size, int extend, float norm, int nt )
{
assert(0<=extend and extend<=1);
const int nh = get_patch_desc_dim(hog,patch_size);
const int etx = hog->tx+extend; // extend a bit the image
const int ety = hog->ty+extend;
float_layers* res = NEW(float_layers);
*res = empty_layers(float,etx,ety,nh);
float ninth_val = 0;
_prepare_dotprod_convolution( hog, patch_size, ninth_val, extend, res, nt );
if( norm ) norm_layers( res, norm, nt );
return res;
}
inline float sum_array_f(const float* a, int n) {
int i=n;
double res = 0;
while(i--) res+=a[i];
return (float)res;
}
extern "C" {
int sgemm_(char *transa, char *transb, integer *m, integer *
n, integer *k, float *alpha, float *a, integer *lda, float *b, integer *
ldb, float *beta, float *c, integer *ldc);
}
/* matrix-matrix multiplication with several SGEMM (each is single-threaded)
res = dot(patches, convolved_hog)
shapes: (P x npix) = (P x nh) . (nh x npix)
*/
void _dotprod( float_image* patches, float_layers* convolved_hog, float_layers* res, int n_thread ) {
int nh = patches->tx;
assert( nh == convolved_hog->tz );
ASSERT_SAME_IMG_SIZE( convolved_hog, res );
int P = patches->ty;
assert( res->tz == P );
int threadP = 1 + (P-1) / n_thread; // how many patches per thread
int npix = (int)IMG_SIZE(convolved_hog);
int l;
#if (defined(USE_OPENMP) && !defined(MULTITHREADED_BLAS))
#pragma omp parallel for num_threads(n_thread)
#else
n_thread = 1; // BLAS is already multithreaded
threadP = P;
#endif
for(l=0; l<n_thread; l++) {
// we do dotprod( patches[l*threadP : (l+1)*threadP], convolved_hog )
long start = l*threadP;
long end = MIN(P,(l+1)*threadP);
int np = int(end - start);
float* p = patches->pixels + nh*start;
float* r = res->pixels + npix*start;
// blas fast matrix-matrix product
char T='n'; float alpha = 1, beta = 0;
sgemm_( &T, &T, &npix, &np, &nh, &alpha,
convolved_hog->pixels, &npix,
p, &nh, &beta, r, &npix);
}
}
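/* Note on the sgemm_ call above: the arrays are stored row-major while BLAS is
column-major, so the row-major product res[np x npix] = patches[np x nh] *
convolved_hog[nh x npix] is obtained as the column-major product
res^T = convolved_hog^T * patches^T, i.e. sgemm('n','n', m=npix, n=np, k=nh)
with convolved_hog as A (lda=npix), the patch block as B (ldb=nh) and the
result block as C (ldc=npix). */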
inline void transpose_scalar_block(const float *A, float *B, const int lda, const int ldb,
const int block_row, const int block_col) {
for(int i=0; i<block_row; i++)
for(int j=0; j<block_col; j++)
B[j*ldb + i] = A[i*lda +j];
}
// Transpose A (N rows by M cols) into B (M by N)
void transpose_matrix(const float_image* A, float_image* B, int nt) {
const int n = A->ty, m = A->tx;
assert( n==B->tx && m==B->ty );
const int block_size = 16;
const float* pA = A->pixels;
float* pB = B->pixels;
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(nt)
#endif
for(int i=0; i<n; i+=block_size)
for(int j=0; j<m; j+=block_size)
transpose_scalar_block(&pA[i*m +j], &pB[j*n + i], m, n, MIN(block_size, n-i), MIN(block_size, m-j));
}
extern "C" {
int sgemv_(char *transa, integer *m, integer * n,
float *alpha, float *a, integer *lda,
float *b, integer * ldb, float *beta,
float *c, integer * ldc);
}
/* convolution of each patch within a local neighborhood
ngh_rad = max translation
neighborhood has size 2*ngh_rad
patch at (x,y) is compared to patches in [y-ngh_rad : y+ngh_rad,
x-ngh_rad : x+ngh_rad]
*/
void _dotprod_ngh_rad_T( int_cube* grid, float_image* patches, int ngh_rad,
float_cube* convolved_hog, float_layers* res_out,
int_image* offsets, int n_thread ) {
int nh = patches->tx;
assert( nh == convolved_hog->tz );
const int P = patches->ty;
assert( IMG_SIZE(grid)==P && grid->tz==2 );
const int tx = convolved_hog->tx;
const int ty = convolved_hog->ty;
// neighborhood size
int res_tx = MIN(tx,2*ngh_rad);
int res_ty = MIN(ty,2*ngh_rad);
assert(res_tx<tx-1 || res_ty<ty-1 || !"ngh_rad is too large and results in loss of perf. Set ngh_rad=0 instead.");
int res_npix = res_tx * res_ty;
// allocate result
*res_out = empty_layers(float, res_tx, res_ty, P);
assert(res_out->pixels || !"error: ran out of memory before sgemm");
*offsets = empty_image(int, 2, P);
char T='t'; float alpha=1, beta=0; int one=1;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(int j=0; j<res_ty; ++j) {
// By organizing loops this way,
// we exploit overlap between patches.
for(int l=0; l<P; l++) {
float* p = patches->pixels + l*nh;
float* r = res_out->pixels + l*res_npix;
int left = MAX(0, MIN(grid->pixels[2*l+0] - ngh_rad, tx-2*ngh_rad));
int top = MAX(0, MIN(grid->pixels[2*l+1] - ngh_rad, ty-2*ngh_rad));
if(j==0) {
offsets->pixels[2*l+0] = left;
offsets->pixels[2*l+1] = top;
}
float* c = convolved_hog->pixels + (left + top*tx)*nh;
// blas fast matrix-vector product
sgemv_( &T, &nh, &res_tx, &alpha, c + j*tx*nh, &nh,
p, &one, &beta, r + j*res_tx, &one);
}
}
}
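/* Note on the loops above: for each patch l, the sgemv_ call computes one row j
of its res_tx x res_ty correlation map by multiplying the nh-dimensional patch
descriptor with the res_tx pixel descriptors stored contiguously (stride nh) in
the transposed convolved HOG, starting at the per-patch window origin
(left, top); iterating over j in the outer loop lets neighboring patches reuse
the same rows of convolved_hog. */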
/* correct the convolution on the boundaries of the image
ttx, tty: true shape of the res_map (when offsets are used)
*/
void rectify_conv( int patch_size, int nori, float_image* patches, int_image* offsets,
const int ttx, const int tty, int extend, float_layers* res, int n_thread ) {
const int n_patches = patches->ty;
assert( n_patches == res->tz );
//const int nori = patches->tx/pow2(patch_size);
assert( patches->tx >= nori*pow2(patch_size) );
const int tx = res->tx; // real true shape because it has been extended
const int ty = res->ty;
const int first_half = patch_size/2;
const int second_half = patch_size - first_half; // in case patch_size is odd
assert( offsets || (ttx==tx && tty==ty) );
assert( !offsets || (ttx>=tx && tty>=ty) );
assert( !offsets || (offsets->ty==res->tz && offsets->tx==2) );
const long npix = IMG_SIZE(res);
int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_patches; l++) {
// load offsets
const int offi = offsets ? offsets->pixels[2*l+0] : 0;
const int offj = offsets ? offsets->pixels[2*l+1] : 0;
float sums[8]; // temporary norm of columns or rows
assert( patch_size <= (int)(sizeof(sums)/sizeof(sums[0])) );
int o,i,j;
// horizontal boundaries
memset(sums,0,sizeof(sums));
float* p = patches->pixels + l*patches->tx;
for(o=0; o<nori; o++)
for(j=0; j<patch_size; j++)
for(i=0; i<patch_size; i++)
sums[j] += pow2(*p++);
float old_norm = sqrt(sum_array_f(sums,patch_size));
if( old_norm==0 ) continue;
// upper boundary
for(j=offj; j<first_half; j++) {
float new_norm = sqrt(sum_array_f(sums+(first_half-j),second_half+j)); // sums to patch_size
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (j-offj)*tx;
for(i=0; i<tx; i++) {
r[i] *= mul;
//assert(r[i]<1.1);
}
}
// lower boundary
for(j=tty-extend+1-second_half; j<offj+ty; j++) {
float new_norm = sqrt(sum_array_f(sums,first_half+tty-extend-j)); // sums to patch_size
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (j-offj)*tx;
for(i=0; i<tx; i++) {
r[i] *= mul;
//assert(r[i]<1.1);
}
}
// vertical boundaries
memset(sums,0,sizeof(sums));
p = patches->pixels + l*patches->tx;
for(o=0; o<nori; o++)
for(j=0; j<patch_size; j++)
for(i=0; i<patch_size; i++)
sums[i] += pow2(*p++);
// left boundary
for(i=offi; i<first_half; i++) {
float new_norm = sqrt(sum_array_f(sums+(first_half-i),second_half+i));
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (i-offi);
for(j=0; j<ty; j++) {
r[j*tx] *= mul;
//assert(r[j*tx]<1.1);
}
}
// right boundary
for(i=ttx-extend+1-second_half; i<offi+tx; i++) {
float new_norm = sqrt(sum_array_f(sums,first_half+ttx-extend-i));
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (i-offi);
for(j=0; j<ty; j++) {
r[j*tx] *= mul;
//assert(r[j*tx]<1.1);
}
}
// because we over-estimated the rectification for the corners, check that they do not exceed old_norm
float* r = res->pixels + l*npix;
for(j=offj; j<first_half; j++) {
for(i=offi; i<first_half; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
for(i=ttx-extend+1-second_half; i<offi+tx; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
}
for(j=tty-extend+1-second_half; j<offj+ty; j++) {
for(i=offi; i<first_half; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
for(i=ttx-extend+1-second_half; i<offi+tx; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
}
}
}
/* Compute the correlation of all patches with the second image (hog).
In case of ngh_rad>0, the correlation is only computed in a small local neighborhood
(whose size is parameterized by ngh_rad).
if extend: width and height of output maps are extended
if norm: correlations are normalized afterwards.
*/
void fastconv( float_image* patches, float_layers* hog, int patch_size, int ngh_rad,
int extend, float norm, int nt, res_scale* res ) {
assert(0<=extend and extend<=1);
float_layers* convolved_hog = prepare_dotprod_convolution( hog, patch_size, extend, norm, nt );
assert( patches->tx==convolved_hog->tz);
res->true_shape[0] = convolved_hog->tx;
res->true_shape[1] = convolved_hog->ty;
//hash_layers(convolved_hog)
int_image* offsets = NULL;
if( ngh_rad == 0 ) { // no limit on translation
// allocate result
res->res_map = empty_layers(float, convolved_hog->tx, convolved_hog->ty, patches->ty);
assert(res->res_map.pixels || !"error: ran out of memory before sgemm");
// multi-threaded fast matrix product
_dotprod( patches, convolved_hog, &res->res_map, nt );
} else { // ngh_rad>0: cropping res_map
offsets = &res->offsets;
// transpose hog: _dotprod is much faster this way
float_cube convolved_hog_T = empty_cube(float, convolved_hog->tx, convolved_hog->ty, convolved_hog->tz);
{ float_image A = reshape_xy_z(float, convolved_hog); // cast to 2D matrix without copy
float_image B = reshape_z_xy(float, &convolved_hog_T);
transpose_matrix( &A, &B, nt);
}
//hash_cube(&convolved_hog_T)
// resized grid
int_cube fgrid = cube_like(int, &res->grid);
for(int i=0; i<CUBE_SIZE(&fgrid); i++)
fgrid.pixels[i] = res->grid.pixels[i]/res->f;
//hash_cube(&fgrid)
// multi-threaded fast matrix product
_dotprod_ngh_rad_T( &fgrid, patches, ngh_rad, &convolved_hog_T, &res->res_map, offsets, nt );
free(fgrid.pixels);
free(convolved_hog_T.pixels);
//hash_image(offsets)
}
free_layers(convolved_hog);
// correct border effects on the correlation maps
rectify_conv( patch_size, hog->tz, patches, offsets, res->true_shape[0], res->true_shape[1],
extend, &res->res_map, nt );
}
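/* Summary of fastconv: 1) prepare_dotprod_convolution stacks translated copies
of the HOG channels (one per position inside a patch); 2) every patch is
correlated with that stack, either over the whole image (_dotprod) or only
inside a 2*ngh_rad window around its own position (_dotprod_ngh_rad_T, which
also records the per-patch offsets); 3) rectify_conv renormalizes the borders
of each correlation map. */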
/* Compute: arr **= p
*/
void fastipow( float_layers* arr, const float p, int n_thread ) {
const int n_layers = arr->tz;
const long npix = arr->tx*arr->ty;
int l;
// optimization: precompute some values of pow(x,p)
const int npc = 64;
float precom[npc+1];
for(l=0; l<=npc; l++) precom[l]= pow(l/(float)npc,p);
const float maxindex = npc - 0.001;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* a = arr->pixels + l*npix;
int i;
for(i=0; i<npix; i++) {
// arr[i] = pow(arr[i],p);
float v = a[i]*npc;
assert( v>=0 && v<npc+1 );
if(v>maxindex) v=maxindex;
int n = int(v);
float w = v-n;
a[i] = (1-w)*precom[n] + w*precom[n+1];
}
}
}
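/* Worked example of the interpolation above, assuming inputs in [0,1]: with
npc = 64 and a[i] = 0.5, v = 32.0, so n = 32, w = 0 and the result is exactly
precom[32] = pow(0.5, p); for a[i] = 0.51, v = 32.64 and the result is
0.36*precom[32] + 0.64*precom[33], a linear blend of the two nearest table
entries. */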
/* Compute: arr = max(0,(arr-p)/(1-p))
*/
void fasthinge( float_layers* arr, const float p, int n_thread ) {
const int n_layers = arr->tz;
const long npix = arr->tx*arr->ty;
int l;
const float f = 1/(1-p);
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* a = arr->pixels + l*npix;
int i;
for(i=0; i<npix; i++) {
float v = a[i];
a[i] = MAX(0,f*(v-p));
}
}
}
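/* Worked example: with p = 0.25, a value of 0.25 maps to 0, 0.5 maps to
(0.5-0.25)/(1-0.25) = 1/3, and 1.0 maps to 1; everything below the threshold p
is clipped to 0. */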
inline int max_array_i(const int* a, int n) {
int i=n;
int res = INT_MIN;
while(i--) if(a[i]>res) res=a[i];
return res;
}
/* Normalize weights in border areas of width <gap>.
There are 9 areas: top-left, top-middle, top-right, ..., bottom-right.
sum_divf indicates the current weight in those areas, i.e. values in the area
should be divided by the weight. But trans_inv allows controlling the amount of
normalization: 0 = no normalization, 1 = full normalization
*/
static inline void normalize_trans(const int tx, const int ty, const int gap, float* rmap,
const float trans_inv, float sum_divf[9] ) {
if( trans_inv == 0 ) return;
int i,j;
for(i=0; i<9; i++) {
if( sum_divf[i]>0 )
sum_divf[i] = 1/pow(sum_divf[i], trans_inv); // if trans_inv==1, no effect
}
for(j=0; j<gap; j++) {
if(sum_divf[0])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[0];
if(sum_divf[1])
for(i=gap; i<tx-gap; i++)
rmap[j*tx+i] *= sum_divf[1];
if(sum_divf[2])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[2];
}
for(; j<ty-gap; j++) {
if(sum_divf[3])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[3];
if(sum_divf[5])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[5];
}
for(; j<ty; j++) {
if(sum_divf[6])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[6];
if(sum_divf[7])
for(i=gap; i<tx-gap; i++)
rmap[j*tx+i] *= sum_divf[7];
if(sum_divf[8])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[8];
}
}
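/* Note: the nine entries of sum_divf correspond to a 3x3 partition of the map
into corner, edge and center areas of width <gap>, indexed as 4 + 3*j + i with
i,j in {-1,0,1} (0 = top-left, 4 = center, 8 = bottom-right); each area is
multiplied by pow(sum_divf[k], -trans_inv), i.e. fully divided by its
accumulated weight when trans_inv == 1. */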
/* Compute the (sparse) convolutions specified by <children> on <map> and put the result in <res>.
A standard order is assumed on the children:
a response map #p is built from the children[p] at positions
[(gap*dx,gap*dy) for dy in dys for dx in dxs]
where dxs = [-1,1] or [-1,0,1]
dys = [-1,1] or [-1,0,1]
child_assign denotes the assignment at the children level, while assign is for the next level
child_norms contains the norms of the small patches, and norms contains the norms of the new big cells
*/
int _sparse_conv( int_image* children, int_array* child_assign, int gap, float trans_inv,
float_layers* child_map, int_image* offsets, float_array* child_norms, float_array* norms,
int_array* assign, float_layers* res, int_image* res_offsets, int n_thread ) {
const int nconv = children->ty; // number of convolutions to perform
const int nc2 = children->tx;
const int nc = sqrt(nc2);
assert( nc*nc == nc2 );
assert( res->tz == nconv );
const int tx = child_map->tx;
const int ty = child_map->ty;
const long npix = tx*ty;
ASSERT_SAME_SIZE( child_map, res );
const int n_lower_conv = max_array_i(children->pixels,nconv*nc2)+1;
int* cass = child_assign ? child_assign->pixels : NEWA(int,n_lower_conv);
if(!child_assign) {for(int i=0; i<n_lower_conv; i++) cass[i]=i;}
assert( !offsets || (offsets->pixels && offsets->tx==2 && offsets->ty==n_lower_conv &&
res_offsets && res_offsets->tx==2 && res_offsets->ty==nconv) );
if(assign) {
assert(0); // not supposed to happen
} else {
// normal case: no redundancy to exploit in response maps
int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<nconv; l++) {
float *rmap = res->pixels + l*npix;
int u,v,c,ncall=0; // children number
const int* const child = children->pixels + l*nc2;
float sum_divf[9];
memset(sum_divf,0,sizeof(sum_divf));
int i,j;
// first, choose an offset for the result rmap from the child offsets
int offx=0, offy=0;
if( offsets ) {
int sum_ox=0, sum_oy=0, w=0;
for(c=v=0; v<nc; v++) {
int dy = (2*v/(nc-1)-1);
for(u=0; u<nc; u++,c++) {
int dx = (2*u/(nc-1)-1);
if(child[c]<0 || cass[child[c]]<0) continue;
sum_ox += offsets->pixels[2*child[c]+0] - dx*gap;
sum_oy += offsets->pixels[2*child[c]+1] - dy*gap;
w++;
}
}
if(w==0) w++; // just in case
offx = (int)floor(0.5 + sum_ox/float(w));
offy = (int)floor(0.5 + sum_oy/float(w));
// store result for later
res_offsets->pixels[2*l+0] = offx;
res_offsets->pixels[2*l+1] = offy;
}
for(c=v=0; v<nc; v++) {
int dy = (2*v/(nc-1)-1);
for(u=0; u<nc; u++,c++) {
int dx = (2*u/(nc-1)-1);
if(child[c]<0 || cass[child[c]]<0) continue;
float divf = child_norms->pixels[child[c]]/norms->pixels[l];
// difference with rmap's offset
const int trans_x = dx*gap + (offsets? offx - offsets->pixels[2*child[c]+0] : 0);
const int trans_y = dy*gap + (offsets? offy - offsets->pixels[2*child[c]+1] : 0);
// count the sum of weights in every image area
for(i=-1; i<=1; i++)for(j=-1; j<=1; j++)
if(i*trans_x<=0 && j*trans_y<=0)
sum_divf[4+j*3+i] += divf;
// add a translated version of map[children[c]] by (ox-dx,oy-dy)
if(ncall++==0) // first call
fast_set_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf, trans_x,trans_y, tx,ty, 0, 0 );
else
fast_add_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf, trans_x,trans_y, tx,ty, 0, 0 );
}
}
if( ncall == 0) // default = zeros
memset(rmap, 0, npix*sizeof(float));
// now we are supposed to rectify the boundaries (to perfect convolution)
normalize_trans(tx, ty, gap, rmap, trans_inv, sum_divf );
//assert(min_array_f(rmap,npix)>=0 && max_array_f(rmap,npix)<=1.001);
}
}
if(!child_assign) free(cass);
#define CHECK_MAPS(rmaps) assert(min_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))>=0 && \
max_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))<=1.001)
//CHECK_MAPS(res);
return nconv;
}
