conv.cpp

/*
Copyright (C) 2014 Jerome Revaud
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
*/
#include "conv.h"
#include "std.h"
#include "omp.h"
#include "maxfilter.h"
extern "C" {
#include <immintrin.h>
#define integer int
#define real float
extern int saxpy_(integer *n, real *sa, real *sx, integer *incx, real *sy, integer *incy);
extern int sscal_(integer *n, real *sa, real *sx, integer *incx);
}
static inline void fast_set_val( float * __restrict__ a, long d, const float val) {
if(val) {
int j;
for(j=0; j<d; j++)
a[j] = val;
} else
memset(a,0,d*sizeof(float));
}
static inline void fast_add_val( float * __restrict__ a, long d, const float val) {
int j;
for(j=0; j<d; j++)
a[j] += val;
}
static inline void fast_set_vec( float * __restrict__ dest,
const float * __restrict__ src, int d, const float mul) {
if( mul==1)
memcpy(dest,src,d*sizeof(float));
else {
int j;
for(j=0; j<d; j++)
dest[j] = mul*src[j];
}
}
static inline void fast_add_vec( float * __restrict__ dest,
const float * __restrict__ add, int d, float mul) {
if(d<=4) {
int j;
for(j=0; j<d; j++)
dest[j] += mul*add[j];
} else {
int inc = 1;
saxpy_( &d, &mul, (float*)add, &inc, (float*)dest, &inc );
}
}
static inline void fast_div( float * __restrict__ a, long d, const float div) {
const float divi = 1/div;
// assert( ((long)a & 15) == 0 && (d & 3) == 0 );
// const float _divi4[] = {divi,divi,divi,divi};
// __v4sf *a4 = (__v4sf*)a;
// __v4sf *divi4 = (__v4sf*)_divi4;
// int e = d>>2;
// while(e--) *a4++ *= (*divi4);
int j;
for(j=0; j<d; j++)
a[j] *= divi;
}
static inline float* fast_set_trans( float * dest, const float * src, const float mul,
int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
if(mul==0) {
memset(dest,0,sizeof(float)*(tx+ex)*(ty+ex));
return dest+(tx+ex)*(ty+ex);
}
if(dx>tx) dx=tx; // beyond these values, nothing happens anyway
if(dy>ty) dy=ty;
if(-dx>tx) dx=-tx;
if(-dy>ty) dy=-ty;
#define add_default(n) {fast_set_val(dest,(n),mul*def); dest+=(n);}
float* _dest = dest;
// paste default rows on the -v side
if(dy<0) add_default(-dy*(tx+ex));
src += MAX(0,dx);
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
int j;
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
// paste default cols on the -u side
if(dx<0) add_default(-dx);
// paste image
fast_set_vec(dest,src+j*tx,row_len,mul);
dest += row_len;
// paste default cols on the +u side
if(dx>=0) {
add_default(dx)
if(ex) add_default(ex)
}
}
// paste default rows on the +v side
if(dy>=0) {
add_default(dy*(tx+ex))
if(ex) add_default(ex*(tx+ex))
}
#undef add_default
assert( dest-_dest == (tx+ex)*(ty+ex) );
return dest;
}
static inline float* fast_add_trans( float * dest, const float * src, const float mul,
int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
if(mul==0) return dest+(tx+ex)*(ty+ex);
if(dx>tx) dx=tx; // beyond these values, nothing happens anyway
if(dy>ty) dy=ty;
if(-dx>tx) dx=-tx;
if(-dy>ty) dy=-ty;
#define add_default(n) {fast_add_val(dest,n,def*mul); dest+=n;}
float* _dest = dest;
// paste default rows on the -v side
if(dy<0) add_default(-dy*(tx+ex));
src += MAX(0,dx);
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
int j;
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
// paste default cols on the -u side
if(dx<0) add_default(-dx);
// paste image
fast_add_vec(dest,src+j*tx,row_len,mul);
dest += row_len;
// paste default cols on the +u side
if(dx>=0) {
add_default(dx)
if(ex) add_default(ex)
}
}
// paste default rows on the +v side
if(dy>=0) {
add_default(dy*(tx+ex))
if(ex) add_default(ex*(tx+ex))
}
#undef add_default
assert( dest-_dest == (tx+ex)*(ty+ex) );
return dest;
}
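/* Illustration of the two translation-paste helpers above:
dest(x,y) = mul * src(x+dx, y+dy), written into a (tx+ex) x (ty+ex) buffer,
with mul*def wherever (x+dx, y+dy) falls outside the tx x ty source.
For instance, with tx=ty=3, ex=0, dx=1, dy=0, mul=1 and def=0, the source rows
(a b c / d e f / g h i) become (b c 0 / e f 0 / h i 0): the source is sampled
with an offset of (+1,0) and the uncovered last column is filled with def.
*/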
static inline void norm_norm( float* norms, int nb, float mode ) {
int i;
if( mode < 0 )
assert(!"error: unknown norm mode");
else if( mode == 0.5 ) {
for(i=0; i<nb; i++)
norms[i] = sqrt(sqrt(norms[i]));
} else if( mode < 1 ) {
mode *= 0.5; // norms[] holds squared sums, so halve the exponent to get (L2-norm)**mode
for(i=0; i<nb; i++)
norms[i] = pow(norms[i], mode);
} else if( mode == 1 ) {
for(i=0; i<nb; i++)
norms[i] = sqrt(norms[i]);
} else if( mode > 1 )
assert(!"error: unknown norm mode");
}
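/* Note: norm_norm is called here on accumulated squared sums s = sum(x*x),
so the value it leaves in norms[] is a power of the L2 norm:
mode == 1    -> sqrt(s)        = ||x||        (plain L2 normalization)
mode == 0.5  -> sqrt(sqrt(s))  = ||x||^0.5
0 < mode < 1 -> pow(s, mode/2) = ||x||^mode
which matches the "(L2-norm)**<norm>" convention documented below.
*/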
/* normalize each pixel of a multi-layer image
norm = {0:nothing, 1:L2-normalization, 0-1: normalization by (L2-norm)**<norm> }
*/
void norm_layers( float_layers* res, float norm, int n_thread ) {
if(norm==0) return;
const int layer_size = res->tx*res->ty;
const int n_layers = res->tz;
float* norms = NEWAC(float,layer_size);
long l;
for(l=0; l<n_layers; l++) {
float* r = res->pixels + l*layer_size;
int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(i=0; i<layer_size; i++)
norms[i] += r[i]*r[i];
}
norm_norm( norms, layer_size, norm );
for(l=0; l<n_layers; l++) {
float* r = res->pixels + l*layer_size;
int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(i=0; i<layer_size; i++)
r[i] /= norms[i]+1e-8;
}
free(norms);
}
/* Return the vectorized dimension of a HOG patch
*/
int get_patch_desc_dim( float_layers* hog, int patch_size )
{
return patch_size*patch_size * hog->tz; // number of dimensions of an atomic patch descriptor
}
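/* For instance, with patch_size = 4 and a HOG image of hog->tz = 8 orientation
channels, an atomic patch descriptor has 4*4*8 = 128 dimensions. */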
/* Sample a set of patches from a HOG image.
grid : array of (x,y) position of the patches
size: size of the patches, i.e. [x,x+size[ x [y,y+size[
res: result array, n_patches x desc_dim
desc_dim = n_layers * size**2
norms: result, n_patches x 1, norm of each patch
*/
void _sample_patches( float_layers* hog, float_layers* color, int_image* grid, int size, float norm,
float_image* res, float_array* norms, int n_thread ) {
const int tx = hog->tx;
const long npix = tx*hog->ty;
assert( grid->tx == 2 );
const int n_patches = grid->ty;
assert( res->ty == n_patches );
const int n_layers = hog->tz;
const int n_colors = (color? color->tz: 0);
const int color_npix = (color? color->tx*color->ty: 0);
const int desc_size = size*size*n_layers + (color? color->tz: 0);
assert(res->tx == desc_size );
int n;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int *p = grid->pixels + 2*n;
// copy hog
int x=p[0],y=p[1];
assert(0<=x && x+size<=tx);
assert(0<=y && y+size<=hog->ty);
int l,j;
for(l=0; l<n_layers; l++) {
float* h = hog->pixels + l*npix + y*tx + x;
for(j=0; j<size; j++) {
memcpy(r, h, size*sizeof(float));
h += tx;
r += size;
}
}
if(!color) continue;
// copy color at the patch center (row stride is color->tx)
float* c = color->pixels + (y+size/2)*color->tx + (x+size/2);
for(l=0; l<n_colors; l++)
*r++ = c[l*color_npix];
}
if(norm) {
float* normp = norms ? norms->pixels : NEWAC(float, n_patches);
if(norms) {
assert(norms->tx==n_patches);
memset(normp,0,n_patches*sizeof(float));
}
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int l;
for(l=0; l<desc_size; l++)
normp[n] += r[l]*r[l];
}
norm_norm( normp, n_patches, norm );
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float *r = res->pixels + desc_size*n;
int l;
float nn = normp[n]+1e-8;
for(l=0; l<desc_size; l++)
r[l] /= nn;
}
if(!norms) free(normp);
}
}
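/* Layout of each descriptor row produced above: for every HOG layer in turn,
the size x size window is copied row by row, then (if a color image is given)
the n_colors values at the patch center are appended; when norm != 0, the row
is finally divided by a power of its L2 norm (see norm_norm). */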
static inline int retrieve_children( const int x, const int y, const int_cube* child_grid ) {
const int size0_div2 = child_grid->pixels[0];
const int step0 = child_grid->tx==1 && child_grid->ty==1 ? 1 :
MAX( child_grid->pixels[2]-child_grid->pixels[0],
child_grid->pixels[1+2*child_grid->tx]-child_grid->pixels[1] );
int i = (x-size0_div2)/step0;
int j = (y-size0_div2)/step0;
assert( x==(i*step0+size0_div2) || !"error: child_grid does not match current grid" );
assert( y==(j*step0+size0_div2) || !"error: child_grid does not match current grid" );
if( i<0 || i>=child_grid->tx ) return -1;
if( j<0 || j>=child_grid->ty ) return -1;
return i+j*child_grid->tx;
}
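/* Example: if the child grid was generated with offset size0/2 = 4 and step
step0 = 8, the child at position x = 20 has index i = (20-4)/8 = 2; coordinates
that are not exactly of the form offset + i*step trip the asserts above, and
coordinates outside the child grid return -1. */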
/* Prepare a grid of cell positions in the first image for a given scale. Big cells are built from the cells at the previous scale.
size = size of cells at current scale
offset, step = grid generator: (offset + i*step, offset + j*step)
child_grid = grid of the previous layer (or None if first layer)
child_norms = image containing the norms of the patches at the previous level
grid = resulting center positions of cells at the current scale
children = indices of the cells at the previous scale used to construct the big cells
norms = norms of the cells of this level
*/
void _prepare_big_cells( int size, int offset, int step,
int_cube* child_grid, float_image* child_norms,
int_cube* grid, int_cube* children, float_image* norms ) {
assert(grid->tz==2);
const int ntx = grid->tx; // should be == 1+(tx-size)/step so that patches do not pass the border
const int nty = grid->ty; // should be == 1+(ty-size)/step so that patches do not pass the border
/* grid[i,j] = ( offset + i*step, offset + j*step )
connection between two scales:
x cell position in lower scale == x position of children in upper scale
child_offset + child_i*child_step = offset + i*step + (2*u/(nc-1)-1)*size/4
*/
int i,j,u,v;
int* r = grid->pixels;
if( !child_grid ) {
// this is the first scale:
// we just return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
for(j=0; j<nty; j++)
for(i=0; i<ntx; i++) {
*r++ = offset + i*step;
*r++ = offset + j*step;
}
} else {
assert(child_grid->tz==2);
ASSERT_SAME_SIZE( child_grid, child_norms );
assert( children );
const int nc = sqrt(children->tz); // number of children per row or col
assert( children->tz==pow2(nc) );
ASSERT_SAME_SIZE( grid, children );
ASSERT_SAME_SIZE( grid, norms );
// this is at least second scale
// we return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
const int quarter = size/4;
assert(4*quarter==size);
int* c = children->pixels;
float *n = norms->pixels;
memset(n,0,ntx*nty*sizeof(float));
for(j=0; j<nty; j++)
for(i=0; i<ntx; i++) {
int x = offset + i*step;
int y = offset + j*step;
*r++ = x;
*r++ = y;
// accumulate norms of the 2x2 or 3x3 children
for(v=0; v<nc; v++)
for(u=0; u<nc; u++,c++) {
// we want to index the children at position:
// ( center_x + (2*u/(nc-1)-1)*size/4, center_y + (2*v/(nc-1)-1)*size/4 )
*c = retrieve_children( x+(2*u/(nc-1)-1)*quarter, y+(2*v/(nc-1)-1)*quarter, child_grid );
if(*c>=0) *n += child_norms->pixels[*c];
}
n++;
}
}
}
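/* Example: with nc = 2 children per side and a cell of size 16 (quarter = 4),
2*u/(nc-1)-1 takes the values -1 and +1, so the four children of a cell
centered at (x,y) are looked up at (x-4,y-4), (x+4,y-4), (x-4,y+4) and
(x+4,y+4); the norm of the new cell is the sum of the norms of the children
that actually exist (index >= 0). */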
/* Prepare image for dotprod : dot(patches, res)
where patches is n_patches x patch_dim
pixels outside the image are set to (0,...,ninth_val)
*/
void _prepare_dotprod_convolution( float_layers* img, int patch_size, float ninth_val, int extend,
float_layers* res, int n_thread ) {
assert( img->tx+extend == res->tx );
assert( img->ty+extend == res->ty );
const int n_layers = img->tz;
const int tx = img->tx;
const int ty = img->ty;
const int npix = tx*ty;
const int npixex = (tx+extend)*(ty+extend);
assert( res->tz==patch_size*patch_size*img->tz );
long l;
const int first_half = patch_size/2; // half-size
const int second_half = patch_size - first_half;
const int layer_size = patch_size*patch_size*npixex;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* img_pix = img->pixels + l*npix;
float* r = res->pixels + l*layer_size;
int u,v;
// copy translated version of the image into res
for(v=-first_half; v<second_half; v++)
for(u=-first_half; u<second_half; u++)
r = fast_set_trans( r, img_pix, 1, u, v, tx, ty, extend, l+1<n_layers? 0 : ninth_val );
}
}
float_layers* prepare_dotprod_convolution( float_layers* hog, int patch_size, int extend, float norm, int nt )
{
assert(0<=extend and extend<=1);
const int nh = get_patch_desc_dim(hog,patch_size);
const int etx = hog->tx+extend; // extend a bit the image
const int ety = hog->ty+extend;
float_layers* res = NEW(float_layers);
*res = empty_layers(float,etx,ety,nh);
float ninth_val = 0;
_prepare_dotprod_convolution( hog, patch_size, ninth_val, extend, res, nt );
if( norm ) norm_layers( res, norm, nt );
return res;
}
inline float sum_array_f(const float* a, int n) {
int i=n;
double res = 0;
while(i--) res+=a[i];
return (float)res;
}
extern "C" {
int sgemm_(char *transa, char *transb, integer *m, integer *
n, integer *k, float *alpha, float *a, integer *lda, float *b, integer *
ldb, float *beta, float *c, integer *ldc);
}
/* matrix-matrix multiplication with several SGEMM (each is single-threaded)
res = dot(patches, convolved_hog)
shapes: (P x npix) = (P x nh) . (nh x npix)
*/
void _dotprod( float_image* patches, float_layers* convolved_hog, float_layers* res, int n_thread ) {
int nh = patches->tx;
assert( nh == convolved_hog->tz );
ASSERT_SAME_IMG_SIZE( convolved_hog, res );
int P = patches->ty;
assert( res->tz == P );
int threadP = 1 + (P-1) / n_thread; // how many patches per thread
int npix = (int)IMG_SIZE(convolved_hog);
int l;
#if (defined(USE_OPENMP) && !defined(MULTITHREADED_BLAS))
#pragma omp parallel for num_threads(n_thread)
#else
n_thread = 1; // BLAS is already multithreaded
threadP = P;
#endif
for(l=0; l<n_thread; l++) {
// we do dotprod( patches[l*threadP : (l+1)*threadP], convolved_hog )
long start = l*threadP;
long end = MIN(P,(l+1)*threadP);
int np = int(end - start);
float* p = patches->pixels + nh*start;
float* r = res->pixels + npix*start;
// blas fast matrix-matrix product
char T='n'; float alpha = 1, beta = 0;
sgemm_( &T, &T, &npix, &np, &nh, &alpha,
convolved_hog->pixels, &npix,
p, &nh, &beta, r, &npix);
}
}
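/* Note on the sgemm_ call above: the arrays are stored row-major while BLAS is
column-major, so the row-major product res[np x npix] = patches[np x nh] *
convolved_hog[nh x npix] is obtained as the column-major product
res^T = convolved_hog^T * patches^T, i.e. sgemm('n','n', m=npix, n=np, k=nh)
with convolved_hog as A (lda=npix), the patch block as B (ldb=nh) and the
result block as C (ldc=npix). */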
inline void transpose_scalar_block(const float *A, float *B, const int lda, const int ldb,
const int block_row, const int block_col) {
for(int i=0; i<block_row; i++)
for(int j=0; j<block_col; j++)
B[j*ldb + i] = A[i*lda +j];
}
// Transpose A (N rows by M cols) into B (M by N)
void transpose_matrix(const float_image* A, float_image* B, int nt) {
const int n = A->ty, m = A->tx;
assert( n==B->tx && m==B->ty );
const int block_size = 16;
const float* pA = A->pixels;
float* pB = B->pixels;
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(nt)
#endif
for(int i=0; i<n; i+=block_size)
for(int j=0; j<m; j+=block_size)
transpose_scalar_block(&pA[i*m +j], &pB[j*n + i], m, n, MIN(block_size, n-i), MIN(block_size, m-j));
}
extern "C" {
int sgemv_(char *transa, integer *m, integer * n,
float *alpha, float *a, integer *lda,
float *b, integer * ldb, float *beta,
float *c, integer * ldc);
}
/* convolution of each patch within a local neighborhood
ngh_rad = max translation
neighborhood has size 2*ngh_rad
patch at (x,y) is compared to patches in [y-ngh_rad : y+ngh_rad,
x-ngh_rad : x+ngh_rad]
*/
void _dotprod_ngh_rad_T( int_cube* grid, float_image* patches, int ngh_rad,
float_cube* convolved_hog, float_layers* res_out,
int_image* offsets, int n_thread ) {
int nh = patches->tx;
assert( nh == convolved_hog->tz );
const int P = patches->ty;
assert( IMG_SIZE(grid)==P && grid->tz==2 );
const int tx = convolved_hog->tx;
const int ty = convolved_hog->ty;
// neighborhood size
int res_tx = MIN(tx,2*ngh_rad);
int res_ty = MIN(ty,2*ngh_rad);
assert(res_tx<tx-1 || res_ty<ty-1 || !"ngh_rad is too large and results in loss of perf. Set ngh_rad=0 instead.");
int res_npix = res_tx * res_ty;
// allocate result
*res_out = empty_layers(float, res_tx, res_ty, P);
assert(res_out->pixels || !"error: ran out of memory before sgemm");
*offsets = empty_image(int, 2, P);
char T='t'; float alpha=1, beta=0; int one=1;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(int j=0; j<res_ty; ++j) {
// By organizing loops this way,
// we exploit overlap between patches.
for(int l=0; l<P; l++) {
float* p = patches->pixels + l*nh;
float* r = res_out->pixels + l*res_npix;
int left = MAX(0, MIN(grid->pixels[2*l+0] - ngh_rad, tx-2*ngh_rad));
int top = MAX(0, MIN(grid->pixels[2*l+1] - ngh_rad, ty-2*ngh_rad));
if(j==0) {
offsets->pixels[2*l+0] = left;
offsets->pixels[2*l+1] = top;
}
float* c = convolved_hog->pixels + (left + top*tx)*nh;
// blas fast matrix-vector product
sgemv_( &T, &nh, &res_tx, &alpha, c + j*tx*nh, &nh,
p, &one, &beta, r + j*res_tx, &one);
}
}
}
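/* Note on the loops above: for each patch l, the sgemv_ call computes one row j
of its res_tx x res_ty correlation map by multiplying the nh-dimensional patch
descriptor with the res_tx pixel descriptors stored contiguously (stride nh) in
the transposed convolved HOG, starting at the per-patch window origin
(left, top); iterating over j in the outer loop lets neighboring patches reuse
the same rows of convolved_hog. */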
/* correct the convolution on the boundaries of the image
ttx, tty: true shape of the res_map (when offsets are used)
*/
void rectify_conv( int patch_size, int nori, float_image* patches, int_image* offsets,
const int ttx, const int tty, int extend, float_layers* res, int n_thread ) {
const int n_patches = patches->ty;
assert( n_patches == res->tz );
//const int nori = patches->tx/pow2(patch_size);
assert( patches->tx >= nori*pow2(patch_size) );
const int tx = res->tx; // real true shape because it has been extended
const int ty = res->ty;
const int first_half = patch_size/2;
const int second_half = patch_size - first_half; // in case patch_size is odd
assert( offsets || (ttx==tx && tty==ty) );
assert( !offsets || (ttx>=tx && tty>=ty) );
assert( !offsets || (offsets->ty==res->tz && offsets->tx==2) );
const long npix = IMG_SIZE(res);
int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_patches; l++) {
// load offsets
const int offi = offsets ? offsets->pixels[2*l+0] : 0;
const int offj = offsets ? offsets->pixels[2*l+1] : 0;
float sums[8]; // temporary norm of columns or rows
assert( patch_size <= (int)(sizeof(sums)/sizeof(sums[0])) );
int o,i,j;
// horizontal boundaries
memset(sums,0,sizeof(sums));
float* p = patches->pixels + l*patches->tx;
for(o=0; o<nori; o++)
for(j=0; j<patch_size; j++)
for(i=0; i<patch_size; i++)
sums[j] += pow2(*p++);
float old_norm = sqrt(sum_array_f(sums,patch_size));
if( old_norm==0 ) continue;
// upper boundary
for(j=offj; j<first_half; j++) {
float new_norm = sqrt(sum_array_f(sums+(first_half-j),second_half+j)); // sums to patch_size
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (j-offj)*tx;
for(i=0; i<tx; i++) {
r[i] *= mul;
//assert(r[i]<1.1);
}
}
// lower boundary
for(j=tty-extend+1-second_half; j<offj+ty; j++) {
float new_norm = sqrt(sum_array_f(sums,first_half+tty-extend-j)); // sums to patch_size
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (j-offj)*tx;
for(i=0; i<tx; i++) {
r[i] *= mul;
//assert(r[i]<1.1);
}
}
// vertical boundaries
memset(sums,0,sizeof(sums));
p = patches->pixels + l*patches->tx;
for(o=0; o<nori; o++)
for(j=0; j<patch_size; j++)
for(i=0; i<patch_size; i++)
sums[i] += pow2(*p++);
// left boundary
for(i=offi; i<first_half; i++) {
float new_norm = sqrt(sum_array_f(sums+(first_half-i),second_half+i));
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (i-offi);
for(j=0; j<ty; j++) {
r[j*tx] *= mul;
//assert(r[j*tx]<1.1);
}
}
// right boundary
for(i=ttx-extend+1-second_half; i<offi+tx; i++) {
float new_norm = sqrt(sum_array_f(sums,first_half+ttx-extend-i));
float mul = old_norm / (new_norm + 1e-8);
float* r = res->pixels + l*npix + (i-offi);
for(j=0; j<ty; j++) {
r[j*tx] *= mul;
//assert(r[j*tx]<1.1);
}
}
// because we over-estimated the rectification for the corners, check that they do not exceed old_norm
float* r = res->pixels + l*npix;
for(j=offj; j<first_half; j++) {
for(i=offi; i<first_half; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
for(i=ttx-extend+1-second_half; i<offi+tx; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
}
for(j=tty-extend+1-second_half; j<offj+ty; j++) {
for(i=offi; i<first_half; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
for(i=ttx-extend+1-second_half; i<offi+tx; i++)
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
}
}
}
/* Compute the correlation of all patches with the second image (hog).
In case of ngh_rad>0, the correlation is only computed in a small local neighborhood
(whose size is parameterized by ngh_rad).
if extend: width and height of output maps are extended
if norm: correlations are normalized afterwards.
*/
void fastconv( float_image* patches, float_layers* hog, int patch_size, int ngh_rad,
int extend, float norm, int nt, res_scale* res ) {
assert(0<=extend and extend<=1);
float_layers* convolved_hog = prepare_dotprod_convolution( hog, patch_size, extend, norm, nt );
assert( patches->tx==convolved_hog->tz);
res->true_shape[0] = convolved_hog->tx;
res->true_shape[1] = convolved_hog->ty;
//hash_layers(convolved_hog)
int_image* offsets = NULL;
if( ngh_rad == 0 ) { // no limit on translation
// allocate result
res->res_map = empty_layers(float, convolved_hog->tx, convolved_hog->ty, patches->ty);
assert(res->res_map.pixels || !"error: ran out of memory before sgemm");
// multi-threaded fast matrix product
_dotprod( patches, convolved_hog, &res->res_map, nt );
} else { // ngh_rad>0: cropping res_map
offsets = &res->offsets;
// transpose hog: _dotprod is much faster this way
float_cube convolved_hog_T = empty_cube(float, convolved_hog->tx, convolved_hog->ty, convolved_hog->tz);
{ float_image A = reshape_xy_z(float, convolved_hog); // cast to 2D matrix without copy
float_image B = reshape_z_xy(float, &convolved_hog_T);
transpose_matrix( &A, &B, nt);
}
//hash_cube(&convolved_hog_T)
// resized grid
int_cube fgrid = cube_like(int, &res->grid);
for(int i=0; i<CUBE_SIZE(&fgrid); i++)
fgrid.pixels[i] = res->grid.pixels[i]/res->f;
//hash_cube(&fgrid)
// multi-threaded fast matrix product
_dotprod_ngh_rad_T( &fgrid, patches, ngh_rad, &convolved_hog_T, &res->res_map, offsets, nt );
free(fgrid.pixels);
free(convolved_hog_T.pixels);
//hash_image(offsets)
}
free_layers(convolved_hog);
// correct border effects on the correlation maps
rectify_conv( patch_size, hog->tz, patches, offsets, res->true_shape[0], res->true_shape[1],
extend, &res->res_map, nt );
}
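/* Summary of fastconv: 1) prepare_dotprod_convolution stacks translated copies
of the HOG channels (one per position inside a patch); 2) every patch is
correlated with that stack, either over the whole image (_dotprod) or only
inside a 2*ngh_rad window around its own position (_dotprod_ngh_rad_T, which
also records the per-patch offsets); 3) rectify_conv renormalizes the borders
of each correlation map. */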
/* Compute: arr **= p
*/
void fastipow( float_layers* arr, const float p, int n_thread ) {
const int n_layers = arr->tz;
const long npix = arr->tx*arr->ty;
int l;
// optimization: precompute some values of pow(x,p)
const int npc = 64;
float precom[npc+1];
for(l=0; l<=npc; l++) precom[l]= pow(l/(float)npc,p);
const float maxindex = npc - 0.001;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* a = arr->pixels + l*npix;
int i;
for(i=0; i<npix; i++) {
// arr[i] = pow(arr[i],p);
float v = a[i]*npc;
assert( v>=0 && v<npc+1 );
if(v>maxindex) v=maxindex;
int n = int(v);
float w = v-n;
a[i] = (1-w)*precom[n] + w*precom[n+1];
}
}
}
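/* Worked example of the interpolation above, assuming inputs in [0,1]: with
npc = 64 and a[i] = 0.5, v = 32.0, so n = 32, w = 0 and the result is exactly
precom[32] = pow(0.5, p); for a[i] = 0.51, v = 32.64 and the result is
0.36*precom[32] + 0.64*precom[33], a linear blend of the two nearest table
entries. */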
/* Compute: arr = max(0,(arr-p)/(1-p))
*/
void fasthinge( float_layers* arr, const float p, int n_thread ) {
const int n_layers = arr->tz;
const long npix = arr->tx*arr->ty;
int l;
const float f = 1/(1-p);
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* a = arr->pixels + l*npix;
int i;
for(i=0; i<npix; i++) {
float v = a[i];
a[i] = MAX(0,f*(v-p));
}
}
}
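/* Worked example: with p = 0.25, a value of 0.25 maps to 0, 0.5 maps to
(0.5-0.25)/(1-0.25) = 1/3, and 1.0 maps to 1; everything below the threshold p
is clipped to 0. */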
inline int max_array_i(const int* a, int n) {
int i=n;
int res = INT_MIN;
while(i--) if(a[i]>res) res=a[i];
return res;
}
/* Normalize weights in border areas of width <gap>.
There are 9 areas: top-left, top-middle, top-right, ..., bottom-right.
sum_divf indicates the current weight in those areas, i.e. values in the area
should be divided by the weight. But trans_inv allows controlling the amount of
normalization: 0 = no normalization, 1 = full normalization
*/
static inline void normalize_trans(const int tx, const int ty, const int gap, float* rmap,
const float trans_inv, float sum_divf[9] ) {
if( trans_inv == 0 ) return;
int i,j;
for(i=0; i<9; i++) {
if( sum_divf[i]>0 )
sum_divf[i] = 1/pow(sum_divf[i], trans_inv); // if trans_inv==1, no effect
}
for(j=0; j<gap; j++) {
if(sum_divf[0])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[0];
if(sum_divf[1])
for(i=gap; i<tx-gap; i++)
rmap[j*tx+i] *= sum_divf[1];
if(sum_divf[2])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[2];
}
for(; j<ty-gap; j++) {
if(sum_divf[3])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[3];
if(sum_divf[5])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[5];
}
for(; j<ty; j++) {
if(sum_divf[6])
for(i=0; i<gap; i++)
rmap[j*tx+i] *= sum_divf[6];
if(sum_divf[7])
for(i=gap; i<tx-gap; i++)
rmap[j*tx+i] *= sum_divf[7];
if(sum_divf[8])
for(i=tx-gap; i<tx; i++)
rmap[j*tx+i] *= sum_divf[8];
}
}
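/* Note: the nine entries of sum_divf correspond to a 3x3 partition of the map
into corner, edge and center areas of width <gap>, indexed as 4 + 3*j + i with
i,j in {-1,0,1} (0 = top-left, 4 = center, 8 = bottom-right); each area is
multiplied by pow(sum_divf[k], -trans_inv), i.e. fully divided by its
accumulated weight when trans_inv == 1. */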
/* Compute the (sparse) convolutions specified by <children> on <map> and put the result in <res>.
A standard order is assumed on the children:
a response map #p is built from the children[p] at positions
[(gap*dx,gap*dy) for dy in dys for dx in dxs]
where dxs = [-1,1] or [-1,0,1]
dys = [-1,1] or [-1,0,1]
child_assign denotes the assignment at the children level, while assign is for the next level
child_norms contains the norms of the small patches, and norms contains the norms of the new big cells
*/
int _sparse_conv( int_image* children, int_array* child_assign, int gap, float trans_inv,
float_layers* child_map, int_image* offsets, float_array* child_norms, float_array* norms,
int_array* assign, float_layers* res, int_image* res_offsets, int n_thread ) {
const int nconv = children->ty; // number of convolutions to perform
const int nc2 = children->tx;
const int nc = sqrt(nc2);
assert( nc*nc == nc2 );
assert( res->tz == nconv );
const int tx = child_map->tx;
const int ty = child_map->ty;
const long npix = tx*ty;
ASSERT_SAME_SIZE( child_map, res );
const int n_lower_conv = max_array_i(children->pixels,nconv*nc2)+1;
int* cass = child_assign ? child_assign->pixels : NEWA(int,n_lower_conv);
if(!child_assign) {for(int i=0; i<n_lower_conv; i++) cass[i]=i;}
assert( !offsets || (offsets->pixels && offsets->tx==2 && offsets->ty==n_lower_conv &&
res_offsets && res_offsets->tx==2 && res_offsets->ty==nconv) );
if(assign) {
assert(0); // not supposed to happen
} else {
// normal case: no redundancy to exploit in response maps
int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<nconv; l++) {
float *rmap = res->pixels + l*npix;
int u,v,c,ncall=0; // children number
const int* const child = children->pixels + l*nc2;
float sum_divf[9];
memset(sum_divf,0,sizeof(sum_divf));
int i,j;
// first, choose an offset for the result rmap from the child offsets
int offx=0, offy=0;
if( offsets ) {
int sum_ox=0, sum_oy=0, w=0;
for(c=v=0; v<nc; v++) {
int dy = (2*v/(nc-1)-1);
for(u=0; u<nc; u++,c++) {
int dx = (2*u/(nc-1)-1);
if(child[c]<0 || cass[child[c]]<0) continue;
sum_ox += offsets->pixels[2*child[c]+0] - dx*gap;
sum_oy += offsets->pixels[2*child[c]+1] - dy*gap;
w++;
}
}
if(w==0) w++; // just in case
offx = (int)floor(0.5 + sum_ox/float(w));
offy = (int)floor(0.5 + sum_oy/float(w));
// store result for later
res_offsets->pixels[2*l+0] = offx;
res_offsets->pixels[2*l+1] = offy;
}
for(c=v=0; v<nc; v++) {
int dy = (2*v/(nc-1)-1);
for(u=0; u<nc; u++,c++) {
int dx = (2*u/(nc-1)-1);
if(child[c]<0 || cass[child[c]]<0) continue;
float divf = child_norms->pixels[child[c]]/norms->pixels[l];
// difference with rmap's offset
const int trans_x = dx*gap + (offsets? offx - offsets->pixels[2*child[c]+0] : 0);
const int trans_y = dy*gap + (offsets? offy - offsets->pixels[2*child[c]+1] : 0);
// count the sum of weights in every image area
for(i=-1; i<=1; i++)for(j=-1; j<=1; j++)
if(i*trans_x<=0 && j*trans_y<=0)
sum_divf[4+j*3+i] += divf;
// add a translated version of map[children[c]] by (ox-dx,oy-dy)
if(ncall++==0) // first call
fast_set_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf, trans_x,trans_y, tx,ty, 0, 0 );
else
fast_add_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf, trans_x,trans_y, tx,ty, 0, 0 );
}
}
if( ncall == 0) // default = zeros
memset(rmap, 0, npix*sizeof(float));
// now we are supposed to rectify the boundaries (to perfect convolution)
normalize_trans(tx, ty, gap, rmap, trans_inv, sum_divf );
//assert(min_array_f(rmap,npix)>=0 && max_array_f(rmap,npix)<=1.001);
}
}
if(!child_assign) free(cass);
#define CHECK_MAPS(rmaps) assert(min_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))>=0 && \
max_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))<=1.001)
//CHECK_MAPS(res);
return nconv;
}
