Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F88386302
cudpp_maximal_launch.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Oct 18, 13:04
Size
3 KB
Mime Type
text/x-c
Expires
Sun, Oct 20, 13:04 (2 d)
Engine
blob
Format
Raw Data
Handle
21763456
Attached To
rLAMMPS lammps
cudpp_maximal_launch.cpp
View Options
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_maximal_launch.h"
// Returns the smaller of two sizes; when both are equal, x is returned.
inline size_t min(size_t x, size_t y)
{
    if (y < x)
    {
        return y;
    }
    return x;
}
// Returns the larger of two sizes; when both are equal, x is returned.
inline size_t max(size_t x, size_t y)
{
    if (y > x)
    {
        return y;
    }
    return x;
}
// Ceiling division: returns how many chunks of size f are needed to cover x,
// i.e. x / f rounded up to the next whole number. f must be non-zero.
//
// Computed as quotient plus a remainder test rather than (x + f - 1) / f so
// that the intermediate sum cannot wrap around when x is close to the largest
// representable size_t.
inline size_t multiple(size_t x, size_t f)
{
    return x / f + ((x % f != 0) ? 1 : 0);
}
// Spreadsheet-style CEIL(): rounds x up to the nearest multiple of f.
// The number of f-sized chunks covering x is obtained from multiple() and
// then scaled back up by f.
inline size_t ceiling(size_t x, size_t f)
{
    const size_t chunks = multiple(x, f);
    return chunks * f;
}
// Estimates the maximum number of thread blocks (CTAs) of a kernel that can
// be resident on the whole device at the same time.
//
// attribs               - attributes of the kernel (register count and static
//                         shared memory size are read)
// devprop               - properties of the device the kernel will run on
// bytesDynamicSharedMem - dynamic shared memory requested per block, in bytes
// threadsPerBlock       - number of threads per block
//
// Returns multiProcessorCount times the per-multiprocessor block limit, where
// the latter is the smallest of the register, shared memory, thread count and
// fixed blocks-per-SM limits. Returns 0 when threadsPerBlock is 0 or the
// device reports a non-positive warp size, since no block can be formed and
// the calculation below would otherwise divide by zero.
//
// This is equivalent to the calculation done in the CUDA Occupancy Calculator
// spreadsheet.
extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs,
                 cudaDeviceProp &devprop,
                 size_t bytesDynamicSharedMem,
                 size_t threadsPerBlock)
{
    // Both values are used as divisors further down.
    if (threadsPerBlock == 0 || devprop.warpSize <= 0)
    {
        return 0;
    }

    // Devices with compute capability major < 2 and minor < 2 use the smaller
    // register allocation unit and thread limit.
    const bool smallLimits = (devprop.major < 2 && devprop.minor < 2);

    const unsigned int regAllocationUnit      = smallLimits ? 256 : 512;  // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit     = 512;                      // in bytes
    const unsigned int maxThreadsPerSM        = smallLimits ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxBlocksPerSM         = 8;

    // Number of warps needed to hold threadsPerBlock threads (rounded up)
    size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    // Number of regs is regs per thread times number of warps times warp size
    size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
    // Round up to multiple of register allocation unit size
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);

    // Static plus dynamic shared memory, rounded up to the allocation unit
    size_t smemBytes  = attribs.sharedSizeBytes + bytesDynamicSharedMem;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    // A resource the kernel does not use imposes no limit beyond maxBlocksPerSM
    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;

    return devprop.multiProcessorCount *
           min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
}
// Convenience wrapper around maxBlocks() for a kernel given by pointer.
//
// Queries the current device and its properties, then the attributes of the
// kernel, and forwards everything to maxBlocks(). If any of the CUDA runtime
// queries fails, -1 is returned; because the return type is size_t, callers
// see this as the largest representable size_t value.
extern "C"
size_t maxBlocksFromPointer(void *kernel,
                            size_t bytesDynamicSharedMem,
                            size_t threadsPerBlock)
{
    cudaDeviceProp devprop;
    int deviceID = -1;

    // Which device is active on this host thread?
    cudaError_t err = cudaGetDevice(&deviceID);
    if (err != cudaSuccess)
    {
        return -1;
    }

    err = cudaGetDeviceProperties(&devprop, deviceID);
    if (err != cudaSuccess)
    {
        return -1;
    }

    cudaFuncAttributes attr;
    err = cudaFuncGetAttributes(&attr, (const char *)kernel);
    if (err != cudaSuccess)
    {
        return -1;
    }

    return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
}
Event Timeline
Log In to Comment