#include <unordered_set>
static void check_nvrtc_error(nvrtcResult result,
                              const std::string &name) {
    assert(result == NVRTC_SUCCESS && nvrtcGetErrorString(result));
}
static void check_error(CUresult result,
                        const std::string &name) {
    const char *error;
    cuGetErrorString(result, &error);
    if (result != CUDA_SUCCESS) {
        std::cerr << name << " " << std::string(error) << std::endl;
    }
    assert(result == CUDA_SUCCESS && error);
}
static CUresult cuda_init() {
    const CUresult result = cuInit(0);
    check_error(result, "cuInit");
    return result;
}

static const CUresult result = cuda_init();
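// gpu::cuda_context: JIT context for one CUDA device. It owns the primary
// context, a work stream, the NVRTC-compiled module, and a map from graph
// nodes to the device buffers that back them.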
template<jit::float_scalar T, bool SAFE_MATH=false>
std::map<graph::leaf_node<T, SAFE_MATH> *, CUdeviceptr> kernel_arguments;
#ifdef USE_CUDA_TEXTURES
std::map<void *, CUtexObject> texture_arguments;
#endif
CUdeviceptr result_buffer;
CUdeviceptr offset_buffer;
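// Check an asynchronous call, then synchronize the stream so any failure is
// reported against the call that queued the work.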
void check_error_async(CUresult result,
                       const std::string &name) {
    check_error(result, name);
    std::string async_name = name + "_async";
    check_error(cuStreamSynchronize(stream), async_name);
}
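// max_concurrency: the number of detected CUDA devices bounds how many
// contexts can run concurrently.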
check_error(cuDeviceGetCount(&count), "cuDeviceGetCount");
// Constructor: bind the primary context of the selected device, prefer L1
// cache, create the work stream, and query the device's constant memory.
cuda_context(const size_t index) : result_buffer(0), module(0), offset_buffer(0) {
    check_error(cuDeviceGet(&device, index), "cuDeviceGet");
    check_error(cuDevicePrimaryCtxRetain(&context, device),
                "cuDevicePrimaryCtxRetain");
    check_error(cuCtxSetCurrent(context), "cuCtxSetCurrent");
    check_error(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1),
                "cuCtxSetCacheConfig");
    check_error(cuStreamCreate(&stream, CU_STREAM_DEFAULT),
                "cuStreamCreate");
    check_error(cuDeviceGetAttribute(&remaining_const_memory,
                                     CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                                     device),
                "cuDeviceGetAttribute");
// Destructor: unload the module and free every device allocation, texture
// object and array, the result and offset buffers, the stream, and the
// primary context.
    check_error(cuModuleUnload(module), "cuModuleUnload");

    for (auto &[key, value] : kernel_arguments) {
        check_error(cuMemFree(value), "cuMemFree");
    }

#ifdef USE_CUDA_TEXTURES
    for (auto &[key, value] : texture_arguments) {
        CUDA_RESOURCE_DESC resource;
        check_error(cuTexObjectGetResourceDesc(&resource, value),
                    "cuTexObjectGetResourceDesc");
        check_error(cuArrayDestroy(resource.res.array.hArray), "cuArrayDestroy");
        check_error(cuTexObjectDestroy(value), "cuTexObjectDestroy");
    }
#endif

    check_error(cuMemFree(result_buffer), "cuMemFree");
    check_error(cuMemFree(offset_buffer), "cuMemFree");
    check_error(cuStreamDestroy(stream), "cuStreamDestroy");
    check_error(cuDevicePrimaryCtxRelease(device), "cuDevicePrimaryCtxRelease");
// compile: build the generated source with NVRTC for this device's compute
// capability, dump the log on failure, then load the PTX with cuModuleLoadDataEx.
void compile(const std::string kernel_source,
             std::vector<std::string> names,
             const bool add_reduction=false) {
    if (add_reduction) {
        names.push_back("max_reduction");
    }

    nvrtcProgram kernel_program;
    check_nvrtc_error(nvrtcCreateProgram(&kernel_program,
                                         kernel_source.c_str(),
                                         NULL, 0, NULL, NULL),
                      "nvrtcCreateProgram");

    for (std::string &name : names) {
        check_nvrtc_error(nvrtcAddNameExpression(kernel_program, name.c_str()),
                          "nvrtcAddNameExpression");
    }
    check_nvrtc_error(nvrtcAddNameExpression(kernel_program, /* ... */),
                      "nvrtcAddNameExpression");
    std::ostringstream arch;
    int compute_version;
    check_error(cuDeviceGetAttribute(&compute_version,
                                     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                     device),
                "cuDeviceGetAttribute");
    arch << "--gpu-architecture=compute_";
    arch << compute_version;

    std::cout << "CUDA GPU info." << std::endl;
    std::cout << " Major compute capability : " << compute_version << std::endl;

    check_error(cuDeviceGetAttribute(&compute_version,
                                     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                     device),
                "cuDeviceGetAttribute");
    arch << compute_version;

    std::cout << " Minor compute capability : " << compute_version << std::endl;

    char device_name[100];
    check_error(cuDeviceGetName(device_name, 100, device), "cuDeviceGetName");
    std::cout << " Device name : " << device_name << std::endl;
    const std::string temp = arch.str();
    std::array<const char *, 8> options({
        temp.c_str(),
        "--relocatable-device-code=false",
        "--include-path=" CUDA_INCLUDE,
        "--include-path=" HEADER_DIR,
        "--extra-device-vectorization",
        "--device-as-default-execution-space",
        /* ... */
    });

    if (nvrtcCompileProgram(kernel_program, options.size(), options.data())) {
        size_t log_size;
        check_nvrtc_error(nvrtcGetProgramLogSize(kernel_program, &log_size),
                          "nvrtcGetProgramLogSize");
        char *log = static_cast<char *> (malloc(log_size));
        check_nvrtc_error(nvrtcGetProgramLog(kernel_program, log),
                          "nvrtcGetProgramLog");
        std::cout << log << std::endl;
        std::cout << kernel_source << std::endl;
    check_error(cuDeviceGetAttribute(&compute_version,
                                     CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
                                     device),
                "cuDeviceGetAttribute");
    std::cout << " Managed Memory : " << compute_version << std::endl;
    size_t ptx_size;
    check_nvrtc_error(nvrtcGetPTXSize(kernel_program, &ptx_size),
                      "nvrtcGetPTXSize");
    char *ptx = static_cast<char *> (malloc(ptx_size));
    check_nvrtc_error(nvrtcGetPTX(kernel_program, ptx),
                      "nvrtcGetPTX");
    check_nvrtc_error(nvrtcDestroyProgram(&kernel_program),
                      "nvrtcDestroyProgram");
    std::array<CUjit_option, 3> module_options = {
        CU_JIT_MAX_REGISTERS,
        /* ... */
        CU_JIT_POSITION_INDEPENDENT_CODE
    };
    std::array<void *, 3> module_values = {
        reinterpret_cast<void *> (MAX_REG),
        reinterpret_cast<void *> (1),
        reinterpret_cast<void *> (0)
    };

    check_error(cuModuleLoadDataEx(&module, ptx, module_options.size(),
                                   module_options.data(),
                                   module_values.data()),
                "cuModuleLoadDataEx");
// create_kernel_call: resolve the compiled kernel, set up managed buffers for
// inputs, outputs, and the random state (plus textures when enabled), and
// return a callable that launches it.
std::function<void(void)> create_kernel_call(const std::string kernel_name,
                                             graph::input_nodes<T, SAFE_MATH> inputs,
                                             graph::output_nodes<T, SAFE_MATH> outputs,
                                             graph::shared_random_state<T, SAFE_MATH> state,
                                             const size_t num_rays,
                                             const jit::texture1d_list &tex1d_list,
                                             const jit::texture2d_list &tex2d_list) {
    CUfunction function;
    check_error(cuModuleGetFunction(&function, module, kernel_name.c_str()),
                "cuModuleGetFunction");
    std::vector<void *> buffers;
    const size_t buffer_element_size = sizeof(T);

    for (auto &input : inputs) {
        if (!kernel_arguments.contains(input.get())) {
            kernel_arguments.try_emplace(input.get());
            check_error(cuMemAllocManaged(&kernel_arguments[input.get()],
                                          /* ... */
                                          CU_MEM_ATTACH_GLOBAL),
                        "cuMemAllocManaged");
            check_error(cuMemcpyHtoD(kernel_arguments[input.get()],
                                     /* ... */),
                        "cuMemcpyHtoD");
        }
        buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[input.get()]));
    }
    for (auto &output : outputs) {
        if (!kernel_arguments.contains(output.get())) {
            kernel_arguments.try_emplace(output.get());
            check_error(cuMemAllocManaged(&kernel_arguments[output.get()],
                                          /* ... */
                                          CU_MEM_ATTACH_GLOBAL),
                        "cuMemAllocManaged");
        }
        buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[output.get()]));
    }

    const size_t num_buffers = buffers.size();
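// The random state gets its own managed buffer, and a small device buffer
// holds the launch offset used by strided launches.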
    if (!kernel_arguments.contains(state.get())) {
        kernel_arguments.try_emplace(state.get());
        check_error(cuMemAllocManaged(&kernel_arguments[state.get()],
                                      state->get_size_bytes(),
                                      CU_MEM_ATTACH_GLOBAL),
                    "cuMemAllocManaged");
        check_error(cuMemAlloc(&offset_buffer, sizeof(uint32_t)), "cuMemAlloc");
        check_error(cuMemcpyHtoD(kernel_arguments[state.get()],
                                 /* ... */
                                 state->get_size_bytes()),
                    "cuMemcpyHtoD");
    }
    buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[state.get()]));
    buffers.push_back(reinterpret_cast<void *> (&offset_buffer));
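// With USE_CUDA_TEXTURES, 1D and 2D tables are copied into CUDA arrays and
// bound as texture objects that are passed as extra kernel arguments.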
#ifdef USE_CUDA_TEXTURES
    for (auto &[data, size] : tex1d_list) {
        if (!texture_arguments.contains(data)) {
            texture_arguments.try_emplace(data);
            CUDA_RESOURCE_DESC resource_desc;
            CUDA_TEXTURE_DESC texture_desc;
            CUDA_ARRAY_DESCRIPTOR array_desc;

            array_desc.Width = size;
            array_desc.Height = 1;

            memset(&resource_desc, 0, sizeof(CUDA_RESOURCE_DESC));
            memset(&texture_desc, 0, sizeof(CUDA_TEXTURE_DESC));

            resource_desc.resType = CU_RESOURCE_TYPE_ARRAY;
            texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_BORDER;
            texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_BORDER;
            texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_BORDER;

            if constexpr (jit::float_base<T>) {
                array_desc.Format = CU_AD_FORMAT_FLOAT;
                if constexpr (jit::complex_scalar<T>) {
                    array_desc.NumChannels = 2;
                } else {
                    array_desc.NumChannels = 1;
                }
            } else {
                array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
                if constexpr (jit::complex_scalar<T>) {
                    array_desc.NumChannels = 4;
                } else {
                    array_desc.NumChannels = 2;
                }
            }

            check_error(cuArrayCreate(&resource_desc.res.array.hArray, &array_desc),
                        "cuArrayCreate");
            check_error(cuMemcpyHtoA(resource_desc.res.array.hArray, 0, data,
                                     size*sizeof(float)*array_desc.NumChannels),
                        "cuMemcpyHtoA");
            check_error(cuTexObjectCreate(&texture_arguments[data],
                                          &resource_desc, &texture_desc,
                                          NULL),
                        "cuTexObjectCreate");
        }
        buffers.push_back(reinterpret_cast<void *> (&texture_arguments[data]));
    }
    for (auto &[data, size] : tex2d_list) {
        if (!texture_arguments.contains(data)) {
            texture_arguments.try_emplace(data);
            CUDA_RESOURCE_DESC resource_desc;
            CUDA_TEXTURE_DESC texture_desc;
            CUDA_ARRAY_DESCRIPTOR array_desc;

            array_desc.Width = size[0];
            array_desc.Height = size[1];

            memset(&resource_desc, 0, sizeof(CUDA_RESOURCE_DESC));
            memset(&texture_desc, 0, sizeof(CUDA_TEXTURE_DESC));

            resource_desc.resType = CU_RESOURCE_TYPE_ARRAY;
            texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_BORDER;
            texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_BORDER;
            texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_BORDER;
            const size_t total = size[0]*size[1];

            if constexpr (jit::float_base<T>) {
                array_desc.Format = CU_AD_FORMAT_FLOAT;
                if constexpr (jit::complex_scalar<T>) {
                    array_desc.NumChannels = 2;
                } else {
                    array_desc.NumChannels = 1;
                }
            } else {
                array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
                if constexpr (jit::complex_scalar<T>) {
                    array_desc.NumChannels = 4;
                } else {
                    array_desc.NumChannels = 2;
                }
            }

            check_error(cuArrayCreate(&resource_desc.res.array.hArray, &array_desc),
                        "cuArrayCreate");

            CUDA_MEMCPY2D copy_desc;
            memset(&copy_desc, 0, sizeof(copy_desc));

            copy_desc.srcPitch = size[0]*sizeof(float)*array_desc.NumChannels;
            copy_desc.srcMemoryType = CU_MEMORYTYPE_HOST;
            copy_desc.srcHost = data;

            copy_desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
            copy_desc.dstArray = resource_desc.res.array.hArray;

            copy_desc.WidthInBytes = copy_desc.srcPitch;
            copy_desc.Height = size[0];

            check_error(cuMemcpy2D(&copy_desc), "cuMemcpy2D");
            check_error(cuTexObjectCreate(&texture_arguments[data],
                                          &resource_desc, &texture_desc,
                                          NULL),
                        "cuTexObjectCreate");
        }
        buffers.push_back(reinterpret_cast<void *> (&texture_arguments[data]));
    }
#endif
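// Choose the launch shape: block size from the kernel's maximum threads per
// block, with the occupancy-suggested block size reported for reference.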
    int value;
    check_error(cuFuncGetAttribute(&value, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                   function),
                "cuFuncGetAttribute");
    unsigned int threads_per_group = value;
    unsigned int thread_groups = num_rays/threads_per_group +
                                 (num_rays%threads_per_group ? 1 : 0);

    int min_grid;
    check_error(cuOccupancyMaxPotentialBlockSize(&min_grid, &value, function, 0, 0, 0),
                "cuOccupancyMaxPotentialBlockSize");

    std::cout << " Kernel name : " << kernel_name << std::endl;
    std::cout << " Threads per group : " << threads_per_group << std::endl;
    std::cout << " Number of groups : " << thread_groups << std::endl;
    std::cout << " Total problem size : " << threads_per_group*thread_groups << std::endl;
    std::cout << " Min grid size : " << min_grid << std::endl;
    std::cout << " Suggested Block size : " << value << std::endl;
    // Strided launch path: write the current ray offset to the device, then
    // launch one group of threads at a time.
    return [this, num_rays, function, threads_per_group, buffers] () mutable {
        for (uint32_t i = 0; i < num_rays; i += threads_per_group) {
            check_error_async(cuStreamWriteValue32(stream, offset_buffer, i,
                                                   CU_STREAM_WRITE_VALUE_DEFAULT),
                              "cuStreamWriteValue32");
            check_error_async(cuLaunchKernel(function,
                                             1, 1, 1,
                                             threads_per_group, 1, 1,
                                             0, stream,
                                             buffers.data(), NULL),
                              "cuLaunchKernel");
        }
    };
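// Single-launch path: one grid of thread_groups blocks covers every ray.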
    return [this, function, thread_groups, threads_per_group, buffers] () mutable {
        check_error_async(cuLaunchKernel(function, thread_groups, 1, 1,
                                         threads_per_group, 1, 1, 0, stream,
                                         buffers.data(), NULL),
                          "cuLaunchKernel");
    };
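// create_max_call: allocate the managed result buffer and return a callable
// that runs the supplied kernel call, reduces its argument buffer with the
// max_reduction kernel (one block of 1024 threads), and reads back the value.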
std::function<T(void)> create_max_call(graph::shared_leaf<T, SAFE_MATH> &argument,
                                       std::function<void(void)> run) {
    check_error(cuMemAllocManaged(&result_buffer, sizeof(T),
                                  CU_MEM_ATTACH_GLOBAL),
                "cuMemAllocManaged");

    std::vector<void *> buffers;
    buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[argument.get()]));
    buffers.push_back(reinterpret_cast<void *> (&result_buffer));

    CUfunction function;
    check_error(cuModuleGetFunction(&function, module, "max_reduction"),
                "cuModuleGetFunction");
    int min_grid;
    int value;
    check_error(cuOccupancyMaxPotentialBlockSize(&min_grid, &value, function, 0, 0, 0),
                "cuOccupancyMaxPotentialBlockSize");

    std::cout << " Kernel name : max_reduction" << std::endl;
    std::cout << " Min grid size : " << min_grid << std::endl;
    std::cout << " Suggested Block size : " << value << std::endl;
    return [this, function, run, buffers] () mutable {
        run();
        check_error_async(cuLaunchKernel(function, 1, 1, 1,
                                         1024, 1, 1, 0, stream,
                                         buffers.data(), NULL),
                          "cuLaunchKernel");
        wait();
        return reinterpret_cast<T *> (result_buffer)[0];
    };
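// wait: block the host until the stream and the context have drained.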
    check_error_async(cuStreamSynchronize(stream), "cuStreamSynchronize");
    check_error(cuCtxSynchronize(), "cuCtxSynchronize");
    // print_results: print the element at the given index from every output
    // buffer; complex values print as real and imaginary parts.
    for (auto &out : nodes) {
        const T temp = reinterpret_cast<T *> (kernel_arguments[out.get()])[index];
        if constexpr (jit::complex_scalar<T>) {
            std::cout << std::real(temp) << " " << std::imag(temp) << " ";
        } else {
            std::cout << temp << " ";
        }
    }
    std::cout << std::endl;
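// check_value: read a single element back from a node's device buffer.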
    return reinterpret_cast<T *> (kernel_arguments[node.get()])[index];
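// copy_to_device / copy_to_host: size the transfer from the allocation's
// address range and copy asynchronously on the stream.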
    size_t size;
    check_error(cuMemGetAddressRange(NULL, &size, kernel_arguments[node.get()]),
                "cuMemGetAddressRange");
    check_error_async(cuMemcpyHtoDAsync(kernel_arguments[node.get()], source, size, stream),
                      "cuMemcpyHtoDAsync");
    size_t size;
    check_error(cuMemGetAddressRange(NULL, &size, kernel_arguments[node.get()]),
                "cuMemGetAddressRange");
    check_error_async(cuMemcpyDtoHAsync(destination, kernel_arguments[node.get()], size, stream),
                      "cuMemcpyDtoHAsync");
    // create_header: emit typedefs, a small fixed-size array class, and the
    // common defines and includes shared by every generated kernel.
    source_buffer << "typedef unsigned int uint32_t;" << std::endl
                  << "typedef unsigned short uint16_t;" << std::endl
                  << "typedef short int16_t;" << std::endl
                  << "template<typename T, size_t S>" << std::endl
                  << "class array {" << std::endl
                  << "private:" << std::endl
                  << " T _buffer[S];" << std::endl
                  << "public:" << std::endl
                  << " T operator[] (const size_t index) const {" << std::endl
                  << " return _buffer[index];" << std::endl
                  << " }" << std::endl
                  << " T &operator[] (const size_t index) {" << std::endl
                  << " return _buffer[index];" << std::endl
                  << " }" << std::endl
                  << "};" << std::endl;

    source_buffer << "#define CUDA_DEVICE_CODE" << std::endl
                  << "#define M_PI " << M_PI << std::endl
                  << "#include <cuda/std/complex>" << std::endl
                  << "#include <special_functions.hpp>" << std::endl;
#ifdef USE_CUDA_TEXTURES
    source_buffer << "static __inline__ __device__ complex<float> to_cmp_float(float2 p) {"
                  /* ... */
    jit::add_type<T> (source_buffer);
    source_buffer << " (p.x, p.y);" << std::endl
                  /* ... */

    source_buffer << "static __inline__ __device__ complex<double> to_cmp_double(uint4 p) {"
                  /* ... */
    jit::add_type<T> (source_buffer);
    source_buffer << " (__hiloint2double(p.y, p.x), __hiloint2double(p.w, p.z));"
                  /* ... */

    source_buffer << "static __inline__ __device__ double to_double(uint2 p) {"
                  /* ... */
                  << " return __hiloint2double(p.y, p.x);"
                  /* ... */
// create_kernel_prefix: emit the extern "C" __global__ kernel signature (one
// pointer parameter per distinct input and output node, plus random state,
// offset, and textures) and open the body with the thread index and bounds check.
void create_kernel_prefix(std::ostringstream &source_buffer,
                          const std::string name,
                          graph::input_nodes<T, SAFE_MATH> &inputs,
                          graph::output_nodes<T, SAFE_MATH> &outputs,
                          graph::shared_random_state<T, SAFE_MATH> state,
                          const size_t size,
                          const std::vector<bool> &is_constant,
                          jit::register_map &registers,
                          const jit::register_usage &usage,
                          jit::texture1d_list &textures1d,
                          jit::texture2d_list &textures2d) {
    source_buffer << std::endl;
    source_buffer << "extern \"C\" __global__ void "
                  << name << "(" << std::endl;

    std::unordered_set<void *> used_args;

    source_buffer << " ";
    if (is_constant[0]) {
        source_buffer << "const ";
    }
    jit::add_type<T> (source_buffer);
    source_buffer << " * __restrict__ "
                  /* ... */
    used_args.insert(inputs[0].get());
    for (size_t i = 1, ie = inputs.size(); i < ie; i++) {
        if (!used_args.contains(inputs[i].get())) {
            source_buffer << ", // " << inputs[i - 1]->get_symbol()
#ifndef USE_INPUT_CACHE
                          << " used " << usage.at(inputs[i - 1].get())
#endif
                          << std::endl;
            source_buffer << " ";
            if (is_constant[i]) {
                source_buffer << "const ";
            }
            jit::add_type<T> (source_buffer);
            source_buffer << " * __restrict__ "
                          /* ... */
            used_args.insert(inputs[i].get());
        }
    }
    for (size_t i = 0, ie = outputs.size(); i < ie; i++) {
        if (i == 0) {
            source_buffer << ", // "
                          << inputs[inputs.size() - 1]->get_symbol();
#ifndef USE_INPUT_CACHE
            source_buffer << " used "
                          << usage.at(inputs[inputs.size() - 1].get());
#endif
            source_buffer << std::endl;
        } else {
            source_buffer << "," << std::endl;
        }
        if (!used_args.contains(outputs[i].get())) {
            source_buffer << " ";
            jit::add_type<T> (source_buffer);
            source_buffer << " * __restrict__ "
                          /* ... */
            used_args.insert(outputs[i].get());
        }
    }
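// Trailing kernel parameters: the random state, the launch offset, and any
// texture objects.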
    source_buffer << "," << std::endl
                  << " mt_state * __restrict__ "
                  /* ... */
                  << " const uint32_t * __restrict__ offset"
                  /* ... */
#ifdef USE_CUDA_TEXTURES
    for (auto &[key, value] : textures1d) {
        source_buffer << "," << std::endl;
        source_buffer << " cudaTextureObject_t "
                      /* ... */
    }
    for (auto &[key, value] : textures2d) {
        source_buffer << "," << std::endl;
        source_buffer << " cudaTextureObject_t "
                      /* ... */
    }
#endif
    source_buffer << ") {" << std::endl
                  << " const int index = blockIdx.x*blockDim.x + threadIdx.x;"
                  /* ... */
#ifdef USE_INPUT_CACHE
    source_buffer << " mt_state &" << registers[state.get()] << " = "
                  /* ... */
                  << " // used " << usage.at(state.get())
                  /* ... */
#else
    registers[state.get()] = jit::to_string('s', state.get()) + "[threadIdx.x]";
#endif
    source_buffer << " if (";
    /* ... */
    source_buffer << "offset[0] + ";
    /* ... */
    source_buffer << "index < " << size << ") {" << std::endl;
    for (auto &input : inputs) {
#ifdef USE_INPUT_CACHE
        if (usage.at(input.get())) {
            source_buffer << " const ";
            jit::add_type<T> (source_buffer);
            source_buffer << " " << registers[input.get()] << " = "
                          /* ... */
            source_buffer << "offset[0] + ";
            /* ... */
            source_buffer << "index]; // " << input->get_symbol()
                          << " used " << usage.at(input.get())
                          /* ... */
        }
#else
        registers[input.get()] = jit::to_string('v', input.get()) + "[index]";
#endif
    }
    // create_kernel_postfix: store each setter result back to its output
    // buffer; with SAFE_MATH, NaN values are replaced by zero before the store.
    std::unordered_set<void *> out_registers;
    for (auto &[out, in] : setters) {
        if (!out->is_match(in) &&
            !out_registers.contains(out.get())) {
            /* ... */
            source_buffer << "offset[0] + ";
            /* ... */
            source_buffer << "index] = ";
            if constexpr (SAFE_MATH) {
                if constexpr (jit::complex_scalar<T>) {
                    jit::add_type<T> (source_buffer);
                    source_buffer << " (";
                    source_buffer << "isnan(real(" << registers[a.get()]
                                  << ")) ? 0.0 : real("
                                  << registers[a.get()]
                                  << "), ";
                    source_buffer << "isnan(imag(" << registers[a.get()]
                                  << ")) ? 0.0 : imag("
                                  << registers[a.get()]
                                  << "));" << std::endl;
                } else {
                    source_buffer << "isnan(" << registers[a.get()]
                                  << ") ? 0.0 : " << registers[a.get()]
                                  << ";" << std::endl;
                }
            } else {
                source_buffer << registers[a.get()] << ";" << std::endl;
            }
            out_registers.insert(out.get());
        }
    }
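// Outputs not already written by a setter are stored back the same way.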
    for (auto &out : outputs) {
        if (/* ... */
            !out_registers.contains(out.get())) {
            /* ... */
            source_buffer << "offset[0] + ";
            /* ... */
            source_buffer << "index] = ";
            if constexpr (SAFE_MATH) {
                if constexpr (jit::complex_scalar<T>) {
                    jit::add_type<T> (source_buffer);
                    source_buffer << " (";
                    source_buffer << "isnan(real(" << registers[a.get()]
                                  << ")) ? 0.0 : real("
                                  << registers[a.get()]
                                  << "), ";
                    source_buffer << "isnan(imag(" << registers[a.get()]
                                  << ")) ? 0.0 : imag("
                                  << registers[a.get()]
                                  << "));" << std::endl;
                } else {
                    source_buffer << "isnan(" << registers[a.get()]
                                  << ") ? 0.0 : " << registers[a.get()]
                                  << ";" << std::endl;
                }
            } else {
                source_buffer << registers[a.get()] << ";" << std::endl;
            }
            out_registers.insert(out.get());
        }
    }
    source_buffer << " }" << std::endl
                  << "}" << std::endl;
    // create_reduction: emit the max_reduction kernel, which strides the input
    // with a single 1024-thread block and reduces within and across warps via
    // __shfl_down_sync.
    source_buffer << std::endl;
    source_buffer << "extern \"C\" __global__ void max_reduction(" << std::endl;
    source_buffer << " const ";
    jit::add_type<T> (source_buffer);
    source_buffer << " * __restrict__ input," << std::endl;
    source_buffer << " ";
    jit::add_type<T> (source_buffer);
    source_buffer << " * __restrict__ result) {" << std::endl;
    source_buffer << " const unsigned int i = threadIdx.x;" << std::endl;
    source_buffer << " const unsigned int j = threadIdx.x/32;" << std::endl;
    source_buffer << " const unsigned int k = threadIdx.x%32;" << std::endl;
    source_buffer << " if (i < " << size << ") {" << std::endl;
    source_buffer << " " << jit::type_to_string<T> () << " sub_max = ";
    if constexpr (jit::complex_scalar<T>) {
        source_buffer << "abs(input[i]);" << std::endl;
    } else {
        source_buffer << "input[i];" << std::endl;
    }
    source_buffer << " for (size_t index = i + 1024; index < " << size
                  << "; index += 1024) {" << std::endl;
    if constexpr (jit::complex_scalar<T>) {
        source_buffer << " sub_max = max(sub_max, abs(input[index]));" << std::endl;
    } else {
        source_buffer << " sub_max = max(sub_max, input[index]);" << std::endl;
    }
    source_buffer << " }" << std::endl;
    source_buffer << " __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl;
    source_buffer << " for (int index = 16; index > 0; index /= 2) {" << std::endl;
    source_buffer << " sub_max = max(sub_max, __shfl_down_sync(__activemask(), sub_max, index));" << std::endl;
    source_buffer << " }" << std::endl;
    source_buffer << " thread_max[j] = sub_max;" << std::endl;
    source_buffer << " __syncthreads();" << std::endl;
    source_buffer << " if (j == 0) {" << std::endl;
    source_buffer << " for (int index = 16; index > 0; index /= 2) {" << std::endl;
    source_buffer << " thread_max[k] = max(thread_max[k], __shfl_down_sync(__activemask(), thread_max[k], index));" << std::endl;
    source_buffer << " }" << std::endl;
    source_buffer << " *result = thread_max[0];" << std::endl;
    source_buffer << " }" << std::endl;
    source_buffer << " }" << std::endl;
    source_buffer << "}" << std::endl << std::endl;
    return reinterpret_cast<T *> (kernel_arguments[node.get()]);
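A minimal usage sketch (hypothetical; kernel_source, inputs, outputs, state,
num_rays, and the texture lists are produced elsewhere by the JIT framework,
and "my_kernel" is an illustrative name):

    gpu::cuda_context<float> context(0);                     // attach to device 0
    context.compile(kernel_source, {"my_kernel"});           // NVRTC compile + module load
    auto run = context.create_kernel_call("my_kernel", inputs, outputs, state,
                                          num_rays, tex1d_list, tex2d_list);
    run();           // queue the launch on the context's stream
    context.wait();  // block until the stream has completed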
Class representing a generic buffer.
Definition backend.hpp:29
Class representing a CUDA GPU context.
Definition cuda_context.hpp:73
T * get_buffer(graph::shared_leaf< T, SAFE_MATH > &node)
Get the buffer for a node.
Definition cuda_context.hpp:992
~cuda_context()
Cuda context destructor.
Definition cuda_context.hpp:153
int remaining_const_memory
Remaining constant memory in bytes.
Definition cuda_context.hpp:114
void copy_to_host(graph::shared_leaf< T, SAFE_MATH > node, T *destination)
Copy buffer contents to host.
Definition cuda_context.hpp:627
void create_kernel_postfix(std::ostringstream &source_buffer, graph::output_nodes< T, SAFE_MATH > &outputs, graph::map_nodes< T, SAFE_MATH > &setters, graph::shared_random_state< T, SAFE_MATH > state, jit::register_map &registers, jit::register_map &indices, const jit::register_usage &usage)
Create kernel postfix.
Definition cuda_context.hpp:851
void create_reduction(std::ostringstream &source_buffer, const size_t size)
Create reduction.
Definition cuda_context.hpp:944
std::function< T(void)> create_max_call(graph::shared_leaf< T, SAFE_MATH > &argument, std::function< void(void)> run)
Create a max compute kernel calling function.
Definition cuda_context.hpp:529
std::function< void(void)> create_kernel_call(const std::string kernel_name, graph::input_nodes< T, SAFE_MATH > inputs, graph::output_nodes< T, SAFE_MATH > outputs, graph::shared_random_state< T, SAFE_MATH > state, const size_t num_rays, const jit::texture1d_list &tex1d_list, const jit::texture2d_list &tex2d_list)
Create a kernel calling function.
Definition cuda_context.hpp:316
cuda_context(const size_t index)
Cuda context constructor.
Definition cuda_context.hpp:139
static size_t max_concurrency()
Get the maximum number of concurrent instances.
Definition cuda_context.hpp:121
void copy_to_device(graph::shared_leaf< T, SAFE_MATH > node, T *source)
Copy buffer contents to the device.
Definition cuda_context.hpp:614
static std::string device_type()
Device description.
Definition cuda_context.hpp:130
void print_results(const size_t index, const graph::output_nodes< T, SAFE_MATH > &nodes)
Print out the results.
Definition cuda_context.hpp:581
static constexpr size_t random_state_size
Size of random state needed.
Definition cuda_context.hpp:111
void create_kernel_prefix(std::ostringstream &source_buffer, const std::string name, graph::input_nodes< T, SAFE_MATH > &inputs, graph::output_nodes< T, SAFE_MATH > &outputs, graph::shared_random_state< T, SAFE_MATH > state, const size_t size, const std::vector< bool > &is_constant, jit::register_map &registers, const jit::register_usage &usage, jit::texture1d_list &textures1d, jit::texture2d_list &textures2d)
Create kernel prefix.
Definition cuda_context.hpp:702
T check_value(const size_t index, const graph::shared_leaf< T, SAFE_MATH > &node)
Check the value.
Definition cuda_context.hpp:602
void wait()
Hold the current thread until the stream has completed.
Definition cuda_context.hpp:570
void create_header(std::ostringstream &source_buffer)
Create the source header.
Definition cuda_context.hpp:639
void compile(const std::string kernel_source, std::vector< std::string > names, const bool add_reduction=false)
Compile the kernels.
Definition cuda_context.hpp:194
Complex scalar concept.
Definition register.hpp:24
Double base concept.
Definition register.hpp:42
float base concept.
Definition register.hpp:37
#define MAX_REG
Maximum number of registers to use.
Definition cuda_context.hpp:21
Name space for backend buffers.
Definition backend.hpp:19
Name space for GPU backends.
Definition cpu_context.hpp:51
std::shared_ptr< random_state_node< T, SAFE_MATH > > shared_random_state
Convenience type alias for shared random state nodes.
Definition random.hpp:272
std::vector< shared_variable< T, SAFE_MATH > > input_nodes
Convenience type alias for a vector of inputs.
Definition node.hpp:1730
shared_variable< T, SAFE_MATH > variable_cast(shared_leaf< T, SAFE_MATH > x)
Cast to a variable node.
Definition node.hpp:1746
std::shared_ptr< leaf_node< T, SAFE_MATH > > shared_leaf
Convenience type alias for shared leaf nodes.
Definition node.hpp:673
std::vector< std::pair< shared_leaf< T, SAFE_MATH >, shared_variable< T, SAFE_MATH > > > map_nodes
Convenience type alias for mapping end codes back to inputs.
Definition node.hpp:1734
std::vector< shared_leaf< T, SAFE_MATH > > output_nodes
Convenience type alias for a vector of output nodes.
Definition node.hpp:688
std::map< void *, size_t > texture1d_list
Type alias for indexing 1D textures.
Definition register.hpp:262
std::map< void *, std::array< size_t, 2 > > texture2d_list
Type alias for indexing 2D textures.
Definition register.hpp:264
std::map< void *, size_t > register_usage
Type alias for counting register usage.
Definition register.hpp:258
std::map< void *, std::string > register_map
Type alias for mapping node pointers to register names.
Definition register.hpp:256
std::string to_string(const char prefix, const NODE *pointer)
Convert a graph::leaf_node pointer to a string.
Definition register.hpp:245
Name space for output files.
Definition output.hpp:16
Random constants and distributions.