11#include <unordered_set>
30 static void check_nvrtc_error(nvrtcResult result,
31 const std::string &name) {
33 assert(result == NVRTC_SUCCESS && nvrtcGetErrorString(result));
43 static void check_error(CUresult result,
44 const std::string &name) {
47 cuGetErrorString(result, &error);
48 if (result != CUDA_SUCCESS) {
49 std::cerr << name <<
" " << std::string(error) << std::endl;
51 assert(result == CUDA_SUCCESS && error);
58 static CUresult cuda_init() {
59 const CUresult result = cuInit(0);
60 check_error(result,
"cuInit");
64 static const CUresult result = cuda_init();
72 template<jit::
float_scalar T,
bool SAFE_MATH=false>
82 std::map<graph::leaf_node<T, SAFE_MATH> *, CUdeviceptr> kernel_arguments;
83#ifdef USE_CUDA_TEXTURES
85 std::map<void *, CUtexObject> texture_arguments;
88 CUdeviceptr result_buffer;
90 CUdeviceptr offset_buffer;
100 void check_error_async(CUresult result,
101 const std::string &name) {
102 check_error(result, name);
104 std::string async_name = name +
"_async";
105 check_error(cuStreamSynchronize(stream), async_name);
123 check_error(cuDeviceGetCount(&count),
"cuDeviceGetCount");
139 cuda_context(
const size_t index) : result_buffer(0), module(0), offset_buffer(0) {
140 check_error(cuDeviceGet(&device, index),
"cuDeviceGet");
141 check_error(cuDevicePrimaryCtxRetain(&context, device),
"cuDevicePrimaryCtxRetain");
142 check_error(cuCtxSetCurrent(context),
"cuCtxSetCurrent");
143 check_error(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1),
"cuCtxSetCacheConfig");
144 check_error(cuStreamCreate(&stream, CU_STREAM_DEFAULT),
"cuStreamCreate");
146 CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
147 device),
"cuDeviceGetAttribute");
155 check_error(cuModuleUnload(module),
"cuModuleUnload");
159 for (
auto &[key, value] : kernel_arguments) {
160 check_error(cuMemFree(value),
"cuMemFree");
163#ifdef USE_CUDA_TEXTURES
164 for (
auto &[key, value] : texture_arguments) {
165 CUDA_RESOURCE_DESC resource;
166 check_error(cuTexObjectGetResourceDesc(&resource, value),
167 "cuTexObjectGetResourceDesc");
169 check_error(cuArrayDestroy(resource.res.array.hArray),
"cuArrayDestroy");
170 check_error(cuTexObjectDestroy(value),
"cuTexObjectDestroy");
175 check_error(cuMemFree(result_buffer),
"cuMemFree");
179 check_error(cuMemFree(offset_buffer),
"cuMemFree");
183 check_error(cuStreamDestroy(stream),
"cuStreamDestroy");
184 check_error(cuDevicePrimaryCtxRelease(device),
"cuDevicePrimaryCtxRelease");
195 std::vector<std::string> names,
196 const bool add_reduction=
false) {
198 names.push_back(
"max_reduction");
201 nvrtcProgram kernel_program;
202 check_nvrtc_error(nvrtcCreateProgram(&kernel_program,
203 kernel_source.c_str(),
204 NULL, 0, NULL, NULL),
205 "nvrtcCreateProgram");
207 for (std::string &name : names) {
208 check_nvrtc_error(nvrtcAddNameExpression(kernel_program,
210 "nvrtcAddNameExpression");
213 check_nvrtc_error(nvrtcAddNameExpression(kernel_program,
215 "nvrtcAddNameExpression");
218 std::ostringstream arch;
220 check_error(cuDeviceGetAttribute(&compute_version,
221 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
222 device),
"cuDeviceGetAttribute");
223 arch <<
"--gpu-architecture=compute_";
224 arch << compute_version;
226 std::cout <<
"CUDA GPU info." << std::endl;
227 std::cout <<
" Major compute capability : " << compute_version << std::endl;
230 check_error(cuDeviceGetAttribute(&compute_version,
231 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
232 device),
"cuDeviceGetAttribute");
233 arch << compute_version;
235 std::cout <<
" Minor compute capability : " << compute_version << std::endl;
238 char device_name[100];
239 check_error(cuDeviceGetName(device_name, 100, device),
"cuDeviceGetName");
241 std::cout <<
" Device name : " << device_name << std::endl;
244 const std::string temp = arch.str();
245 std::array<const char *, 8> options({
248 "--relocatable-device-code=false",
249 "--include-path=" CUDA_INCLUDE,
250 "--include-path=" HEADER_DIR,
251 "--extra-device-vectorization",
252 "--device-as-default-execution-space",
256 if (nvrtcCompileProgram(kernel_program, options.size(), options.data())) {
258 check_nvrtc_error(nvrtcGetProgramLogSize(kernel_program, &log_size),
259 "nvrtcGetProgramLogSize");
261 char *log =
static_cast<char *
> (malloc(log_size));
262 check_nvrtc_error(nvrtcGetProgramLog(kernel_program, log),
263 "nvrtcGetProgramLog");
264 std::cout << log << std::endl;
266 std::cout << kernel_source << std::endl;
269 check_error(cuDeviceGetAttribute(&compute_version,
270 CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
271 device),
"cuDeviceGetAttribute");
273 std::cout <<
" Managed Memory : " << compute_version << std::endl;
277 check_nvrtc_error(nvrtcGetPTXSize(kernel_program, &ptx_size),
280 char *ptx =
static_cast<char *
> (malloc(ptx_size));
281 check_nvrtc_error(nvrtcGetPTX(kernel_program, ptx),
"nvrtcGetPTX");
283 check_nvrtc_error(nvrtcDestroyProgram(&kernel_program),
284 "nvrtcDestroyProgram");
286 std::array<CUjit_option, 3> module_options = {
287 CU_JIT_MAX_REGISTERS,
289 CU_JIT_POSITION_INDEPENDENT_CODE
291 std::array<void *, 3> module_values = {
292 reinterpret_cast<void *
> (
MAX_REG),
293 reinterpret_cast<void *
> (1),
294 reinterpret_cast<void *
> (0)
297 check_error(cuModuleLoadDataEx(&module, ptx, module_options.size(),
298 module_options.data(),
299 module_values.data()),
"cuModuleLoadDataEx");
320 const size_t num_rays,
324 check_error(cuModuleGetFunction(&function, module, kernel_name.c_str()),
"cuModuleGetFunction");
326 std::vector<void *> buffers;
327 std::set<graph::leaf_node<float, SAFE_MATH> *> needed_buffers;
329 const size_t buffer_element_size =
sizeof(T);
330 for (
auto &input : inputs) {
331 if (!kernel_arguments.contains(input.get())) {
332 kernel_arguments.try_emplace(input.get());
334 check_error(cuMemAllocManaged(&kernel_arguments[input.get()],
336 CU_MEM_ATTACH_GLOBAL),
337 "cuMemAllocManaged");
338 check_error(cuMemcpyHtoD(kernel_arguments[input.get()],
342 buffers.push_back(
reinterpret_cast<void *
> (&kernel_arguments[input.get()]));
343 needed_buffers.insert(input.get());
345 if (!needed_buffers.contains(input.get())) {
346 buffers.push_back(kernel_arguments[input.get()]);
347 needed_buffers.insert(input.get());
350 for (
auto &
output : outputs) {
351 if (!kernel_arguments.contains(
output.get())) {
352 kernel_arguments.try_emplace(
output.get());
353 check_error(cuMemAllocManaged(&kernel_arguments[
output.get()],
355 CU_MEM_ATTACH_GLOBAL),
356 "cuMemAllocManaged");
357 buffers.push_back(
reinterpret_cast<void *
> (&kernel_arguments[
output.get()]));
358 needed_buffers.insert(
output.get());
360 if (!needed_buffers.contains(
output.get())) {
361 buffers.push_back(kernel_arguments[
output.get()]);
362 needed_buffers.insert(
output.get());
366 const size_t num_buffers = buffers.size();
368 if (!kernel_arguments.contains(state.get())) {
369 kernel_arguments.try_emplace(state.get());
370 check_error(cuMemAllocManaged(&kernel_arguments[state.get()],
371 state->get_size_bytes(),
372 CU_MEM_ATTACH_GLOBAL),
373 "cuMemAllocManaged");
374 check_error(cuMemAlloc(&offset_buffer,
sizeof(uint32_t)),
"cuMemAlloc");
375 check_error(cuMemcpyHtoD(kernel_arguments[state.get()],
377 state->get_size_bytes()),
380 buffers.push_back(
reinterpret_cast<void *
> (&kernel_arguments[state.get()]));
381 buffers.push_back(
reinterpret_cast<void *
> (&offset_buffer));
384#ifdef USE_CUDA_TEXTURES
385 for (
auto &[data, size] : tex1d_list) {
386 if (!texture_arguments.contains(data)) {
387 texture_arguments.try_emplace(data);
388 CUDA_RESOURCE_DESC resource_desc;
389 CUDA_TEXTURE_DESC texture_desc;
390 CUDA_ARRAY_DESCRIPTOR array_desc;
392 array_desc.Width = size;
393 array_desc.Height = 1;
395 memset(&resource_desc, 0,
sizeof(CUDA_RESOURCE_DESC));
396 memset(&texture_desc, 0,
sizeof(CUDA_TEXTURE_DESC));
398 resource_desc.resType = CU_RESOURCE_TYPE_ARRAY;
399 texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_BORDER;
400 texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_BORDER;
401 texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_BORDER;
403 array_desc.Format = CU_AD_FORMAT_FLOAT;
405 array_desc.NumChannels = 2;
407 array_desc.NumChannels = 1;
410 array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
412 array_desc.NumChannels = 4;
414 array_desc.NumChannels = 2;
417 check_error(cuArrayCreate(&resource_desc.res.array.hArray, &array_desc),
419 check_error(cuMemcpyHtoA(resource_desc.res.array.hArray, 0, data,
420 size*
sizeof(
float)*array_desc.NumChannels),
423 check_error(cuTexObjectCreate(&texture_arguments[data],
424 &resource_desc, &texture_desc,
426 "cuTexObjectCreate");
428 buffers.push_back(
reinterpret_cast<void *
> (&texture_arguments[data]));
430 for (
auto &[data, size] : tex2d_list) {
431 if (!texture_arguments.contains(data)) {
432 texture_arguments.try_emplace(data);
433 CUDA_RESOURCE_DESC resource_desc;
434 CUDA_TEXTURE_DESC texture_desc;
435 CUDA_ARRAY_DESCRIPTOR array_desc;
437 array_desc.Width = size[0];
438 array_desc.Height = size[1];
440 memset(&resource_desc, 0,
sizeof(CUDA_RESOURCE_DESC));
441 memset(&texture_desc, 0,
sizeof(CUDA_TEXTURE_DESC));
443 resource_desc.resType = CU_RESOURCE_TYPE_ARRAY;
444 texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_BORDER;
445 texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_BORDER;
446 texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_BORDER;
447 const size_t total = size[0]*size[1];
449 array_desc.Format = CU_AD_FORMAT_FLOAT;
451 array_desc.NumChannels = 2;
453 array_desc.NumChannels = 1;
456 array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
458 array_desc.NumChannels = 4;
460 array_desc.NumChannels = 2;
463 check_error(cuArrayCreate(&resource_desc.res.array.hArray, &array_desc),
466 CUDA_MEMCPY2D copy_desc;
467 memset(©_desc, 0,
sizeof(copy_desc));
469 copy_desc.srcPitch = size[0]*
sizeof(float)*array_desc.NumChannels;
470 copy_desc.srcMemoryType = CU_MEMORYTYPE_HOST;
471 copy_desc.srcHost = data;
473 copy_desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
474 copy_desc.dstArray = resource_desc.res.array.hArray;
476 copy_desc.WidthInBytes = copy_desc.srcPitch;
477 copy_desc.Height = size[0];
479 check_error(cuMemcpy2D(©_desc),
"cuMemcpy2D");
481 check_error(cuTexObjectCreate(&texture_arguments[data],
482 &resource_desc, &texture_desc,
484 "cuTexObjectCreate");
486 buffers.push_back(
reinterpret_cast<void *
> (&texture_arguments[data]));
491 check_error(cuFuncGetAttribute(&value, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
492 function),
"cuFuncGetAttribute");
493 unsigned int threads_per_group = value;
494 unsigned int thread_groups = num_rays/threads_per_group + (num_rays%threads_per_group ? 1 : 0);
497 check_error(cuOccupancyMaxPotentialBlockSize(&min_grid, &value, function, 0, 0, 0),
498 "cuOccupancyMaxPotentialBlockSize");
501 std::cout <<
" Kernel name : " << kernel_name << std::endl;
502 std::cout <<
" Threads per group : " << threads_per_group << std::endl;
503 std::cout <<
" Number of groups : " << thread_groups << std::endl;
504 std::cout <<
" Total problem size : " << threads_per_group*thread_groups << std::endl;
505 std::cout <<
" Min grid size : " << min_grid << std::endl;
506 std::cout <<
" Suggested Block size : " << value << std::endl;
510 return [
this, num_rays, function, threads_per_group, buffers] ()
mutable {
511 for (uint32_t i = 0; i < num_rays; i += threads_per_group) {
512 check_error_async(cuStreamWriteValue32(stream, offset_buffer, i,
513 CU_STREAM_WRITE_VALUE_DEFAULT),
514 "cuStreamWriteValue32");
515 check_error_async(cuLaunchKernel(function,
517 threads_per_group, 1, 1,
519 buffers.data(), NULL),
524 return [
this, function, thread_groups, threads_per_group, buffers] ()
mutable {
525 check_error_async(cuLaunchKernel(function, thread_groups, 1, 1,
526 threads_per_group, 1, 1, 0, stream,
527 buffers.data(), NULL),
541 std::function<
void(
void)> run) {
542 check_error(cuMemAllocManaged(&result_buffer,
sizeof(T),
543 CU_MEM_ATTACH_GLOBAL),
544 "cuMemAllocManaged");
546 std::vector<void *> buffers;
548 buffers.push_back(
reinterpret_cast<void *
> (&kernel_arguments[argument.get()]));
549 buffers.push_back(
reinterpret_cast<void *
> (&result_buffer));
552 check_error(cuModuleGetFunction(&function, module,
"max_reduction"),
553 "cuModuleGetFunction");
557 check_error(cuOccupancyMaxPotentialBlockSize(&min_grid, &value, function, 0, 0, 0),
558 "cuOccupancyMaxPotentialBlockSize");
561 std::cout <<
" Kernel name : max_reduction" << std::endl;
562 std::cout <<
" Min grid size : " << min_grid << std::endl;
563 std::cout <<
" Suggested Block size : " << value << std::endl;
566 return [
this, function, run, buffers] ()
mutable {
568 check_error_async(cuLaunchKernel(function, 1, 1, 1,
569 1024, 1, 1, 0, stream,
570 buffers.data(), NULL),
574 return reinterpret_cast<T *
> (result_buffer)[0];
582 check_error_async(cuStreamSynchronize(stream),
"cuStreamSynchronize");
583 check_error(cuCtxSynchronize(),
"cuCtxSynchronize");
595 for (
auto &out : nodes) {
596 const T temp =
reinterpret_cast<T *
> (kernel_arguments[out.get()])[index];
598 std::cout << std::real(temp) <<
" " << std::imag(temp) <<
" ";
600 std::cout << temp <<
" ";
603 std::cout << std::endl;
616 return reinterpret_cast<T *
> (kernel_arguments[node.get()])[index];
628 check_error(cuMemGetAddressRange(NULL, &size, kernel_arguments[node.get()]),
"cuMemGetAddressRange");
629 check_error_async(cuMemcpyHtoDAsync(kernel_arguments[node.get()], source, size, stream),
"cuMemcpyHtoDAsync");
641 check_error(cuMemGetAddressRange(NULL, &size, kernel_arguments[node.get()]),
"cuMemGetAddressRange");
642 check_error_async(cuMemcpyDtoHAsync(destination, kernel_arguments[node.get()], size, stream),
"cuMemcpyDtoHAsync");
651 source_buffer <<
"typedef unsigned int uint32_t;" << std::endl
652 <<
"typedef unsigned short uint16_t;" << std::endl
653 <<
"typedef short int16_t;" << std::endl
654 <<
"template<typename T, size_t S>" << std::endl
655 <<
"class array {" << std::endl
656 <<
"private:" << std::endl
657 <<
" T _buffer[S];" << std::endl
658 <<
"public:" << std::endl
659 <<
" T operator[] (const size_t index) const {" << std::endl
660 <<
" return _buffer[index];" << std::endl
662 <<
" T &operator[] (const size_t index) {" << std::endl
663 <<
" return _buffer[index];" << std::endl
665 <<
"};" << std::endl;
667 source_buffer <<
"#define CUDA_DEVICE_CODE" << std::endl
668 <<
"#define M_PI " << M_PI << std::endl
669 <<
"#include <cuda/std/complex>" << std::endl
670 <<
"#include <special_functions.hpp>" << std::endl;
671#ifdef USE_CUDA_TEXTURES
673 source_buffer <<
"static __inline__ __device__ complex<float> to_cmp_float(float2 p) {"
676 jit::add_type<T> (source_buffer);
677 source_buffer <<
" (p.x, p.y);" << std::endl
680 source_buffer <<
"static __inline__ __device__ complex<double> to_cmp_double(uint4 p) {"
683 jit::add_type<T> (source_buffer);
684 source_buffer <<
" (__hiloint2double(p.y, p.x), __hiloint2double(p.w, p.z));"
689 source_buffer <<
"static __inline__ __device__ double to_double(uint2 p) {"
691 <<
" return __hiloint2double(p.y, p.x);"
714 const std::string name,
719 const std::vector<bool> &is_constant,
724 source_buffer << std::endl;
725 source_buffer <<
"extern \"C\" __global__ void "
726 << name <<
"(" << std::endl;
728 std::unordered_set<void *> used_args;
730 source_buffer <<
" ";
731 if (is_constant[0]) {
732 source_buffer <<
"const ";
734 jit::add_type<T> (source_buffer);
735 source_buffer <<
" * __restrict__ "
737 used_args.insert(inputs[0].get());
739 for (
size_t i = 1, ie = inputs.size(); i < ie; i++) {
740 if (!used_args.contains(inputs[i].get())) {
741 source_buffer <<
", // " << inputs[i - 1]->get_symbol()
742#ifndef USE_INPUT_CACHE
744 <<
" used " << usage.at(inputs[i - 1].get())
748 source_buffer <<
" ";
749 if (is_constant[i]) {
750 source_buffer <<
"const ";
752 jit::add_type<T> (source_buffer);
753 source_buffer <<
" * __restrict__ "
755 used_args.insert(inputs[i].get());
758 for (
size_t i = 0, ie = outputs.size(); i < ie; i++) {
761 source_buffer <<
", // "
762 << inputs[inputs.size() - 1]->get_symbol();
763#ifndef USE_INPUT_CACHE
765 source_buffer <<
" used "
766 << usage.at(inputs[inputs.size() - 1].get());
769 source_buffer << std::endl;
772 source_buffer <<
"," << std::endl;
775 if (!used_args.contains(outputs[i].get())) {
776 source_buffer <<
" ";
777 jit::add_type<T> (source_buffer);
778 source_buffer <<
" * __restrict__ "
780 used_args.insert(outputs[i].get());
784 source_buffer <<
"," << std::endl
785 <<
" mt_state * __restrict__ "
788 <<
" const uint32_t * __restrict__ offset"
791#ifdef USE_CUDA_TEXTURES
792 for (
auto &[key, value] : textures1d) {
793 source_buffer <<
"," << std::endl;
794 source_buffer <<
" cudaTextureObject_t "
797 for (
auto &[key, value] : textures2d) {
798 source_buffer <<
"," << std::endl;
799 source_buffer <<
" cudaTextureObject_t "
803 source_buffer <<
") {" << std::endl
804 <<
" const int index = blockIdx.x*blockDim.x + threadIdx.x;"
807#ifdef USE_INPUT_CACHE
809 source_buffer <<
" mt_state &" << registers[state.get()] <<
" = "
813 <<
" // used " << usage.at(state.get())
817 registers[state.get()] =
jit::to_string(
's', state.get()) +
"[threadIdx.x]";
820 source_buffer <<
" if (";
822 source_buffer <<
"offset[0] + ";
824 source_buffer <<
"index < " << size <<
") {" << std::endl;
827 for (
auto &input : inputs) {
828#ifdef USE_INPUT_CACHE
829 if (usage.at(input.get())) {
831 source_buffer <<
" const ";
832 jit::add_type<T> (source_buffer);
833 source_buffer <<
" " << registers[input.get()] <<
" = "
837 source_buffer <<
"offset[0] + ";
839 source_buffer <<
"index]; // " << input->get_symbol()
841 <<
" used " << usage.at(input.get())
846 registers[input.get()] =
jit::to_string(
'v', input.get()) +
"[index]";
869 std::unordered_set<void *> out_registers;
870 for (
auto &[out, in] : setters) {
871 if (!out->is_match(in) &&
872 !out_registers.contains(out.get())) {
881 source_buffer <<
"offset[0] + ";
883 source_buffer <<
"index] = ";
884 if constexpr (SAFE_MATH) {
886 jit::add_type<T> (source_buffer);
887 source_buffer <<
" (";
888 source_buffer <<
"isnan(real(" << registers[a.get()]
889 <<
")) ? 0.0 : real("
890 << registers[a.get()]
892 source_buffer <<
"isnan(imag(" << registers[a.get()]
893 <<
")) ? 0.0 : imag("
894 << registers[a.get()]
895 <<
"));" << std::endl;
897 source_buffer <<
"isnan(" << registers[a.get()]
898 <<
") ? 0.0 : " << registers[a.get()]
902 source_buffer << registers[a.get()] <<
";" << std::endl;
904 out_registers.insert(out.get());
908 for (
auto &out : outputs) {
910 !out_registers.contains(out.get())) {
919 source_buffer <<
"offset[0] + ";
921 source_buffer <<
"index] = ";
922 if constexpr (SAFE_MATH) {
924 jit::add_type<T> (source_buffer);
925 source_buffer <<
" (";
926 source_buffer <<
"isnan(real(" << registers[a.get()]
927 <<
")) ? 0.0 : real("
928 << registers[a.get()]
930 source_buffer <<
"isnan(imag(" << registers[a.get()]
931 <<
")) ? 0.0 : imag("
932 << registers[a.get()]
933 <<
"));" << std::endl;
935 source_buffer <<
"isnan(" << registers[a.get()]
936 <<
") ? 0.0 : " << registers[a.get()]
940 source_buffer << registers[a.get()] <<
";" << std::endl;
942 out_registers.insert(out.get());
946 source_buffer <<
" }" << std::endl <<
"}" << std::endl;
957 source_buffer << std::endl;
958 source_buffer <<
"extern \"C\" __global__ void max_reduction(" << std::endl;
959 source_buffer <<
" const ";
960 jit::add_type<T> (source_buffer);
961 source_buffer <<
" * __restrict__ input," << std::endl;
962 source_buffer <<
" ";
963 jit::add_type<T> (source_buffer);
964 source_buffer <<
" * __restrict__ result) {" << std::endl;
965 source_buffer <<
" const unsigned int i = threadIdx.x;" << std::endl;
966 source_buffer <<
" const unsigned int j = threadIdx.x/32;" << std::endl;
967 source_buffer <<
" const unsigned int k = threadIdx.x%32;" << std::endl;
968 source_buffer <<
" if (i < " << size <<
") {" << std::endl;
969 source_buffer <<
" " << jit::type_to_string<T> () <<
" sub_max = ";
971 source_buffer <<
"abs(input[i]);" << std::endl;
973 source_buffer <<
"input[i];" << std::endl;
975 source_buffer <<
" for (size_t index = i + 1024; index < " << size <<
"; index += 1024) {" << std::endl;
977 source_buffer <<
" sub_max = max(sub_max, abs(input[index]));" << std::endl;
979 source_buffer <<
" sub_max = max(sub_max, input[index]);" << std::endl;
981 source_buffer <<
" }" << std::endl;
982 source_buffer <<
" __shared__ " << jit::type_to_string<T> () <<
" thread_max[32];" << std::endl;
983 source_buffer <<
" for (int index = 16; index > 0; index /= 2) {" << std::endl;
984 source_buffer <<
" sub_max = max(sub_max, __shfl_down_sync(__activemask(), sub_max, index));" << std::endl;
985 source_buffer <<
" }" << std::endl;
986 source_buffer <<
" thread_max[j] = sub_max;" << std::endl;
987 source_buffer <<
" __syncthreads();" << std::endl;
988 source_buffer <<
" if (j == 0) {" << std::endl;
989 source_buffer <<
" for (int index = 16; index > 0; index /= 2) {" << std::endl;
990 source_buffer <<
" thread_max[k] = max(thread_max[k], __shfl_down_sync(__activemask(), thread_max[k], index));" << std::endl;
991 source_buffer <<
" }" << std::endl;
992 source_buffer <<
" *result = thread_max[0];" << std::endl;
993 source_buffer <<
" }" << std::endl;
994 source_buffer <<
" }" << std::endl;
995 source_buffer <<
"}" << std::endl << std::endl;
1004 return reinterpret_cast<T *
> (kernel_arguments[node.get()]);
Class representing a generic buffer.
Definition backend.hpp:29
Class representing a cuda gpu context.
Definition cuda_context.hpp:73
T * get_buffer(graph::shared_leaf< T, SAFE_MATH > &node)
Get the buffer for a node.
Definition cuda_context.hpp:1003
~cuda_context()
Cuda context destructor.
Definition cuda_context.hpp:153
int remaining_const_memory
Remaining constant memory in bytes.
Definition cuda_context.hpp:114
void copy_to_host(graph::shared_leaf< T, SAFE_MATH > node, T *destination)
Copy buffer contents to host.
Definition cuda_context.hpp:638
void create_kernel_postfix(std::ostringstream &source_buffer, graph::output_nodes< T, SAFE_MATH > &outputs, graph::map_nodes< T, SAFE_MATH > &setters, graph::shared_random_state< T, SAFE_MATH > state, jit::register_map ®isters, jit::register_map &indices, const jit::register_usage &usage)
Create kernel postfix.
Definition cuda_context.hpp:862
void create_reduction(std::ostringstream &source_buffer, const size_t size)
Create reduction.
Definition cuda_context.hpp:955
std::function< T(void)> create_max_call(graph::shared_leaf< T, SAFE_MATH > &argument, std::function< void(void)> run)
Create a max compute kernel calling function.
Definition cuda_context.hpp:540
std::function< void(void)> create_kernel_call(const std::string kernel_name, graph::input_nodes< T, SAFE_MATH > inputs, graph::output_nodes< T, SAFE_MATH > outputs, graph::shared_random_state< T, SAFE_MATH > state, const size_t num_rays, const jit::texture1d_list &tex1d_list, const jit::texture2d_list &tex2d_list)
Create a kernel calling function.
Definition cuda_context.hpp:316
cuda_context(const size_t index)
Cuda context constructor.
Definition cuda_context.hpp:139
static size_t max_concurrency()
Get the maximum number of concurrent instances.
Definition cuda_context.hpp:121
void copy_to_device(graph::shared_leaf< T, SAFE_MATH > node, T *source)
Copy buffer contents to the device.
Definition cuda_context.hpp:625
static std::string device_type()
Device description.
Definition cuda_context.hpp:130
void print_results(const size_t index, const graph::output_nodes< T, SAFE_MATH > &nodes)
Print out the results.
Definition cuda_context.hpp:592
static constexpr size_t random_state_size
Size of random state needed.
Definition cuda_context.hpp:111
void create_kernel_prefix(std::ostringstream &source_buffer, const std::string name, graph::input_nodes< T, SAFE_MATH > &inputs, graph::output_nodes< T, SAFE_MATH > &outputs, graph::shared_random_state< T, SAFE_MATH > state, const size_t size, const std::vector< bool > &is_constant, jit::register_map ®isters, const jit::register_usage &usage, jit::texture1d_list &textures1d, jit::texture2d_list &textures2d)
Create kernel prefix.
Definition cuda_context.hpp:713
T check_value(const size_t index, const graph::shared_leaf< T, SAFE_MATH > &node)
Check the value.
Definition cuda_context.hpp:613
void wait()
Hold the current thread until the stream has completed.
Definition cuda_context.hpp:581
void create_header(std::ostringstream &source_buffer)
Create the source header.
Definition cuda_context.hpp:650
void compile(const std::string kernel_source, std::vector< std::string > names, const bool add_reduction=false)
Compile the kernels.
Definition cuda_context.hpp:194
Complex scalar concept.
Definition register.hpp:24
Double base concept.
Definition register.hpp:42
Float base concept.
Definition register.hpp:37
#define MAX_REG
Maximum number of registers to use.
Definition cuda_context.hpp:21
subroutine assert(test, message)
Assert check.
Definition f_binding_test.f90:38
Name space for backend buffers.
Definition backend.hpp:19
Name space for GPU backends.
Definition cpu_context.hpp:51
std::shared_ptr< random_state_node< T, SAFE_MATH > > shared_random_state
Convenience type alias for shared random state nodes.
Definition random.hpp:263
std::vector< shared_variable< T, SAFE_MATH > > input_nodes
Convenience type alias for a vector of inputs.
Definition node.hpp:1711
shared_variable< T, SAFE_MATH > variable_cast(shared_leaf< T, SAFE_MATH > x)
Cast to a variable node.
Definition node.hpp:1727
std::shared_ptr< leaf_node< T, SAFE_MATH > > shared_leaf
Convenience type alias for shared leaf nodes.
Definition node.hpp:676
std::vector< std::pair< shared_leaf< T, SAFE_MATH >, shared_variable< T, SAFE_MATH > > > map_nodes
Convenience type alias for mapping end codes back to inputs.
Definition node.hpp:1715
std::vector< shared_leaf< T, SAFE_MATH > > output_nodes
Convenience type alias for a vector of output nodes.
Definition node.hpp:691
std::map< void *, size_t > texture1d_list
Type alias for indexing 1D textures.
Definition register.hpp:263
std::map< void *, std::array< size_t, 2 > > texture2d_list
Type alias for indexing 2D textures.
Definition register.hpp:265
std::map< void *, size_t > register_usage
Type alias for counting register usage.
Definition register.hpp:259
std::map< void *, std::string > register_map
Type alias for mapping node pointers to register names.
Definition register.hpp:257
std::string to_string(const char prefix, const NODE *pointer)
Convert a graph::leaf_node pointer to a string.
Definition register.hpp:246
Name space for output files.
Definition output.hpp:16
Random constants and distributions.