Graph Framework
Loading...
Searching...
No Matches
cuda_context.hpp
Go to the documentation of this file.
1//------------------------------------------------------------------------------
6//------------------------------------------------------------------------------
7
8#ifndef cuda_context_h
9#define cuda_context_h
10
11#include <unordered_set>
12#include <array>
13#include <cstring>
14
15#include <cuda.h>
16#include <nvrtc.h>
17
18#include "random.hpp"
19
21#define MAX_REG 128
22
23namespace gpu {
24//------------------------------------------------------------------------------
29//------------------------------------------------------------------------------
30 static void check_nvrtc_error(nvrtcResult result,
31 const std::string &name) {
32#ifndef NDEBUG
33 assert(result == NVRTC_SUCCESS && nvrtcGetErrorString(result));
34#endif
35 }
36
37//------------------------------------------------------------------------------
42//------------------------------------------------------------------------------
43 static void check_error(CUresult result,
44 const std::string &name) {
45#ifndef NDEBUG
46 const char *error;
47 cuGetErrorString(result, &error);
48 if (result != CUDA_SUCCESS) {
49 std::cerr << name << " " << std::string(error) << std::endl;
50 }
51 assert(result == CUDA_SUCCESS && error);
52#endif
53 }
54
//------------------------------------------------------------------------------
///  @brief Initialize the CUDA driver API.
///
///  @returns The result of cuInit so it can seed the static initializer below.
//------------------------------------------------------------------------------
    static CUresult cuda_init() {
        const CUresult result = cuInit(0);
        check_error(result, "cuInit");
        return result;
    }
///  Forces cuInit to run exactly once at static-initialization time, before
///  any cuda_context can be constructed.
    static const CUresult result = cuda_init();
65
//------------------------------------------------------------------------------
///  @brief Class representing a CUDA GPU context.
///
///  Owns the device's primary context, a work stream, the JIT-compiled module
///  and the device buffers that back graph nodes.
//------------------------------------------------------------------------------
    template<jit::float_scalar T, bool SAFE_MATH=false>
// NOTE(review): the "class cuda_context {" header line appears to have been
// lost in extraction here — confirm against the original source.
    private:
///  Device handle.
        CUdevice device;
///  Primary context retained for the device.
        CUcontext context;
///  Module holding the JIT-compiled kernels (0 until compile() runs).
        CUmodule module;
///  Device buffers keyed by the graph leaf node they back.
        std::map<graph::leaf_node<T, SAFE_MATH> *, CUdeviceptr> kernel_arguments;
#ifdef USE_CUDA_TEXTURES
///  Texture objects keyed by the host data pointer they were built from.
        std::map<void *, CUtexObject> texture_arguments;
#endif
///  Device buffer receiving the max_reduction result.
        CUdeviceptr result_buffer;
///  Device buffer holding the ray offset used by random-state launches.
        CUdeviceptr offset_buffer;
///  Stream all kernels and async copies are issued on.
        CUstream stream;
//------------------------------------------------------------------------------
///  @brief Check a CUDA result and, in debug builds, surface asynchronous
///  failures as well.
///
///  @param[in] result Result code of the driver call.
///  @param[in] name   Name of the driver function that produced the result.
//------------------------------------------------------------------------------
        void check_error_async(CUresult result,
                               const std::string &name) {
            check_error(result, name);
#ifndef NDEBUG
//  A failing kernel only reports at the next synchronizing call, so force a
//  stream sync here (debug builds only) and tag it with the caller's name.
            std::string async_name = name + "_async";
            check_error(cuStreamSynchronize(stream), async_name);
#endif
        }
108
    public:
///  Size of the on-device random state pool; matches the 1024-thread blocks
///  used when launching kernels that consume random state.
        constexpr static size_t random_state_size = 1024;
// NOTE(review): an "int remaining_const_memory" member declaration (written by
// the constructor) appears to have been elided by extraction here — confirm.

//------------------------------------------------------------------------------
///  @brief Get the maximum number of concurrent instances.
///
///  @returns The number of CUDA devices visible to the driver.
//------------------------------------------------------------------------------
        static size_t max_concurrency() {
            int count;
            check_error(cuDeviceGetCount(&count), "cuDeviceGetCount");
            return count;
        }
126
//------------------------------------------------------------------------------
///  @brief Human readable description of this device backend.
///
///  @returns The fixed label "Cuda GPU".
//------------------------------------------------------------------------------
        static std::string device_type() {
            static const char device_label[] = "Cuda GPU";
            return std::string(device_label);
        }
133
134//------------------------------------------------------------------------------
138//------------------------------------------------------------------------------
139 cuda_context(const size_t index) : result_buffer(0), module(0), offset_buffer(0) {
140 check_error(cuDeviceGet(&device, index), "cuDeviceGet");
141 check_error(cuDevicePrimaryCtxRetain(&context, device), "cuDevicePrimaryCtxRetain");
142 check_error(cuCtxSetCurrent(context), "cuCtxSetCurrent");
143 check_error(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1), "cuCtxSetCacheConfig");
144 check_error(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "cuStreamCreate");
145 check_error(cuDeviceGetAttribute(&remaining_const_memory,
146 CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
147 device), "cuDeviceGetAttribute");
148 }
149
//------------------------------------------------------------------------------
///  @brief Cuda context destructor.
///
///  Releases, in order: the loaded module, all node buffers, texture objects
///  and their backing arrays, the reduction/offset buffers, the stream and the
///  primary context. (The "~cuda_context() {" signature line was elided by
///  extraction.)
//------------------------------------------------------------------------------
            if (module) {
                check_error(cuModuleUnload(module), "cuModuleUnload");
                module = 0;
            }

//  Free every device buffer created for graph nodes.
            for (auto &[key, value] : kernel_arguments) {
                check_error(cuMemFree(value), "cuMemFree");
            }

#ifdef USE_CUDA_TEXTURES
            for (auto &[key, value] : texture_arguments) {
//  Recover the backing array from the texture so both can be destroyed.
                CUDA_RESOURCE_DESC resource;
                check_error(cuTexObjectGetResourceDesc(&resource, value),
                            "cuTexObjectGetResourceDesc");

                check_error(cuArrayDestroy(resource.res.array.hArray), "cuArrayDestroy");
                check_error(cuTexObjectDestroy(value), "cuTexObjectDestroy");
            }
#endif

            if (result_buffer) {
                check_error(cuMemFree(result_buffer), "cuMemFree");
                result_buffer = 0;
            }
            if (offset_buffer) {
                check_error(cuMemFree(offset_buffer), "cuMemFree");
                offset_buffer = 0;
            }

            check_error(cuStreamDestroy(stream), "cuStreamDestroy");
            check_error(cuDevicePrimaryCtxRelease(device), "cuDevicePrimaryCtxRelease");
        }
186
187//------------------------------------------------------------------------------
193//------------------------------------------------------------------------------
194 void compile(const std::string kernel_source,
195 std::vector<std::string> names,
196 const bool add_reduction=false) {
197 if (add_reduction) {
198 names.push_back("max_reduction");
199 }
200
201 nvrtcProgram kernel_program;
202 check_nvrtc_error(nvrtcCreateProgram(&kernel_program,
203 kernel_source.c_str(),
204 NULL, 0, NULL, NULL),
205 "nvrtcCreateProgram");
206
207 for (std::string &name : names) {
208 check_nvrtc_error(nvrtcAddNameExpression(kernel_program,
209 name.c_str()),
210 "nvrtcAddNameExpression");
211 }
212 if (add_reduction) {
213 check_nvrtc_error(nvrtcAddNameExpression(kernel_program,
214 "max_reduction"),
215 "nvrtcAddNameExpression");
216 }
217
218 std::ostringstream arch;
219 int compute_version;
220 check_error(cuDeviceGetAttribute(&compute_version,
221 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
222 device), "cuDeviceGetAttribute");
223 arch << "--gpu-architecture=compute_";
224 arch << compute_version;
225 if (jit::verbose) {
226 std::cout << "CUDA GPU info." << std::endl;
227 std::cout << " Major compute capability : " << compute_version << std::endl;
228 }
229
230 check_error(cuDeviceGetAttribute(&compute_version,
231 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
232 device), "cuDeviceGetAttribute");
233 arch << compute_version;
234 if (jit::verbose) {
235 std::cout << " Minor compute capability : " << compute_version << std::endl;
236 }
237
238 char device_name[100];
239 check_error(cuDeviceGetName(device_name, 100, device), "cuDeviceGetName");
240 if (jit::verbose) {
241 std::cout << " Device name : " << device_name << std::endl;
242 }
243
244 const std::string temp = arch.str();
245 std::array<const char *, 8> options({
246 temp.c_str(),
247 "--std=c++17",
248 "--relocatable-device-code=false",
249 "--include-path=" CUDA_INCLUDE,
250 "--include-path=" HEADER_DIR,
251 "--extra-device-vectorization",
252 "--device-as-default-execution-space",
253 "--use_fast_math"
254 });
255
256 if (nvrtcCompileProgram(kernel_program, options.size(), options.data())) {
257 size_t log_size;
258 check_nvrtc_error(nvrtcGetProgramLogSize(kernel_program, &log_size),
259 "nvrtcGetProgramLogSize");
260
261 char *log = static_cast<char *> (malloc(log_size));
262 check_nvrtc_error(nvrtcGetProgramLog(kernel_program, log),
263 "nvrtcGetProgramLog");
264 std::cout << log << std::endl;
265 free(log);
266 std::cout << kernel_source << std::endl;
267 }
268
269 check_error(cuDeviceGetAttribute(&compute_version,
270 CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
271 device), "cuDeviceGetAttribute");
272 if (jit::verbose) {
273 std::cout << " Managed Memory : " << compute_version << std::endl;
274 }
275
276 size_t ptx_size;
277 check_nvrtc_error(nvrtcGetPTXSize(kernel_program, &ptx_size),
278 "nvrtcGetPTXSize");
279
280 char *ptx = static_cast<char *> (malloc(ptx_size));
281 check_nvrtc_error(nvrtcGetPTX(kernel_program, ptx), "nvrtcGetPTX");
282
283 check_nvrtc_error(nvrtcDestroyProgram(&kernel_program),
284 "nvrtcDestroyProgram");
285
286 std::array<CUjit_option, 3> module_options = {
287 CU_JIT_MAX_REGISTERS,
288 CU_JIT_LTO,
289 CU_JIT_POSITION_INDEPENDENT_CODE
290 };
291 std::array<void *, 3> module_values = {
292 reinterpret_cast<void *> (MAX_REG),
293 reinterpret_cast<void *> (1),
294 reinterpret_cast<void *> (0)
295 };
296
297 check_error(cuModuleLoadDataEx(&module, ptx, module_options.size(),
298 module_options.data(),
299 module_values.data()), "cuModuleLoadDataEx");
300
301 free(ptx);
302 }
303
//------------------------------------------------------------------------------
///  @brief Create a kernel calling function.
///
///  Lazily allocates managed device buffers for every input/output node (and
///  the random state), builds the kernel argument list, then returns a lambda
///  that launches the kernel. (The inputs/outputs/state parameter lines were
///  elided by extraction — per the class docs the full signature also takes
///  graph::input_nodes, graph::output_nodes and graph::shared_random_state.)
///
///  @param[in] kernel_name Name of the kernel entry point to launch.
///  @param[in] num_rays    Total problem size in rays.
///  @param[in] tex1d_list  1D texture data to bind (USE_CUDA_TEXTURES only).
///  @param[in] tex2d_list  2D texture data to bind (USE_CUDA_TEXTURES only).
///  @returns A void() callable that enqueues the kernel on the stream.
//------------------------------------------------------------------------------
        std::function<void(void)> create_kernel_call(const std::string kernel_name,
                                                     const size_t num_rays,
                                                     const jit::texture1d_list &tex1d_list,
                                                     const jit::texture2d_list &tex2d_list) {
            CUfunction function;
            check_error(cuModuleGetFunction(&function, module, kernel_name.c_str()), "cuModuleGetFunction");

            std::vector<void *> buffers;
//  NOTE(review): hard coded float here looks like it should be T — the set
//  only deduplicates pointers so it happens to work, but confirm.
            std::set<graph::leaf_node<float, SAFE_MATH> *> needed_buffers;

//  NOTE(review): buffer_element_size is never used below.
            const size_t buffer_element_size = sizeof(T);
            for (auto &input : inputs) {
                if (!kernel_arguments.contains(input.get())) {
//  First use of this node: allocate managed memory and upload its value.
                    kernel_arguments.try_emplace(input.get());
                    const backend::buffer<T> backend = input->evaluate();
                    check_error(cuMemAllocManaged(&kernel_arguments[input.get()],
                                                  backend.size()*sizeof(T),
                                                  CU_MEM_ATTACH_GLOBAL),
                                "cuMemAllocManaged");
                    check_error(cuMemcpyHtoD(kernel_arguments[input.get()],
                                             &backend[0],
                                             backend.size()*sizeof(T)),
                                "cuMemcpyHtoD");
                    buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[input.get()]));
                    needed_buffers.insert(input.get());
                }
                if (!needed_buffers.contains(input.get())) {
//  NOTE(review): this pushes the CUdeviceptr value rather than its address as
//  the branch above does — suspected missing '&' (or extraction damage);
//  verify against the original source.
                    buffers.push_back(kernel_arguments[input.get()]);
                    needed_buffers.insert(input.get());
                }
            }
            for (auto &output : outputs) {
                if (!kernel_arguments.contains(output.get())) {
//  Outputs are allocated but not initialized; the kernel writes them.
                    kernel_arguments.try_emplace(output.get());
                    check_error(cuMemAllocManaged(&kernel_arguments[output.get()],
                                                  num_rays*sizeof(T),
                                                  CU_MEM_ATTACH_GLOBAL),
                                "cuMemAllocManaged");
                    buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[output.get()]));
                    needed_buffers.insert(output.get());
                }
                if (!needed_buffers.contains(output.get())) {
//  NOTE(review): same missing-'&' concern as the input loop above.
                    buffers.push_back(kernel_arguments[output.get()]);
                    needed_buffers.insert(output.get());
                }
            }

            const size_t num_buffers = buffers.size();
            if (state.get()) {
//  Random state gets its own buffer plus a uint32_t launch-offset buffer.
                if (!kernel_arguments.contains(state.get())) {
                    kernel_arguments.try_emplace(state.get());
                    check_error(cuMemAllocManaged(&kernel_arguments[state.get()],
                                                  state->get_size_bytes(),
                                                  CU_MEM_ATTACH_GLOBAL),
                                "cuMemAllocManaged");
                    check_error(cuMemAlloc(&offset_buffer, sizeof(uint32_t)), "cuMemAlloc");
                    check_error(cuMemcpyHtoD(kernel_arguments[state.get()],
                                             state->data(),
                                             state->get_size_bytes()),
                                "cuMemcpyHtoD");
                }
                buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[state.get()]));
                buffers.push_back(reinterpret_cast<void *> (&offset_buffer));
            }

#ifdef USE_CUDA_TEXTURES
            for (auto &[data, size] : tex1d_list) {
                if (!texture_arguments.contains(data)) {
                    texture_arguments.try_emplace(data);
                    CUDA_RESOURCE_DESC resource_desc;
                    CUDA_TEXTURE_DESC texture_desc;
                    CUDA_ARRAY_DESCRIPTOR array_desc;

//  NOTE(review): array_desc fields are set before the memsets below zero the
//  other two descriptors; array_desc itself is never memset — intentional?
                    array_desc.Width = size;
                    array_desc.Height = 1;

                    memset(&resource_desc, 0, sizeof(CUDA_RESOURCE_DESC));
                    memset(&texture_desc, 0, sizeof(CUDA_TEXTURE_DESC));

                    resource_desc.resType = CU_RESOURCE_TYPE_ARRAY;
                    texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_BORDER;
                    texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_BORDER;
                    texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_BORDER;
//  Channel layout: complex doubles pack into 4 uint32 channels, doubles into
//  2, complex floats into 2 float channels, floats into 1.
                    if constexpr (jit::float_base<T>) {
                        array_desc.Format = CU_AD_FORMAT_FLOAT;
                        if constexpr (jit::complex_scalar<T>) {
                            array_desc.NumChannels = 2;
                        } else {
                            array_desc.NumChannels = 1;
                        }
                    } else {
                        array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
                        if constexpr (jit::complex_scalar<T>) {
                            array_desc.NumChannels = 4;
                        } else {
                            array_desc.NumChannels = 2;
                        }
                    }
                    check_error(cuArrayCreate(&resource_desc.res.array.hArray, &array_desc),
                                "cuArrayCreate");
                    check_error(cuMemcpyHtoA(resource_desc.res.array.hArray, 0, data,
                                             size*sizeof(float)*array_desc.NumChannels),
                                "cuMemcpyHtoA");

                    check_error(cuTexObjectCreate(&texture_arguments[data],
                                                  &resource_desc, &texture_desc,
                                                  NULL),
                                "cuTexObjectCreate");
                }
                buffers.push_back(reinterpret_cast<void *> (&texture_arguments[data]));
            }
            for (auto &[data, size] : tex2d_list) {
                if (!texture_arguments.contains(data)) {
                    texture_arguments.try_emplace(data);
                    CUDA_RESOURCE_DESC resource_desc;
                    CUDA_TEXTURE_DESC texture_desc;
                    CUDA_ARRAY_DESCRIPTOR array_desc;

                    array_desc.Width = size[0];
                    array_desc.Height = size[1];

                    memset(&resource_desc, 0, sizeof(CUDA_RESOURCE_DESC));
                    memset(&texture_desc, 0, sizeof(CUDA_TEXTURE_DESC));

                    resource_desc.resType = CU_RESOURCE_TYPE_ARRAY;
                    texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_BORDER;
                    texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_BORDER;
                    texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_BORDER;
//  NOTE(review): total is computed but never used.
                    const size_t total = size[0]*size[1];
                    if constexpr (jit::float_base<T>) {
                        array_desc.Format = CU_AD_FORMAT_FLOAT;
                        if constexpr (jit::complex_scalar<T>) {
                            array_desc.NumChannels = 2;
                        } else {
                            array_desc.NumChannels = 1;
                        }
                    } else {
                        array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
                        if constexpr (jit::complex_scalar<T>) {
                            array_desc.NumChannels = 4;
                        } else {
                            array_desc.NumChannels = 2;
                        }
                    }
                    check_error(cuArrayCreate(&resource_desc.res.array.hArray, &array_desc),
                                "cuArrayCreate");

                    CUDA_MEMCPY2D copy_desc;
                    memset(&copy_desc, 0, sizeof(copy_desc));

                    copy_desc.srcPitch = size[0]*sizeof(float)*array_desc.NumChannels;
                    copy_desc.srcMemoryType = CU_MEMORYTYPE_HOST;
                    copy_desc.srcHost = data;

                    copy_desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
                    copy_desc.dstArray = resource_desc.res.array.hArray;

                    copy_desc.WidthInBytes = copy_desc.srcPitch;
//  NOTE(review): Height is set from size[0] (width) rather than size[1] —
//  suspected bug; confirm against the original source.
                    copy_desc.Height = size[0];

                    check_error(cuMemcpy2D(&copy_desc), "cuMemcpy2D");

                    check_error(cuTexObjectCreate(&texture_arguments[data],
                                                  &resource_desc, &texture_desc,
                                                  NULL),
                                "cuTexObjectCreate");
                }
                buffers.push_back(reinterpret_cast<void *> (&texture_arguments[data]));
            }
#endif

            int value;
            check_error(cuFuncGetAttribute(&value, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                           function), "cuFuncGetAttribute");
            unsigned int threads_per_group = value;
//  Ceiling division so the tail rays get a partial block.
            unsigned int thread_groups = num_rays/threads_per_group + (num_rays%threads_per_group ? 1 : 0);

            int min_grid;
            check_error(cuOccupancyMaxPotentialBlockSize(&min_grid, &value, function, 0, 0, 0),
                        "cuOccupancyMaxPotentialBlockSize");

            if (jit::verbose) {
                std::cout << "  Kernel name : " << kernel_name << std::endl;
                std::cout << "  Threads per group : " << threads_per_group << std::endl;
                std::cout << "  Number of groups : " << thread_groups << std::endl;
                std::cout << "  Total problem size : " << threads_per_group*thread_groups << std::endl;
                std::cout << "  Min grid size : " << min_grid << std::endl;
                std::cout << "  Suggested Block size : " << value << std::endl;
            }

            if (state.get()) {
//  Random-state kernels launch one block at a time, writing the current ray
//  offset to offset_buffer before each launch so the kernel indexes correctly.
                return [this, num_rays, function, threads_per_group, buffers] () mutable {
                    for (uint32_t i = 0; i < num_rays; i += threads_per_group) {
                        check_error_async(cuStreamWriteValue32(stream, offset_buffer, i,
                                                               CU_STREAM_WRITE_VALUE_DEFAULT),
                                          "cuStreamWriteValue32");
                        check_error_async(cuLaunchKernel(function,
                                                         1, 1, 1,
                                                         threads_per_group, 1, 1,
                                                         0, stream,
                                                         buffers.data(), NULL),
                                          "cuLaunchKernel");
                    }
                };
            } else {
//  Stateless kernels launch the whole grid in one call.
                return [this, function, thread_groups, threads_per_group, buffers] () mutable {
                    check_error_async(cuLaunchKernel(function, thread_groups, 1, 1,
                                                     threads_per_group, 1, 1, 0, stream,
                                                     buffers.data(), NULL),
                                      "cuLaunchKernel");
                };
            }
        }
532
533//------------------------------------------------------------------------------
539//------------------------------------------------------------------------------
540 std::function<T(void)> create_max_call(graph::shared_leaf<T, SAFE_MATH> &argument,
541 std::function<void(void)> run) {
542 check_error(cuMemAllocManaged(&result_buffer, sizeof(T),
543 CU_MEM_ATTACH_GLOBAL),
544 "cuMemAllocManaged");
545
546 std::vector<void *> buffers;
547
548 buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[argument.get()]));
549 buffers.push_back(reinterpret_cast<void *> (&result_buffer));
550
551 CUfunction function;
552 check_error(cuModuleGetFunction(&function, module, "max_reduction"),
553 "cuModuleGetFunction");
554
555 int value;
556 int min_grid;
557 check_error(cuOccupancyMaxPotentialBlockSize(&min_grid, &value, function, 0, 0, 0),
558 "cuOccupancyMaxPotentialBlockSize");
559
560 if (jit::verbose) {
561 std::cout << " Kernel name : max_reduction" << std::endl;
562 std::cout << " Min grid size : " << min_grid << std::endl;
563 std::cout << " Suggested Block size : " << value << std::endl;
564 }
565
566 return [this, function, run, buffers] () mutable {
567 run();
568 check_error_async(cuLaunchKernel(function, 1, 1, 1,
569 1024, 1, 1, 0, stream,
570 buffers.data(), NULL),
571 "cuLaunchKernel");
572 wait();
573
574 return reinterpret_cast<T *> (result_buffer)[0];
575 };
576 }
577
//------------------------------------------------------------------------------
///  @brief Wait for all enqueued work to finish.
//------------------------------------------------------------------------------
        void wait() {
//  Drain the work stream first, then the whole context.
            check_error_async(cuStreamSynchronize(stream), "cuStreamSynchronize");
            check_error(cuCtxSynchronize(), "cuCtxSynchronize");
        }
585
//------------------------------------------------------------------------------
///  @brief Print out the results at @p index for each node.
///
///  (The second parameter line — the graph::output_nodes list, per the class
///  docs — was elided by extraction.)
///
///  @param[in] index Particle index to print the value of.
//------------------------------------------------------------------------------
        void print_results(const size_t index,
//  Managed buffers are host readable once the device is idle.
            wait();
            for (auto &out : nodes) {
                const T temp = reinterpret_cast<T *> (kernel_arguments[out.get()])[index];
                if constexpr (jit::complex_scalar<T>) {
                    std::cout << std::real(temp) << " " << std::imag(temp) << " ";
                } else {
                    std::cout << temp << " ";
                }
            }
            std::cout << std::endl;
        }
605
//------------------------------------------------------------------------------
///  @brief Read back a single value from a node's device buffer.
///
///  (The second parameter line — the graph node to sample, per the class
///  docs — was elided by extraction.)
///
///  @param[in] index Element index to read.
///  @returns The value at @p index after the device is idle.
//------------------------------------------------------------------------------
        T check_value(const size_t index,
            wait();
            return reinterpret_cast<T *> (kernel_arguments[node.get()])[index];
        }
618
//------------------------------------------------------------------------------
///  @brief Copy buffer contents to the device.
///
///  (The first signature line — the graph node argument — was elided by
///  extraction.)
///
///  @param[in] source Host memory to upload; must cover the node's allocation.
//------------------------------------------------------------------------------
                            T *source) {
            size_t size;
//  Recover the allocation size from the driver rather than tracking it here.
            check_error(cuMemGetAddressRange(NULL, &size, kernel_arguments[node.get()]), "cuMemGetAddressRange");
            check_error_async(cuMemcpyHtoDAsync(kernel_arguments[node.get()], source, size, stream), "cuMemcpyHtoDAsync");
        }
631
//------------------------------------------------------------------------------
///  @brief Copy buffer contents to the host.
///
///  (The first signature line — the graph node argument — was elided by
///  extraction.)
///
///  @param[out] destination Host memory to fill; must cover the allocation.
//------------------------------------------------------------------------------
                          T *destination) {
            size_t size;
//  Recover the allocation size from the driver rather than tracking it here.
            check_error(cuMemGetAddressRange(NULL, &size, kernel_arguments[node.get()]), "cuMemGetAddressRange");
            check_error_async(cuMemcpyDtoHAsync(destination, kernel_arguments[node.get()], size, stream), "cuMemcpyDtoHAsync");
        }
644
//------------------------------------------------------------------------------
///  @brief Write the common kernel-source preamble.
///
///  NVRTC compiles without host headers, so fixed width typedefs and a minimal
///  std::array stand-in are emitted directly into the kernel source.
///
///  @param[in,out] source_buffer Buffer the kernel source is assembled in.
//------------------------------------------------------------------------------
        void create_header(std::ostringstream &source_buffer) {
            source_buffer << "typedef unsigned int uint32_t;" << std::endl
                          << "typedef unsigned short uint16_t;" << std::endl
                          << "typedef short int16_t;" << std::endl
                          << "template<typename T, size_t S>" << std::endl
                          << "class array {" << std::endl
                          << "private:" << std::endl
                          << " T _buffer[S];" << std::endl
                          << "public:" << std::endl
                          << " T operator[] (const size_t index) const {" << std::endl
                          << " return _buffer[index];" << std::endl
                          << " }" << std::endl
                          << " T &operator[] (const size_t index) {" << std::endl
                          << " return _buffer[index];" << std::endl
                          << " }" << std::endl
                          << "};" << std::endl;
            if constexpr (jit::complex_scalar<T>) {
//  Complex types need cuda::std::complex and the project's special functions.
                source_buffer << "#define CUDA_DEVICE_CODE" << std::endl
                              << "#define M_PI " << M_PI << std::endl
                              << "#include <cuda/std/complex>" << std::endl
                              << "#include <special_functions.hpp>" << std::endl;
#ifdef USE_CUDA_TEXTURES
//  Texture fetches return packed vector types; emit helpers that reassemble
//  them into the scalar type used by the kernels.
                if constexpr (jit::float_base<T>) {
                    source_buffer << "static __inline__ __device__ complex<float> to_cmp_float(float2 p) {"
                                  << std::endl
                                  << " return ";
                    jit::add_type<T> (source_buffer);
                    source_buffer << " (p.x, p.y);" << std::endl
                                  << "}" << std::endl;
                } else {
                    source_buffer << "static __inline__ __device__ complex<double> to_cmp_double(uint4 p) {"
                                  << std::endl
                                  << " return ";
                    jit::add_type<T> (source_buffer);
                    source_buffer << " (__hiloint2double(p.y, p.x), __hiloint2double(p.w, p.z));"
                                  << std::endl
                                  << "}" << std::endl;
                }
//  Careful: this "} else if" closes the complex branch AND lives inside the
//  USE_CUDA_TEXTURES region together with the #endif below, so the braces
//  balance whether or not textures are enabled. Do not reindent casually.
            } else if constexpr (jit::double_base<T>) {
                source_buffer << "static __inline__ __device__ double to_double(uint2 p) {"
                              << std::endl
                              << " return __hiloint2double(p.y, p.x);"
                              << std::endl
                              << "}" << std::endl;
#endif
            }
        }
697
//------------------------------------------------------------------------------
///  @brief Create the kernel prefix: signature, argument list and input loads.
///
///  (The inputs/outputs/state parameter lines were elided by extraction — per
///  the class docs the full signature also takes graph::input_nodes,
///  graph::output_nodes and graph::shared_random_state.)
///
///  @param[in,out] source_buffer Buffer the kernel source is assembled in.
///  @param[in]     name          Kernel entry point name.
///  @param[in]     size          Problem size used for the bounds guard.
///  @param[in]     is_constant   Per-input flag: emit a const pointer.
///  @param[in,out] registers     Map from node to the register/expression name
///                               later code generation should use.
///  @param[in]     usage         Per-node use counts.
///  @param[in]     textures1d    1D textures to add as kernel arguments.
///  @param[in]     textures2d    2D textures to add as kernel arguments.
//------------------------------------------------------------------------------
        void create_kernel_prefix(std::ostringstream &source_buffer,
                                  const std::string name,
                                  const size_t size,
                                  const std::vector<bool> &is_constant,
                                  jit::register_map &registers,
                                  const jit::register_usage &usage,
                                  jit::texture1d_list &textures1d,
                                  jit::texture2d_list &textures2d) {
            source_buffer << std::endl;
            source_buffer << "extern \"C\" __global__ void "
                          << name << "(" << std::endl;

//  used_args deduplicates nodes that appear more than once in the argument
//  lists so each buffer becomes one kernel parameter.
            std::unordered_set<void *> used_args;
            if (inputs.size()) {
                source_buffer << " ";
                if (is_constant[0]) {
                    source_buffer << "const ";
                }
                jit::add_type<T> (source_buffer);
                source_buffer << " * __restrict__ "
                              << jit::to_string('v', inputs[0].get());
                used_args.insert(inputs[0].get());
            }
            for (size_t i = 1, ie = inputs.size(); i < ie; i++) {
                if (!used_args.contains(inputs[i].get())) {
//  The trailing comment documents the PREVIOUS argument, hence the i - 1:
//  the comma terminating argument i - 1 is emitted here.
                    source_buffer << ", // " << inputs[i - 1]->get_symbol()
#ifndef USE_INPUT_CACHE
#ifdef SHOW_USE_COUNT
                                  << " used " << usage.at(inputs[i - 1].get())
#endif
#endif
                                  << std::endl;
                    source_buffer << " ";
                    if (is_constant[i]) {
                        source_buffer << "const ";
                    }
                    jit::add_type<T> (source_buffer);
                    source_buffer << " * __restrict__ "
                                  << jit::to_string('v', inputs[i].get());
                    used_args.insert(inputs[i].get());
                }
            }
            for (size_t i = 0, ie = outputs.size(); i < ie; i++) {
                if (i == 0) {
                    if (inputs.size()) {
                        source_buffer << ", // "
                                      << inputs[inputs.size() - 1]->get_symbol();
#ifndef USE_INPUT_CACHE
#ifdef SHOW_USE_COUNT
                        source_buffer << " used "
                                      << usage.at(inputs[inputs.size() - 1].get());
#endif
#endif
                        source_buffer << std::endl;
                    }
                } else {
//  NOTE(review): the comma is emitted before the dedup check below, so a
//  duplicated output could leave a dangling comma — confirm inputs/outputs
//  are pre-deduplicated by the caller.
                    source_buffer << "," << std::endl;
                }

                if (!used_args.contains(outputs[i].get())) {
                    source_buffer << " ";
                    jit::add_type<T> (source_buffer);
                    source_buffer << " * __restrict__ "
                                  << jit::to_string('o', outputs[i].get());
                    used_args.insert(outputs[i].get());
                }
            }
            if (state.get()) {
//  Random-state kernels take the state array plus the launch offset.
                source_buffer << "," << std::endl
                              << " mt_state * __restrict__ "
                              << jit::to_string('s', state.get())
                              << "," << std::endl
                              << " const uint32_t * __restrict__ offset"
                              << std::endl;
            }
#ifdef USE_CUDA_TEXTURES
            for (auto &[key, value] : textures1d) {
                source_buffer << "," << std::endl;
                source_buffer << " cudaTextureObject_t "
                              << jit::to_string('a', key);
            }
            for (auto &[key, value] : textures2d) {
                source_buffer << "," << std::endl;
                source_buffer << " cudaTextureObject_t "
                              << jit::to_string('a', key);
            }
#endif
            source_buffer << ") {" << std::endl
                          << " const int index = blockIdx.x*blockDim.x + threadIdx.x;"
                          << std::endl;
            if (state.get()) {
#ifdef USE_INPUT_CACHE
//  Cache the per-thread random state in a reference register.
                registers[state.get()] = jit::to_string('r', state.get());
                source_buffer << " mt_state &" << registers[state.get()] << " = "
                              << jit::to_string('s', state.get())
                              << "[threadIdx.x];"
#ifdef SHOW_USE_COUNT
                              << " // used " << usage.at(state.get())
#endif
                              << std::endl;
#else
                registers[state.get()] = jit::to_string('s', state.get()) + "[threadIdx.x]";
#endif
            }
//  Bounds guard; state kernels index relative to the current launch offset.
            source_buffer << " if (";
            if (state.get()) {
                source_buffer << "offset[0] + ";
            }
            source_buffer << "index < " << size << ") {" << std::endl;


            for (auto &input : inputs) {
#ifdef USE_INPUT_CACHE
//  Load each used input once into a const register; unused inputs are skipped.
                if (usage.at(input.get())) {
                    registers[input.get()] = jit::to_string('r', input.get());
                    source_buffer << " const ";
                    jit::add_type<T> (source_buffer);
                    source_buffer << " " << registers[input.get()] << " = "
                                  << jit::to_string('v', input.get())
                                  << "[";
                    if (state.get()) {
                        source_buffer << "offset[0] + ";
                    }
                    source_buffer << "index]; // " << input->get_symbol()
#ifdef SHOW_USE_COUNT
                                  << " used " << usage.at(input.get())
#endif
                                  << std::endl;
                }
#else
                registers[input.get()] = jit::to_string('v', input.get()) + "[index]";
#endif
            }
        }
850
//------------------------------------------------------------------------------
///  @brief Create the kernel postfix: output stores and closing braces.
///
///  (The outputs/setters/state parameter lines were elided by extraction —
///  per the class docs the full signature also takes graph::output_nodes,
///  graph::map_nodes and graph::shared_random_state.)
///
///  @param[in,out] source_buffer Buffer the kernel source is assembled in.
///  @param[in,out] registers     Map from node to register name.
///  @param[in,out] indices       Map of index expressions.
///  @param[in]     usage         Per-node use counts.
//------------------------------------------------------------------------------
        void create_kernel_postfix(std::ostringstream &source_buffer,
                                   jit::register_map &registers,
                                   jit::register_map &indices,
                                   const jit::register_usage &usage) {
//  out_registers prevents emitting the same store twice.
            std::unordered_set<void *> out_registers;
//  First the setter pairs: store each computed expression into its target
//  variable buffer unless the expression is already the target.
            for (auto &[out, in] : setters) {
                if (!out->is_match(in) &&
                    !out_registers.contains(out.get())) {
                    graph::shared_leaf<T, SAFE_MATH> a = out->compile(source_buffer,
                                                                     registers,
                                                                     indices,
                                                                     usage);
                    source_buffer << " "
                                  << jit::to_string('v', in.get())
                                  << "[";
                    if (state.get()) {
                        source_buffer << "offset[0] + ";
                    }
                    source_buffer << "index] = ";
//  SAFE_MATH scrubs NaNs to zero (component wise for complex values).
                    if constexpr (SAFE_MATH) {
                        if constexpr (jit::complex_scalar<T>) {
                            jit::add_type<T> (source_buffer);
                            source_buffer << " (";
                            source_buffer << "isnan(real(" << registers[a.get()]
                                          << ")) ? 0.0 : real("
                                          << registers[a.get()]
                                          << "), ";
                            source_buffer << "isnan(imag(" << registers[a.get()]
                                          << ")) ? 0.0 : imag("
                                          << registers[a.get()]
                                          << "));" << std::endl;
                        } else {
                            source_buffer << "isnan(" << registers[a.get()]
                                          << ") ? 0.0 : " << registers[a.get()]
                                          << ";" << std::endl;
                        }
                    } else {
                        source_buffer << registers[a.get()] << ";" << std::endl;
                    }
                    out_registers.insert(out.get());
                }
            }

//  Then the plain outputs (non-variable nodes) using the same NaN handling.
            for (auto &out : outputs) {
                if (!graph::variable_cast(out).get() &&
                    !out_registers.contains(out.get())) {
                    graph::shared_leaf<T, SAFE_MATH> a = out->compile(source_buffer,
                                                                     registers,
                                                                     indices,
                                                                     usage);
                    source_buffer << " "
                                  << jit::to_string('o', out.get())
                                  << "[";
                    if (state.get()) {
                        source_buffer << "offset[0] + ";
                    }
                    source_buffer << "index] = ";
                    if constexpr (SAFE_MATH) {
                        if constexpr (jit::complex_scalar<T>) {
                            jit::add_type<T> (source_buffer);
                            source_buffer << " (";
                            source_buffer << "isnan(real(" << registers[a.get()]
                                          << ")) ? 0.0 : real("
                                          << registers[a.get()]
                                          << "), ";
                            source_buffer << "isnan(imag(" << registers[a.get()]
                                          << ")) ? 0.0 : imag("
                                          << registers[a.get()]
                                          << "));" << std::endl;
                        } else {
                            source_buffer << "isnan(" << registers[a.get()]
                                          << ") ? 0.0 : " << registers[a.get()]
                                          << ";" << std::endl;
                        }
                    } else {
                        source_buffer << registers[a.get()] << ";" << std::endl;
                    }
                    out_registers.insert(out.get());
                }
            }

//  Close the bounds guard and the kernel body.
            source_buffer << " }" << std::endl << "}" << std::endl;
        }
948
949//------------------------------------------------------------------------------
954//------------------------------------------------------------------------------
955 void create_reduction(std::ostringstream &source_buffer,
956 const size_t size) {
957 source_buffer << std::endl;
958 source_buffer << "extern \"C\" __global__ void max_reduction(" << std::endl;
959 source_buffer << " const ";
960 jit::add_type<T> (source_buffer);
961 source_buffer << " * __restrict__ input," << std::endl;
962 source_buffer << " ";
963 jit::add_type<T> (source_buffer);
964 source_buffer << " * __restrict__ result) {" << std::endl;
965 source_buffer << " const unsigned int i = threadIdx.x;" << std::endl;
966 source_buffer << " const unsigned int j = threadIdx.x/32;" << std::endl;
967 source_buffer << " const unsigned int k = threadIdx.x%32;" << std::endl;
968 source_buffer << " if (i < " << size << ") {" << std::endl;
969 source_buffer << " " << jit::type_to_string<T> () << " sub_max = ";
970 if constexpr (jit::complex_scalar<T>) {
971 source_buffer << "abs(input[i]);" << std::endl;
972 } else {
973 source_buffer << "input[i];" << std::endl;
974 }
975 source_buffer << " for (size_t index = i + 1024; index < " << size <<"; index += 1024) {" << std::endl;
976 if constexpr (jit::complex_scalar<T>) {
977 source_buffer << " sub_max = max(sub_max, abs(input[index]));" << std::endl;
978 } else {
979 source_buffer << " sub_max = max(sub_max, input[index]);" << std::endl;
980 }
981 source_buffer << " }" << std::endl;
982 source_buffer << " __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl;
983 source_buffer << " for (int index = 16; index > 0; index /= 2) {" << std::endl;
984 source_buffer << " sub_max = max(sub_max, __shfl_down_sync(__activemask(), sub_max, index));" << std::endl;
985 source_buffer << " }" << std::endl;
986 source_buffer << " thread_max[j] = sub_max;" << std::endl;
987 source_buffer << " __syncthreads();" << std::endl;
988 source_buffer << " if (j == 0) {" << std::endl;
989 source_buffer << " for (int index = 16; index > 0; index /= 2) {" << std::endl;
990 source_buffer << " thread_max[k] = max(thread_max[k], __shfl_down_sync(__activemask(), thread_max[k], index));" << std::endl;
991 source_buffer << " }" << std::endl;
992 source_buffer << " *result = thread_max[0];" << std::endl;
993 source_buffer << " }" << std::endl;
994 source_buffer << " }" << std::endl;
995 source_buffer << "}" << std::endl << std::endl;
996 }
997
//------------------------------------------------------------------------------
///  @brief Get the host-visible pointer to a node's managed device buffer.
///
///  (The signature line — taking the graph node, per the class docs — was
///  elided by extraction.) Callers must synchronize before dereferencing.
//------------------------------------------------------------------------------
            return reinterpret_cast<T *> (kernel_arguments[node.get()]);
        }
1006 };
1007}
1008
1009#endif /* cuda_context_h */
Class representing a generic buffer.
Definition backend.hpp:29
Class representing a cuda gpu context.
Definition cuda_context.hpp:73
T * get_buffer(graph::shared_leaf< T, SAFE_MATH > &node)
Get the buffer for a node.
Definition cuda_context.hpp:1003
~cuda_context()
Cuda context destructor.
Definition cuda_context.hpp:153
int remaining_const_memory
Remaining constant memory in bytes.
Definition cuda_context.hpp:114
void copy_to_host(graph::shared_leaf< T, SAFE_MATH > node, T *destination)
Copy buffer contents to host.
Definition cuda_context.hpp:638
void create_kernel_postfix(std::ostringstream &source_buffer, graph::output_nodes< T, SAFE_MATH > &outputs, graph::map_nodes< T, SAFE_MATH > &setters, graph::shared_random_state< T, SAFE_MATH > state, jit::register_map &registers, jit::register_map &indices, const jit::register_usage &usage)
Create kernel postfix.
Definition cuda_context.hpp:862
void create_reduction(std::ostringstream &source_buffer, const size_t size)
Create reduction.
Definition cuda_context.hpp:955
std::function< T(void)> create_max_call(graph::shared_leaf< T, SAFE_MATH > &argument, std::function< void(void)> run)
Create a max compute kernel calling function.
Definition cuda_context.hpp:540
std::function< void(void)> create_kernel_call(const std::string kernel_name, graph::input_nodes< T, SAFE_MATH > inputs, graph::output_nodes< T, SAFE_MATH > outputs, graph::shared_random_state< T, SAFE_MATH > state, const size_t num_rays, const jit::texture1d_list &tex1d_list, const jit::texture2d_list &tex2d_list)
Create a kernel calling function.
Definition cuda_context.hpp:316
cuda_context(const size_t index)
Cuda context constructor.
Definition cuda_context.hpp:139
static size_t max_concurrency()
Get the maximum number of concurrent instances.
Definition cuda_context.hpp:121
void copy_to_device(graph::shared_leaf< T, SAFE_MATH > node, T *source)
Copy buffer contents to the device.
Definition cuda_context.hpp:625
static std::string device_type()
Device description.
Definition cuda_context.hpp:130
void print_results(const size_t index, const graph::output_nodes< T, SAFE_MATH > &nodes)
Print out the results.
Definition cuda_context.hpp:592
static constexpr size_t random_state_size
Size of random state needed.
Definition cuda_context.hpp:111
void create_kernel_prefix(std::ostringstream &source_buffer, const std::string name, graph::input_nodes< T, SAFE_MATH > &inputs, graph::output_nodes< T, SAFE_MATH > &outputs, graph::shared_random_state< T, SAFE_MATH > state, const size_t size, const std::vector< bool > &is_constant, jit::register_map &registers, const jit::register_usage &usage, jit::texture1d_list &textures1d, jit::texture2d_list &textures2d)
Create kernel prefix.
Definition cuda_context.hpp:713
T check_value(const size_t index, const graph::shared_leaf< T, SAFE_MATH > &node)
Check the value.
Definition cuda_context.hpp:613
void wait()
Hold the current thread until the stream has completed.
Definition cuda_context.hpp:581
void create_header(std::ostringstream &source_buffer)
Create the source header.
Definition cuda_context.hpp:650
void compile(const std::string kernel_source, std::vector< std::string > names, const bool add_reduction=false)
Compile the kernels.
Definition cuda_context.hpp:194
Complex scalar concept.
Definition register.hpp:24
Double base concept.
Definition register.hpp:42
float base concept.
Definition register.hpp:37
#define MAX_REG
Maximum number of registers to use.
Definition cuda_context.hpp:21
subroutine assert(test, message)
Assert check.
Definition f_binding_test.f90:38
Name space for backend buffers.
Definition backend.hpp:19
Name space for GPU backends.
Definition cpu_context.hpp:51
std::shared_ptr< random_state_node< T, SAFE_MATH > > shared_random_state
Convenience type alias for shared sqrt nodes.
Definition random.hpp:263
std::vector< shared_variable< T, SAFE_MATH > > input_nodes
Convenience type alias for a vector of inputs.
Definition node.hpp:1711
shared_variable< T, SAFE_MATH > variable_cast(shared_leaf< T, SAFE_MATH > x)
Cast to a variable node.
Definition node.hpp:1727
std::shared_ptr< leaf_node< T, SAFE_MATH > > shared_leaf
Convenience type alias for shared leaf nodes.
Definition node.hpp:676
std::vector< std::pair< shared_leaf< T, SAFE_MATH >, shared_variable< T, SAFE_MATH > > > map_nodes
Convenience type alias for mapping end codes back to inputs.
Definition node.hpp:1715
std::vector< shared_leaf< T, SAFE_MATH > > output_nodes
Convenience type alias for a vector of output nodes.
Definition node.hpp:691
std::map< void *, size_t > texture1d_list
Type alias for indexing 1D textures.
Definition register.hpp:263
std::map< void *, std::array< size_t, 2 > > texture2d_list
Type alias for indexing 2D textures.
Definition register.hpp:265
std::map< void *, size_t > register_usage
Type alias for counting register usage.
Definition register.hpp:259
std::map< void *, std::string > register_map
Type alias for mapping node pointers to register names.
Definition register.hpp:257
std::string to_string(const char prefix, const NODE *pointer)
Convert a graph::leaf_node pointer to a string.
Definition register.hpp:246
Name space for output files.
Definition output.hpp:16
Random constants and distributions.