Graph Framework
cuda_context.hpp
//------------------------------------------------------------------------------
///  @file cuda_context.hpp
///  @brief Defines the CUDA GPU context.
//------------------------------------------------------------------------------

#ifndef cuda_context_h
#define cuda_context_h

#include <array>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>

#include <cuda.h>
#include <nvrtc.h>

#include "random.hpp"

///  Maximum number of registers to use.
#define MAX_REG 128

namespace gpu {
//------------------------------------------------------------------------------
///  @brief Check the result of an NVRTC call.
///
///  @param[in] result NVRTC result code.
///  @param[in] name   Name of the call for error reporting.
//------------------------------------------------------------------------------
    static void check_nvrtc_error(nvrtcResult result,
                                  const std::string &name) {
#ifndef NDEBUG
        assert(result == NVRTC_SUCCESS && nvrtcGetErrorString(result));
#endif
    }

//------------------------------------------------------------------------------
///  @brief Check the result of a CUDA driver call.
///
///  @param[in] result CUDA result code.
///  @param[in] name   Name of the call for error reporting.
//------------------------------------------------------------------------------
    static void check_error(CUresult result,
                            const std::string &name) {
#ifndef NDEBUG
        const char *error;
        cuGetErrorString(result, &error);
        if (result != CUDA_SUCCESS) {
            std::cerr << name << " " << std::string(error) << std::endl;
        }
        assert(result == CUDA_SUCCESS && error);
#endif
    }
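//  These helpers wrap any call that returns a CUresult or nvrtcResult. A
//  minimal usage sketch (nvrtcVersion is just an illustrative call):
//
//      int major, minor;
//      check_nvrtc_error(nvrtcVersion(&major, &minor), "nvrtcVersion");
//      check_error(cuCtxSynchronize(), "cuCtxSynchronize");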

//------------------------------------------------------------------------------
///  @brief Initialize the CUDA driver library.
//------------------------------------------------------------------------------
    static CUresult cuda_init() {
        const CUresult result = cuInit(0);
        check_error(result, "cuInit");
        return result;
    }
///  Result of the one time CUDA initialization.
    static const CUresult result = cuda_init();

//------------------------------------------------------------------------------
///  @brief Class representing a CUDA GPU context.
///
///  @tparam T         Base type of the calculation.
///  @tparam SAFE_MATH Use safe math operations.
//------------------------------------------------------------------------------
    template<jit::float_scalar T, bool SAFE_MATH=false>
    class cuda_context {
    private:
///  The CUDA device.
        CUdevice device;
///  The CUDA context.
        CUcontext context;
///  The compiled kernel module.
        CUmodule module;
///  Map of graph nodes to device buffers used as kernel arguments.
        std::map<graph::leaf_node<T, SAFE_MATH> *, CUdeviceptr> kernel_arguments;
#ifdef USE_CUDA_TEXTURES
///  Map of host data pointers to texture objects.
        std::map<void *, CUtexObject> texture_arguments;
#endif
///  Device buffer holding the reduction result.
        CUdeviceptr result_buffer;
///  Device buffer holding the current ray offset.
        CUdeviceptr offset_buffer;
///  The CUDA stream used by this context.
        CUstream stream;

//------------------------------------------------------------------------------
///  @brief Check an asynchronous error.
///
///  In debug builds this synchronizes the stream so launch errors surface at
///  the call site.
///
///  @param[in] result CUDA result code.
///  @param[in] name   Name of the call for error reporting.
//------------------------------------------------------------------------------
        void check_error_async(CUresult result,
                               const std::string &name) {
            check_error(result, name);
#ifndef NDEBUG
            std::string async_name = name + "_async";
            check_error(cuStreamSynchronize(stream), async_name);
#endif
        }

    public:
///  Size of random state needed.
        constexpr static size_t random_state_size = 1024;

///  Remaining constant memory in bytes.
        int remaining_const_memory;

//------------------------------------------------------------------------------
///  @brief Get the maximum number of concurrent instances.
///
///  @returns The number of available CUDA devices.
//------------------------------------------------------------------------------
        static size_t max_concurrency() {
            int count;
            check_error(cuDeviceGetCount(&count), "cuDeviceGetCount");
            return count;
        }
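//  A minimal sketch of enumerating devices with max_concurrency; the float
//  instantiation is illustrative:
//
//      for (size_t i = 0, ie = gpu::cuda_context<float>::max_concurrency(); i < ie; i++) {
//          gpu::cuda_context<float> context(i);
//          std::cout << context.device_type() << " " << i << std::endl;
//      }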

//------------------------------------------------------------------------------
///  @brief Device description.
//------------------------------------------------------------------------------
        static std::string device_type() {
            return "Cuda GPU";
        }

//------------------------------------------------------------------------------
///  @brief CUDA context constructor.
///
///  @param[in] index Device index to use.
//------------------------------------------------------------------------------
        cuda_context(const size_t index) : module(0), result_buffer(0), offset_buffer(0) {
            check_error(cuDeviceGet(&device, index), "cuDeviceGet");
            check_error(cuDevicePrimaryCtxRetain(&context, device), "cuDevicePrimaryCtxRetain");
            check_error(cuCtxSetCurrent(context), "cuCtxSetCurrent");
            check_error(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_L1), "cuCtxSetCacheConfig");
            check_error(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "cuStreamCreate");
            check_error(cuDeviceGetAttribute(&remaining_const_memory,
                                             CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                                             device), "cuDeviceGetAttribute");
        }

//------------------------------------------------------------------------------
///  @brief CUDA context destructor.
//------------------------------------------------------------------------------
        ~cuda_context() {
            if (module) {
                check_error(cuModuleUnload(module), "cuModuleUnload");
                module = 0;
            }

            for (auto &[key, value] : kernel_arguments) {
                check_error(cuMemFree(value), "cuMemFree");
            }

#ifdef USE_CUDA_TEXTURES
            for (auto &[key, value] : texture_arguments) {
                CUDA_RESOURCE_DESC resource;
                check_error(cuTexObjectGetResourceDesc(&resource, value),
                            "cuTexObjectGetResourceDesc");

                check_error(cuArrayDestroy(resource.res.array.hArray), "cuArrayDestroy");
                check_error(cuTexObjectDestroy(value), "cuTexObjectDestroy");
            }
#endif

            if (result_buffer) {
                check_error(cuMemFree(result_buffer), "cuMemFree");
                result_buffer = 0;
            }
            if (offset_buffer) {
                check_error(cuMemFree(offset_buffer), "cuMemFree");
                offset_buffer = 0;
            }

            check_error(cuStreamDestroy(stream), "cuStreamDestroy");
            check_error(cuDevicePrimaryCtxRelease(device), "cuDevicePrimaryCtxRelease");
        }

//------------------------------------------------------------------------------
///  @brief Compile the kernels.
///
///  @param[in] kernel_source Source code of the kernels to compile.
///  @param[in] names         Names of the kernel entry points.
///  @param[in] add_reduction Also compile the max_reduction kernel.
//------------------------------------------------------------------------------
        void compile(const std::string kernel_source,
                     std::vector<std::string> names,
                     const bool add_reduction=false) {
            if (add_reduction) {
                names.push_back("max_reduction");
            }

            nvrtcProgram kernel_program;
            check_nvrtc_error(nvrtcCreateProgram(&kernel_program,
                                                 kernel_source.c_str(),
                                                 NULL, 0, NULL, NULL),
                              "nvrtcCreateProgram");

            for (std::string &name : names) {
                check_nvrtc_error(nvrtcAddNameExpression(kernel_program,
                                                         name.c_str()),
                                  "nvrtcAddNameExpression");
            }
            if (add_reduction) {
                check_nvrtc_error(nvrtcAddNameExpression(kernel_program,
                                                         "max_reduction"),
                                  "nvrtcAddNameExpression");
            }

            std::ostringstream arch;
            int compute_version;
            check_error(cuDeviceGetAttribute(&compute_version,
                                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                             device), "cuDeviceGetAttribute");
            arch << "--gpu-architecture=compute_";
            arch << compute_version;
            if (jit::verbose) {
                std::cout << "CUDA GPU info." << std::endl;
                std::cout << "  Major compute capability : " << compute_version << std::endl;
            }

            check_error(cuDeviceGetAttribute(&compute_version,
                                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                             device), "cuDeviceGetAttribute");
            arch << compute_version;
            if (jit::verbose) {
                std::cout << "  Minor compute capability : " << compute_version << std::endl;
            }

            char device_name[100];
            check_error(cuDeviceGetName(device_name, 100, device), "cuDeviceGetName");
            if (jit::verbose) {
                std::cout << "  Device name : " << device_name << std::endl;
            }

            const std::string temp = arch.str();
            std::array<const char *, 8> options({
                temp.c_str(),
                "--std=c++17",
                "--relocatable-device-code=false",
                "--include-path=" CUDA_INCLUDE,
                "--include-path=" HEADER_DIR,
                "--extra-device-vectorization",
                "--device-as-default-execution-space",
                "--use_fast_math"
            });

            if (nvrtcCompileProgram(kernel_program, options.size(), options.data())) {
                size_t log_size;
                check_nvrtc_error(nvrtcGetProgramLogSize(kernel_program, &log_size),
                                  "nvrtcGetProgramLogSize");

                char *log = static_cast<char *> (malloc(log_size));
                check_nvrtc_error(nvrtcGetProgramLog(kernel_program, log),
                                  "nvrtcGetProgramLog");
                std::cout << log << std::endl;
                free(log);
                std::cout << kernel_source << std::endl;
            }

            check_error(cuDeviceGetAttribute(&compute_version,
                                             CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
                                             device), "cuDeviceGetAttribute");
            if (jit::verbose) {
                std::cout << "  Managed Memory : " << compute_version << std::endl;
            }

            size_t ptx_size;
            check_nvrtc_error(nvrtcGetPTXSize(kernel_program, &ptx_size),
                              "nvrtcGetPTXSize");

            char *ptx = static_cast<char *> (malloc(ptx_size));
            check_nvrtc_error(nvrtcGetPTX(kernel_program, ptx), "nvrtcGetPTX");

            check_nvrtc_error(nvrtcDestroyProgram(&kernel_program),
                              "nvrtcDestroyProgram");

            std::array<CUjit_option, 3> module_options = {
                CU_JIT_MAX_REGISTERS,
                CU_JIT_LTO,
                CU_JIT_POSITION_INDEPENDENT_CODE
            };
            std::array<void *, 3> module_values = {
                reinterpret_cast<void *> (MAX_REG),
                reinterpret_cast<void *> (1),
                reinterpret_cast<void *> (0)
            };

            check_error(cuModuleLoadDataEx(&module, ptx, module_options.size(),
                                           module_options.data(),
                                           module_values.data()), "cuModuleLoadDataEx");

            free(ptx);
        }
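//  A sketch of the compile step, assuming the kernel source was assembled
//  with create_header/create_kernel_prefix/create_kernel_postfix below;
//  "my_kernel" is a hypothetical entry point name:
//
//      std::ostringstream source;
//      context.create_header(source);
//      // ... append kernel bodies ...
//      context.compile(source.str(), {"my_kernel"}, true);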

//------------------------------------------------------------------------------
///  @brief Create a kernel calling function.
///
///  @param[in] kernel_name Name of the kernel entry point.
///  @param[in] inputs      Input nodes of the graph.
///  @param[in] outputs     Output nodes of the graph.
///  @param[in] state       Random state node.
///  @param[in] num_rays    Number of rays to trace.
///  @param[in] tex1d_list  List of 1D textures.
///  @param[in] tex2d_list  List of 2D textures.
///  @returns A lambda that launches the kernel.
//------------------------------------------------------------------------------
        std::function<void(void)> create_kernel_call(const std::string kernel_name,
                                                     graph::input_nodes<T, SAFE_MATH> inputs,
                                                     graph::output_nodes<T, SAFE_MATH> outputs,
                                                     graph::shared_random_state<T, SAFE_MATH> state,
                                                     const size_t num_rays,
                                                     const jit::texture1d_list &tex1d_list,
                                                     const jit::texture2d_list &tex2d_list) {
            CUfunction function;
            check_error(cuModuleGetFunction(&function, module, kernel_name.c_str()), "cuModuleGetFunction");

            std::vector<void *> buffers;

            const size_t buffer_element_size = sizeof(T);
            for (auto &input : inputs) {
                if (!kernel_arguments.contains(input.get())) {
                    kernel_arguments.try_emplace(input.get());
                    const backend::buffer<T> backend = input->evaluate();
                    check_error(cuMemAllocManaged(&kernel_arguments[input.get()],
                                                  backend.size()*sizeof(T),
                                                  CU_MEM_ATTACH_GLOBAL),
                                "cuMemAllocManaged");
                    check_error(cuMemcpyHtoD(kernel_arguments[input.get()],
                                             &backend[0],
                                             backend.size()*sizeof(T)),
                                "cuMemcpyHtoD");
                    buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[input.get()]));
                }
            }
            for (auto &output : outputs) {
                if (!kernel_arguments.contains(output.get())) {
                    kernel_arguments.try_emplace(output.get());
                    check_error(cuMemAllocManaged(&kernel_arguments[output.get()],
                                                  num_rays*sizeof(T),
                                                  CU_MEM_ATTACH_GLOBAL),
                                "cuMemAllocManaged");
                    buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[output.get()]));
                }
            }

            const size_t num_buffers = buffers.size();
            if (state.get()) {
                if (!kernel_arguments.contains(state.get())) {
                    kernel_arguments.try_emplace(state.get());
                    check_error(cuMemAllocManaged(&kernel_arguments[state.get()],
                                                  state->get_size_bytes(),
                                                  CU_MEM_ATTACH_GLOBAL),
                                "cuMemAllocManaged");
                    check_error(cuMemAlloc(&offset_buffer, sizeof(uint32_t)), "cuMemAlloc");
                    check_error(cuMemcpyHtoD(kernel_arguments[state.get()],
                                             state->data(),
                                             state->get_size_bytes()),
                                "cuMemcpyHtoD");
                }
                buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[state.get()]));
                buffers.push_back(reinterpret_cast<void *> (&offset_buffer));
            }

#ifdef USE_CUDA_TEXTURES
            for (auto &[data, size] : tex1d_list) {
                if (!texture_arguments.contains(data)) {
                    texture_arguments.try_emplace(data);
                    CUDA_RESOURCE_DESC resource_desc;
                    CUDA_TEXTURE_DESC texture_desc;
                    CUDA_ARRAY_DESCRIPTOR array_desc;

                    array_desc.Width = size;
                    array_desc.Height = 1;

                    memset(&resource_desc, 0, sizeof(CUDA_RESOURCE_DESC));
                    memset(&texture_desc, 0, sizeof(CUDA_TEXTURE_DESC));

                    resource_desc.resType = CU_RESOURCE_TYPE_ARRAY;
                    texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_BORDER;
                    texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_BORDER;
                    texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_BORDER;
                    if constexpr (jit::float_base<T>) {
                        array_desc.Format = CU_AD_FORMAT_FLOAT;
                        if constexpr (jit::complex_scalar<T>) {
                            array_desc.NumChannels = 2;
                        } else {
                            array_desc.NumChannels = 1;
                        }
                    } else {
                        array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
                        if constexpr (jit::complex_scalar<T>) {
                            array_desc.NumChannels = 4;
                        } else {
                            array_desc.NumChannels = 2;
                        }
                    }
                    check_error(cuArrayCreate(&resource_desc.res.array.hArray, &array_desc),
                                "cuArrayCreate");
                    check_error(cuMemcpyHtoA(resource_desc.res.array.hArray, 0, data,
                                             size*sizeof(float)*array_desc.NumChannels),
                                "cuMemcpyHtoA");

                    check_error(cuTexObjectCreate(&texture_arguments[data],
                                                  &resource_desc, &texture_desc,
                                                  NULL),
                                "cuTexObjectCreate");
                }
                buffers.push_back(reinterpret_cast<void *> (&texture_arguments[data]));
            }
            for (auto &[data, size] : tex2d_list) {
                if (!texture_arguments.contains(data)) {
                    texture_arguments.try_emplace(data);
                    CUDA_RESOURCE_DESC resource_desc;
                    CUDA_TEXTURE_DESC texture_desc;
                    CUDA_ARRAY_DESCRIPTOR array_desc;

                    array_desc.Width = size[0];
                    array_desc.Height = size[1];

                    memset(&resource_desc, 0, sizeof(CUDA_RESOURCE_DESC));
                    memset(&texture_desc, 0, sizeof(CUDA_TEXTURE_DESC));

                    resource_desc.resType = CU_RESOURCE_TYPE_ARRAY;
                    texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_BORDER;
                    texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_BORDER;
                    texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_BORDER;
                    const size_t total = size[0]*size[1];
                    if constexpr (jit::float_base<T>) {
                        array_desc.Format = CU_AD_FORMAT_FLOAT;
                        if constexpr (jit::complex_scalar<T>) {
                            array_desc.NumChannels = 2;
                        } else {
                            array_desc.NumChannels = 1;
                        }
                    } else {
                        array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
                        if constexpr (jit::complex_scalar<T>) {
                            array_desc.NumChannels = 4;
                        } else {
                            array_desc.NumChannels = 2;
                        }
                    }
                    check_error(cuArrayCreate(&resource_desc.res.array.hArray, &array_desc),
                                "cuArrayCreate");

                    CUDA_MEMCPY2D copy_desc;
                    memset(&copy_desc, 0, sizeof(copy_desc));

                    copy_desc.srcPitch = size[0]*sizeof(float)*array_desc.NumChannels;
                    copy_desc.srcMemoryType = CU_MEMORYTYPE_HOST;
                    copy_desc.srcHost = data;

                    copy_desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
                    copy_desc.dstArray = resource_desc.res.array.hArray;

                    copy_desc.WidthInBytes = copy_desc.srcPitch;
                    copy_desc.Height = size[1]; // Rows copied must match array_desc.Height.

                    check_error(cuMemcpy2D(&copy_desc), "cuMemcpy2D");

                    check_error(cuTexObjectCreate(&texture_arguments[data],
                                                  &resource_desc, &texture_desc,
                                                  NULL),
                                "cuTexObjectCreate");
                }
                buffers.push_back(reinterpret_cast<void *> (&texture_arguments[data]));
            }
#endif

            int value;
            check_error(cuFuncGetAttribute(&value, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                           function), "cuFuncGetAttribute");
            unsigned int threads_per_group = value;
            unsigned int thread_groups = num_rays/threads_per_group + (num_rays%threads_per_group ? 1 : 0);

            int min_grid;
            check_error(cuOccupancyMaxPotentialBlockSize(&min_grid, &value, function, 0, 0, 0),
                        "cuOccupancyMaxPotentialBlockSize");

            if (jit::verbose) {
                std::cout << "  Kernel name : " << kernel_name << std::endl;
                std::cout << "  Threads per group : " << threads_per_group << std::endl;
                std::cout << "  Number of groups : " << thread_groups << std::endl;
                std::cout << "  Total problem size : " << threads_per_group*thread_groups << std::endl;
                std::cout << "  Min grid size : " << min_grid << std::endl;
                std::cout << "  Suggested Block size : " << value << std::endl;
            }

            if (state.get()) {
                return [this, num_rays, function, threads_per_group, buffers] () mutable {
                    for (uint32_t i = 0; i < num_rays; i += threads_per_group) {
                        check_error_async(cuStreamWriteValue32(stream, offset_buffer, i,
                                                               CU_STREAM_WRITE_VALUE_DEFAULT),
                                          "cuStreamWriteValue32");
                        check_error_async(cuLaunchKernel(function,
                                                         1, 1, 1,
                                                         threads_per_group, 1, 1,
                                                         0, stream,
                                                         buffers.data(), NULL),
                                          "cuLaunchKernel");
                    }
                };
            } else {
                return [this, function, thread_groups, threads_per_group, buffers] () mutable {
                    check_error_async(cuLaunchKernel(function, thread_groups, 1, 1,
                                                     threads_per_group, 1, 1, 0, stream,
                                                     buffers.data(), NULL),
                                      "cuLaunchKernel");
                };
            }
        }
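//  A sketch of building and invoking the returned lambda; inputs, outputs,
//  state, and num_rays are hypothetical objects from the surrounding graph
//  framework, and the texture lists may be empty:
//
//      auto kernel = context.create_kernel_call("my_kernel",
//                                               inputs, outputs, state,
//                                               num_rays, tex1d, tex2d);
//      kernel();
//      context.wait();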

//------------------------------------------------------------------------------
///  @brief Create a max compute kernel calling function.
///
///  @param[in] argument Node to reduce.
///  @param[in] run      Function that runs the kernel producing the argument.
///  @returns A lambda that runs the reduction and returns the maximum.
//------------------------------------------------------------------------------
        std::function<T(void)> create_max_call(graph::shared_leaf<T, SAFE_MATH> &argument,
                                               std::function<void(void)> run) {
            check_error(cuMemAllocManaged(&result_buffer, sizeof(T),
                                          CU_MEM_ATTACH_GLOBAL),
                        "cuMemAllocManaged");

            std::vector<void *> buffers;

            buffers.push_back(reinterpret_cast<void *> (&kernel_arguments[argument.get()]));
            buffers.push_back(reinterpret_cast<void *> (&result_buffer));

            CUfunction function;
            check_error(cuModuleGetFunction(&function, module, "max_reduction"),
                        "cuModuleGetFunction");

            int value;
            int min_grid;
            check_error(cuOccupancyMaxPotentialBlockSize(&min_grid, &value, function, 0, 0, 0),
                        "cuOccupancyMaxPotentialBlockSize");

            if (jit::verbose) {
                std::cout << "  Kernel name : max_reduction" << std::endl;
                std::cout << "  Min grid size : " << min_grid << std::endl;
                std::cout << "  Suggested Block size : " << value << std::endl;
            }

            return [this, function, run, buffers] () mutable {
                run();
                check_error_async(cuLaunchKernel(function, 1, 1, 1,
                                                 1024, 1, 1, 0, stream,
                                                 buffers.data(), NULL),
                                  "cuLaunchKernel");
                wait();

                return reinterpret_cast<T *> (result_buffer)[0];
            };
        }
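//  A sketch of composing a kernel run with the reduction; argument is a
//  hypothetical output node whose buffer max_reduction scans, and kernel is
//  the lambda from create_kernel_call above:
//
//      auto find_max = context.create_max_call(argument, kernel);
//      const auto max_value = find_max();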

//------------------------------------------------------------------------------
///  @brief Hold the current thread until the stream has completed.
//------------------------------------------------------------------------------
        void wait() {
            check_error_async(cuStreamSynchronize(stream), "cuStreamSynchronize");
            check_error(cuCtxSynchronize(), "cuCtxSynchronize");
        }

//------------------------------------------------------------------------------
///  @brief Print out the results.
///
///  @param[in] index Result index to print.
///  @param[in] nodes Nodes to print the results of.
//------------------------------------------------------------------------------
        void print_results(const size_t index,
                           const graph::output_nodes<T, SAFE_MATH> &nodes) {
            wait();
            for (auto &out : nodes) {
                const T temp = reinterpret_cast<T *> (kernel_arguments[out.get()])[index];
                if constexpr (jit::complex_scalar<T>) {
                    std::cout << std::real(temp) << " " << std::imag(temp) << " ";
                } else {
                    std::cout << temp << " ";
                }
            }
            std::cout << std::endl;
        }

//------------------------------------------------------------------------------
///  @brief Check the value.
///
///  @param[in] index Result index to check.
///  @param[in] node  Node to check the value of.
///  @returns The value at the index.
//------------------------------------------------------------------------------
        T check_value(const size_t index,
                      const graph::shared_leaf<T, SAFE_MATH> &node) {
            wait();
            return reinterpret_cast<T *> (kernel_arguments[node.get()])[index];
        }

//------------------------------------------------------------------------------
///  @brief Copy buffer contents to the device.
///
///  @param[in] node   Node to copy the buffer contents to.
///  @param[in] source Host buffer to copy from.
//------------------------------------------------------------------------------
        void copy_to_device(graph::shared_leaf<T, SAFE_MATH> node,
                            T *source) {
            size_t size;
            check_error(cuMemGetAddressRange(NULL, &size, kernel_arguments[node.get()]), "cuMemGetAddressRange");
            check_error_async(cuMemcpyHtoDAsync(kernel_arguments[node.get()], source, size, stream), "cuMemcpyHtoDAsync");
        }

//------------------------------------------------------------------------------
///  @brief Copy buffer contents to host.
///
///  @param[in] node        Node to copy the buffer contents from.
///  @param[in] destination Host buffer to copy to.
//------------------------------------------------------------------------------
        void copy_to_host(graph::shared_leaf<T, SAFE_MATH> node,
                          T *destination) {
            size_t size;
            check_error(cuMemGetAddressRange(NULL, &size, kernel_arguments[node.get()]), "cuMemGetAddressRange");
            check_error_async(cuMemcpyDtoHAsync(destination, kernel_arguments[node.get()], size, stream), "cuMemcpyDtoHAsync");
        }
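//  A sketch of reading results back; node is a hypothetical graph node and
//  num_rays its buffer length. The copy is asynchronous, so wait() must be
//  called before touching the host data:
//
//      std::vector<float> host(num_rays);
//      context.copy_to_host(node, host.data());
//      context.wait();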

//------------------------------------------------------------------------------
///  @brief Create the source header.
///
///  @param[in,out] source_buffer Source buffer stream.
//------------------------------------------------------------------------------
        void create_header(std::ostringstream &source_buffer) {
            source_buffer << "typedef unsigned int uint32_t;" << std::endl
                          << "typedef unsigned short uint16_t;" << std::endl
                          << "typedef short int16_t;" << std::endl
                          << "template<typename T, size_t S>" << std::endl
                          << "class array {" << std::endl
                          << "private:" << std::endl
                          << "    T _buffer[S];" << std::endl
                          << "public:" << std::endl
                          << "    T operator[] (const size_t index) const {" << std::endl
                          << "        return _buffer[index];" << std::endl
                          << "    }" << std::endl
                          << "    T &operator[] (const size_t index) {" << std::endl
                          << "        return _buffer[index];" << std::endl
                          << "    }" << std::endl
                          << "};" << std::endl;
            if constexpr (jit::complex_scalar<T>) {
                source_buffer << "#define CUDA_DEVICE_CODE" << std::endl
                              << "#define M_PI " << M_PI << std::endl
                              << "#include <cuda/std/complex>" << std::endl
                              << "#include <special_functions.hpp>" << std::endl;
#ifdef USE_CUDA_TEXTURES
                if constexpr (jit::float_base<T>) {
                    source_buffer << "static __inline__ __device__ complex<float> to_cmp_float(float2 p) {"
                                  << std::endl
                                  << "    return ";
                    jit::add_type<T> (source_buffer);
                    source_buffer << " (p.x, p.y);" << std::endl
                                  << "}" << std::endl;
                } else {
                    source_buffer << "static __inline__ __device__ complex<double> to_cmp_double(uint4 p) {"
                                  << std::endl
                                  << "    return ";
                    jit::add_type<T> (source_buffer);
                    source_buffer << " (__hiloint2double(p.y, p.x), __hiloint2double(p.w, p.z));"
                                  << std::endl
                                  << "}" << std::endl;
                }
            } else if constexpr (jit::double_base<T>) {
                source_buffer << "static __inline__ __device__ double to_double(uint2 p) {"
                              << std::endl
                              << "    return __hiloint2double(p.y, p.x);"
                              << std::endl
                              << "}" << std::endl;
#endif
            }
        }

//------------------------------------------------------------------------------
///  @brief Create kernel prefix.
///
///  @param[in,out] source_buffer Source buffer stream.
///  @param[in]     name          Name of the kernel.
///  @param[in]     inputs        Input nodes of the graph.
///  @param[in]     outputs       Output nodes of the graph.
///  @param[in]     state         Random state node.
///  @param[in]     size          Problem size.
///  @param[in]     is_constant   Flags marking which inputs are constant.
///  @param[in,out] registers     Map of node pointers to register names.
///  @param[in]     usage         Count of how often a register is used.
///  @param[in]     textures1d    List of 1D textures.
///  @param[in]     textures2d    List of 2D textures.
//------------------------------------------------------------------------------
        void create_kernel_prefix(std::ostringstream &source_buffer,
                                  const std::string name,
                                  graph::input_nodes<T, SAFE_MATH> &inputs,
                                  graph::output_nodes<T, SAFE_MATH> &outputs,
                                  graph::shared_random_state<T, SAFE_MATH> state,
                                  const size_t size,
                                  const std::vector<bool> &is_constant,
                                  jit::register_map &registers,
                                  const jit::register_usage &usage,
                                  jit::texture1d_list &textures1d,
                                  jit::texture2d_list &textures2d) {
            source_buffer << std::endl;
            source_buffer << "extern \"C\" __global__ void "
                          << name << "(" << std::endl;

            std::unordered_set<void *> used_args;
            if (inputs.size()) {
                source_buffer << "    ";
                if (is_constant[0]) {
                    source_buffer << "const ";
                }
                jit::add_type<T> (source_buffer);
                source_buffer << " * __restrict__ "
                              << jit::to_string('v', inputs[0].get());
                used_args.insert(inputs[0].get());
            }
            for (size_t i = 1, ie = inputs.size(); i < ie; i++) {
                if (!used_args.contains(inputs[i].get())) {
                    source_buffer << ", // " << inputs[i - 1]->get_symbol()
#ifndef USE_INPUT_CACHE
#ifdef SHOW_USE_COUNT
                                  << " used " << usage.at(inputs[i - 1].get())
#endif
#endif
                                  << std::endl;
                    source_buffer << "    ";
                    if (is_constant[i]) {
                        source_buffer << "const ";
                    }
                    jit::add_type<T> (source_buffer);
                    source_buffer << " * __restrict__ "
                                  << jit::to_string('v', inputs[i].get());
                    used_args.insert(inputs[i].get());
                }
            }
            for (size_t i = 0, ie = outputs.size(); i < ie; i++) {
                if (i == 0) {
                    if (inputs.size()) {
                        source_buffer << ", // "
                                      << inputs[inputs.size() - 1]->get_symbol();
#ifndef USE_INPUT_CACHE
#ifdef SHOW_USE_COUNT
                        source_buffer << " used "
                                      << usage.at(inputs[inputs.size() - 1].get());
#endif
#endif
                        source_buffer << std::endl;
                    }
                } else {
                    source_buffer << "," << std::endl;
                }

                if (!used_args.contains(outputs[i].get())) {
                    source_buffer << "    ";
                    jit::add_type<T> (source_buffer);
                    source_buffer << " * __restrict__ "
                                  << jit::to_string('o', outputs[i].get());
                    used_args.insert(outputs[i].get());
                }
            }
            if (state.get()) {
                source_buffer << "," << std::endl
                              << "    mt_state * __restrict__ "
                              << jit::to_string('s', state.get())
                              << "," << std::endl
                              << "    const uint32_t * __restrict__ offset"
                              << std::endl;
            }
#ifdef USE_CUDA_TEXTURES
            for (auto &[key, value] : textures1d) {
                source_buffer << "," << std::endl;
                source_buffer << "    cudaTextureObject_t "
                              << jit::to_string('a', key);
            }
            for (auto &[key, value] : textures2d) {
                source_buffer << "," << std::endl;
                source_buffer << "    cudaTextureObject_t "
                              << jit::to_string('a', key);
            }
#endif
            source_buffer << ") {" << std::endl
                          << "    const int index = blockIdx.x*blockDim.x + threadIdx.x;"
                          << std::endl;
            if (state.get()) {
#ifdef USE_INPUT_CACHE
                registers[state.get()] = jit::to_string('r', state.get());
                source_buffer << "    mt_state &" << registers[state.get()] << " = "
                              << jit::to_string('s', state.get())
                              << "[threadIdx.x];"
#ifdef SHOW_USE_COUNT
                              << " // used " << usage.at(state.get())
#endif
                              << std::endl;
#else
                registers[state.get()] = jit::to_string('s', state.get()) + "[threadIdx.x]";
#endif
            }
            source_buffer << "    if (";
            if (state.get()) {
                source_buffer << "offset[0] + ";
            }
            source_buffer << "index < " << size << ") {" << std::endl;

            for (auto &input : inputs) {
#ifdef USE_INPUT_CACHE
                if (usage.at(input.get())) {
                    registers[input.get()] = jit::to_string('r', input.get());
                    source_buffer << "        const ";
                    jit::add_type<T> (source_buffer);
                    source_buffer << " " << registers[input.get()] << " = "
                                  << jit::to_string('v', input.get())
                                  << "[";
                    if (state.get()) {
                        source_buffer << "offset[0] + ";
                    }
                    source_buffer << "index]; // " << input->get_symbol()
#ifdef SHOW_USE_COUNT
                                  << " used " << usage.at(input.get())
#endif
                                  << std::endl;
                }
#else
                registers[input.get()] = jit::to_string('v', input.get()) + "[index]";
#endif
            }
        }
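//  For reference, with one constant input and one output the emitted prefix
//  looks roughly like this; identifier suffixes come from jit::to_string,
//  and the name, symbol, and size 1000 are illustrative:
//
//      extern "C" __global__ void my_kernel(
//          const float * __restrict__ v0x1234, // x
//          float * __restrict__ o0x5678) {
//          const int index = blockIdx.x*blockDim.x + threadIdx.x;
//          if (index < 1000) {
//              ...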

//------------------------------------------------------------------------------
///  @brief Create kernel postfix.
///
///  @param[in,out] source_buffer Source buffer stream.
///  @param[in]     outputs       Output nodes of the graph.
///  @param[in]     setters       Map nodes that write results back to inputs.
///  @param[in]     state         Random state node.
///  @param[in,out] registers     Map of node pointers to register names.
///  @param[in,out] indices       Map of node pointers to index names.
///  @param[in]     usage         Count of how often a register is used.
//------------------------------------------------------------------------------
        void create_kernel_postfix(std::ostringstream &source_buffer,
                                   graph::output_nodes<T, SAFE_MATH> &outputs,
                                   graph::map_nodes<T, SAFE_MATH> &setters,
                                   graph::shared_random_state<T, SAFE_MATH> state,
                                   jit::register_map &registers,
                                   jit::register_map &indices,
                                   const jit::register_usage &usage) {
            std::unordered_set<void *> out_registers;
            for (auto &[out, in] : setters) {
                if (!out->is_match(in) &&
                    !out_registers.contains(out.get())) {
                    graph::shared_leaf<T, SAFE_MATH> a = out->compile(source_buffer,
                                                                      registers,
                                                                      indices,
                                                                      usage);
                    source_buffer << "        "
                                  << jit::to_string('v', in.get())
                                  << "[";
                    if (state.get()) {
                        source_buffer << "offset[0] + ";
                    }
                    source_buffer << "index] = ";
                    if constexpr (SAFE_MATH) {
                        if constexpr (jit::complex_scalar<T>) {
                            jit::add_type<T> (source_buffer);
                            source_buffer << " (";
                            source_buffer << "isnan(real(" << registers[a.get()]
                                          << ")) ? 0.0 : real("
                                          << registers[a.get()]
                                          << "), ";
                            source_buffer << "isnan(imag(" << registers[a.get()]
                                          << ")) ? 0.0 : imag("
                                          << registers[a.get()]
                                          << "));" << std::endl;
                        } else {
                            source_buffer << "isnan(" << registers[a.get()]
                                          << ") ? 0.0 : " << registers[a.get()]
                                          << ";" << std::endl;
                        }
                    } else {
                        source_buffer << registers[a.get()] << ";" << std::endl;
                    }
                    out_registers.insert(out.get());
                }
            }

            for (auto &out : outputs) {
                if (!graph::variable_cast(out).get() &&
                    !out_registers.contains(out.get())) {
                    graph::shared_leaf<T, SAFE_MATH> a = out->compile(source_buffer,
                                                                      registers,
                                                                      indices,
                                                                      usage);
                    source_buffer << "        "
                                  << jit::to_string('o', out.get())
                                  << "[";
                    if (state.get()) {
                        source_buffer << "offset[0] + ";
                    }
                    source_buffer << "index] = ";
                    if constexpr (SAFE_MATH) {
                        if constexpr (jit::complex_scalar<T>) {
                            jit::add_type<T> (source_buffer);
                            source_buffer << " (";
                            source_buffer << "isnan(real(" << registers[a.get()]
                                          << ")) ? 0.0 : real("
                                          << registers[a.get()]
                                          << "), ";
                            source_buffer << "isnan(imag(" << registers[a.get()]
                                          << ")) ? 0.0 : imag("
                                          << registers[a.get()]
                                          << "));" << std::endl;
                        } else {
                            source_buffer << "isnan(" << registers[a.get()]
                                          << ") ? 0.0 : " << registers[a.get()]
                                          << ";" << std::endl;
                        }
                    } else {
                        source_buffer << registers[a.get()] << ";" << std::endl;
                    }
                    out_registers.insert(out.get());
                }
            }

            source_buffer << "    }" << std::endl << "}" << std::endl;
        }

//------------------------------------------------------------------------------
///  @brief Create reduction.
///
///  @param[in,out] source_buffer Source buffer stream.
///  @param[in]     size          Problem size.
//------------------------------------------------------------------------------
        void create_reduction(std::ostringstream &source_buffer,
                              const size_t size) {
            source_buffer << std::endl;
            source_buffer << "extern \"C\" __global__ void max_reduction(" << std::endl;
            source_buffer << "    const ";
            jit::add_type<T> (source_buffer);
            source_buffer << " * __restrict__ input," << std::endl;
            source_buffer << "    ";
            jit::add_type<T> (source_buffer);
            source_buffer << " * __restrict__ result) {" << std::endl;
            source_buffer << "    const unsigned int i = threadIdx.x;" << std::endl;
            source_buffer << "    const unsigned int j = threadIdx.x/32;" << std::endl;
            source_buffer << "    const unsigned int k = threadIdx.x%32;" << std::endl;
            source_buffer << "    if (i < " << size << ") {" << std::endl;
            source_buffer << "        " << jit::type_to_string<T> () << " sub_max = ";
            if constexpr (jit::complex_scalar<T>) {
                source_buffer << "abs(input[i]);" << std::endl;
            } else {
                source_buffer << "input[i];" << std::endl;
            }
            source_buffer << "        for (size_t index = i + 1024; index < " << size << "; index += 1024) {" << std::endl;
            if constexpr (jit::complex_scalar<T>) {
                source_buffer << "            sub_max = max(sub_max, abs(input[index]));" << std::endl;
            } else {
                source_buffer << "            sub_max = max(sub_max, input[index]);" << std::endl;
            }
            source_buffer << "        }" << std::endl;
            source_buffer << "        __shared__ " << jit::type_to_string<T> () << " thread_max[32];" << std::endl;
            source_buffer << "        for (int index = 16; index > 0; index /= 2) {" << std::endl;
            source_buffer << "            sub_max = max(sub_max, __shfl_down_sync(__activemask(), sub_max, index));" << std::endl;
            source_buffer << "        }" << std::endl;
            source_buffer << "        thread_max[j] = sub_max;" << std::endl;
            source_buffer << "        __syncthreads();" << std::endl;
            source_buffer << "        if (j == 0) {" << std::endl;
            source_buffer << "            for (int index = 16; index > 0; index /= 2) {" << std::endl;
            source_buffer << "                thread_max[k] = max(thread_max[k], __shfl_down_sync(__activemask(), thread_max[k], index));" << std::endl;
            source_buffer << "            }" << std::endl;
            source_buffer << "            *result = thread_max[0];" << std::endl;
            source_buffer << "        }" << std::endl;
            source_buffer << "    }" << std::endl;
            source_buffer << "}" << std::endl << std::endl;
        }

//------------------------------------------------------------------------------
///  @brief Get the buffer for a node.
///
///  @param[in] node Node to get the buffer of.
///  @returns Pointer to the node's device buffer.
//------------------------------------------------------------------------------
        T *get_buffer(graph::shared_leaf<T, SAFE_MATH> &node) {
            return reinterpret_cast<T *> (kernel_arguments[node.get()]);
        }
    };
}

#endif /* cuda_context_h */