17#elif defined(USE_CUDA)
33#define START_GPU @autoreleasepool {
48 template<
float_scalar T,
bool SAFE_MATH=false>
52 std::ostringstream source_buffer;
56 std::vector<std::string> kernel_names;
58 std::map<std::string, texture1d_list> kernel_1dtextures;
60 std::map<std::string, texture2d_list> kernel_2dtextures;
63 using gpu_context_type =
typename std::conditional<use_gpu<T> (),
66#elif defined(USE_METAL)
74 gpu_context_type gpu_context;
86 const size_t num = gpu_context_type::max_concurrency();
87 std::cout <<
"Located " << num <<
" "
88 << gpu_context_type::device_type() <<
" device"
89 << (num == 1 ?
"." :
"s.")
99 context(
const size_t index) : gpu_context(index) {
100 source_buffer << std::setprecision(max_digits10<T> ());
101 gpu_context.create_header(source_buffer);
122 kernel_names.push_back(name);
124 std::vector<bool> is_constant(inputs.size(),
true);
129 for (
auto &[out, in] : setters) {
130 auto found = std::distance(inputs.begin(),
131 std::find(inputs.begin(),
133 if (found < is_constant.size()) {
134 is_constant[found] =
false;
136 out->compile_preamble(source_buffer, registers,
138 kernel_1dtextures[name],
139 kernel_2dtextures[name],
140 gpu_context.remaining_const_memory);
142 for (
auto &out : outputs) {
143 out->compile_preamble(source_buffer, registers,
145 kernel_1dtextures[name],
146 kernel_2dtextures[name],
147 gpu_context.remaining_const_memory);
150 for (
auto &in : inputs) {
151 if (usage.find(in.get()) == usage.end()) {
156 gpu_context.create_kernel_prefix(source_buffer,
157 name, inputs, outputs, state,
160 kernel_1dtextures[name],
161 kernel_2dtextures[name]);
164 for (
auto &[out, in] : setters) {
165 out->compile(source_buffer, registers, indices, usage);
167 for (
auto &out : outputs) {
168 out->compile(source_buffer, registers, indices, usage);
171 gpu_context.create_kernel_postfix(source_buffer, outputs,
173 registers, indices, usage);
176 std::vector<void *> removed_elements;
177 for (
auto &[key, value] : registers) {
178 if (value[0] ==
'r') {
179 removed_elements.push_back(key);
183 for (
auto &key : removed_elements) {
184 registers.erase(key);
194 gpu_context.create_reduction(source_buffer, size);
201 std::cout << std::endl << source_buffer.str() << std::endl;
208 std::string source = source_buffer.str();
209 std::ostringstream filename;
210 filename << std::hash<std::string> {} (source)
211 << std::hash<std::thread::id>{}(std::this_thread::get_id());
214 }
else if constexpr (use_metal<T> ()) {
215 filename <<
".metal";
220 std::ofstream outFile(filename.str());
230 void compile(
const bool add_reduction=
false) {
231#ifdef SAVE_KERNEL_SOURCE
234 gpu_context.compile(source_buffer.str(),
253 const size_t num_rays) {
254 return gpu_context.create_kernel_call(kernel_name, inputs, outputs, state, num_rays,
255 kernel_1dtextures[kernel_name],
256 kernel_2dtextures[kernel_name]);
267 std::function<
void(
void)> run) {
268 return gpu_context.create_max_call(argument, run);
279 gpu_context.print_results(index, nodes);
291 return gpu_context.check_value(index, node);
309 gpu_context.copy_to_device(node, source);
320 gpu_context.copy_to_host(node, destination);
329 return gpu_context.get_buffer(node);
Class representing a cpu context.
Definition cpu_context.hpp:82
Class representing a cuda gpu context.
Definition cuda_context.hpp:73
Class representing a metal gpu context.
Definition metal_context.hpp:25
Class for JIT compile of the GPU kernels.
Definition jit.hpp:49
void compile(const bool add_reduction=false)
Compile the kernel.
Definition jit.hpp:230
static constexpr size_t random_state_size
Size of random state needed.
Definition jit.hpp:78
static size_t max_concurrency()
Get the maximum number of concurrent instances.
Definition jit.hpp:85
void wait()
Wait for kernel to finish.
Definition jit.hpp:297
context(const size_t index)
Construct a jit context object.
Definition jit.hpp:99
std::function< T(void)> create_max_call(graph::shared_leaf< T, SAFE_MATH > &argument, std::function< void(void)> run)
Create a max compute kernel calling function.
Definition jit.hpp:266
void add_max_reduction(const size_t size)
Add max reduction kernel.
Definition jit.hpp:193
void save_source()
Save the kernel source code.
Definition jit.hpp:207
std::function< void(void)> create_kernel_call(const std::string kernel_name, graph::input_nodes< T, SAFE_MATH > inputs, graph::output_nodes< T, SAFE_MATH > outputs, graph::shared_random_state< T, SAFE_MATH > state, const size_t num_rays)
Create a kernel calling function.
Definition jit.hpp:249
void print(const size_t index, const graph::output_nodes< T, SAFE_MATH > &nodes)
Print output.
Definition jit.hpp:277
void copy_to_device(graph::shared_leaf< T, SAFE_MATH > &node, T *source)
Copy contents of buffer to device.
Definition jit.hpp:307
T * get_buffer(graph::shared_leaf< T, SAFE_MATH > &node)
Get buffer from the gpu_context.
Definition jit.hpp:328
T check_value(const size_t index, const graph::shared_leaf< T, SAFE_MATH > &node)
Check the value.
Definition jit.hpp:289
void add_kernel(const std::string name, graph::input_nodes< T, SAFE_MATH > inputs, graph::output_nodes< T, SAFE_MATH > outputs, graph::map_nodes< T, SAFE_MATH > setters, graph::shared_random_state< T, SAFE_MATH > state, const size_t size)
Add a kernel.
Definition jit.hpp:116
void print_source()
Print the kernel source.
Definition jit.hpp:200
void copy_to_host(graph::shared_leaf< T, SAFE_MATH > &node, T *destination)
Copy contents of buffer to host.
Definition jit.hpp:318
Cuda context for cuda based gpus.
Metal context for metal based gpus.
std::shared_ptr< random_state_node< T, SAFE_MATH > > shared_random_state
Convenience type alias for shared random state nodes.
Definition random.hpp:272
std::vector< shared_variable< T, SAFE_MATH > > input_nodes
Convenience type alias for a vector of inputs.
Definition node.hpp:1730
std::shared_ptr< leaf_node< T, SAFE_MATH > > shared_leaf
Convenience type alias for shared leaf nodes.
Definition node.hpp:673
std::vector< std::pair< shared_leaf< T, SAFE_MATH >, shared_variable< T, SAFE_MATH > > > map_nodes
Convenience type alias for mapping end codes back to inputs.
Definition node.hpp:1734
std::vector< shared_leaf< T, SAFE_MATH > > output_nodes
Convenience type alias for a vector of output nodes.
Definition node.hpp:688
Namespace for JIT functions.
Definition jit.hpp:41
std::map< void *, size_t > texture1d_list
Type alias for indexing 1D textures.
Definition register.hpp:262
std::map< void *, std::array< size_t, 2 > > texture2d_list
Type alias for indexing 2D textures.
Definition register.hpp:264
std::map< void *, size_t > register_usage
Type alias for counting register usage.
Definition register.hpp:258
std::map< void *, std::string > register_map
Type alias for mapping node pointers to register names.
Definition register.hpp:256
constexpr bool use_cuda()
Test to use Cuda.
Definition register.hpp:67
std::set< void * > visiter_map
Type alias for listing visited nodes.
Definition register.hpp:260