I was working through the Mojo GPU basics tutorial, but when I got to the exercise, the provided solution does not compile and the description does not quite match the solution implementation.
Description mismatch:
Create a host and device buffer for the output of DType Int64, with 8 elements, don't forget to zero the values with enqueue_memset()
But the implementation still uses DType.float32 for the output buffers.
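For reference, this is how I read that description (my own sketch, not code from the tutorial), using DType.int64 for the 8-element output buffers and zeroing them with enqueue_memset(), reusing the same DeviceContext calls the solution already makes:

    # My reading of the description (sketch only; dtype_out and blocks are my own names)
    alias dtype_out = DType.int64
    alias blocks = 8

    var out_buffer_host = ctx.enqueue_create_host_buffer[dtype_out](blocks)
    var out_buffer_device = ctx.enqueue_create_buffer[dtype_out](blocks)

    # Zero the device-side output values, as the description asks
    ctx.enqueue_memset(out_buffer_device, 0)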
Exercise solution does not compile:
When trying to build the example, it errors with:
error: 'DeviceBuffer[float32]' is not subscriptable, it does not implement the __getitem__/__setitem__ methods
        output_buffer[block_idx.x] = value
        ~~~~~~~~~~~~~^
mojo: error: failed to parse the provided Mojo source module
Exercise solution implementation:
from gpu import thread_idx, block_idx, warp, barrier
from gpu.host import DeviceContext, DeviceBuffer
from gpu.memory import AddressSpace
from memory import stack_allocation
from layout import Layout, LayoutTensor
from math import iota
from sys import sizeof

def main():
    ctx = DeviceContext()

    alias dtype_f32 = DType.float32
    alias elements_f32 = 32
    alias blocks_f32 = 8
    alias threads_f32 = elements_f32 // blocks_f32

    # Create buffers
    var in_buffer_host = ctx.enqueue_create_host_buffer[dtype_f32](elements_f32)
    var in_buffer_device = ctx.enqueue_create_buffer[dtype_f32](elements_f32)
    var out_buffer_host = ctx.enqueue_create_host_buffer[dtype_f32](blocks_f32)
    var out_buffer_device = ctx.enqueue_create_buffer[dtype_f32](blocks_f32)

    # Zero output buffer values
    ctx.enqueue_memset(out_buffer_device, 0)
    ctx.synchronize()

    # Fill in input values sequentially and copy to device
    iota(in_buffer_host.unsafe_ptr(), elements_f32)
    in_buffer_host.enqueue_copy_to(in_buffer_device)

    # Create the LayoutTensor
    alias LayoutF32 = Layout.row_major(blocks_f32, threads_f32)
    alias InputTensorF32 = LayoutTensor[dtype_f32, LayoutF32, MutableAnyOrigin]
    var float_tensor = InputTensorF32(in_buffer_device)

    fn reduce_sum_f32(
        in_tensor: InputTensorF32, output_buffer: DeviceBuffer[dtype_f32]
    ):
        var value = in_tensor.load[1](block_idx.x, thread_idx.x)
        value = warp.sum(value)
        if thread_idx.x == 0:
            output_buffer[block_idx.x] = value

    ctx.enqueue_function[reduce_sum_f32](
        float_tensor, out_buffer_device, grid_dim=8, block_dim=4
    )

    out_buffer_device.enqueue_copy_to(out_buffer_host)
    ctx.synchronize()
    print(out_buffer_host)
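One workaround that seems plausible to me, though I have not confirmed it is the intended solution: wrap the output buffer in a 1-D LayoutTensor as well and index that inside the kernel, since LayoutTensor supports element assignment and the input already goes through LayoutTensor. A minimal sketch (OutLayoutF32, OutputTensorF32, and out_tensor are my own names):

    # Sketch: index a 1-D LayoutTensor over the output buffer instead of the DeviceBuffer itself
    alias OutLayoutF32 = Layout.row_major(blocks_f32)
    alias OutputTensorF32 = LayoutTensor[dtype_f32, OutLayoutF32, MutableAnyOrigin]
    var out_tensor = OutputTensorF32(out_buffer_device)

    fn reduce_sum_f32(in_tensor: InputTensorF32, output_tensor: OutputTensorF32):
        var value = in_tensor.load[1](block_idx.x, thread_idx.x)
        value = warp.sum(value)
        if thread_idx.x == 0:
            # LayoutTensor implements __setitem__, unlike DeviceBuffer
            output_tensor[block_idx.x] = value

    ctx.enqueue_function[reduce_sum_f32](
        float_tensor, out_tensor, grid_dim=blocks_f32, block_dim=threads_f32
    )

Is something like this what the exercise expects, or is DeviceBuffer supposed to be writable from a kernel directly?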