I’ve been experimenting with integrating Mojo GPU kernels into a C project (Darktable).
My approach has been to move implementation from C to Mojo, compile Mojo code into a lib, and then import it in the original C program and call it.
While doing so, I came across a segmentation fault that is both easy to cause and easy to avoid.
After some debugging, I think what is happening is that allocations within the body of an exported Mojo func happen in “C world” and are inaccessible to Mojo code (the kernel[] closure in this case).
Given that this problem didn’t come up in “How to call mojo in c++”, I’m assuming the issue is with closures specifically.
The fix is to move kernel[] and the elementwise call out of the exported func and into a separate wrapper func, which the exported func then calls.
Please correct me if you know what’s happening.
I’m definitely interested in learning more about this.
Code for working and failing examples below (and how to run it):
lib_fault.mojo
from std.gpu.host import DeviceContext
from std.algorithm.functional import elementwise
from std.utils import IndexList
from std.memory.unsafe_pointer import alloc, UnsafePointer
# Non-exported wrapper that defines the capturing kernel closure and performs
# the GPU launch. `run_working_case` calls this instead of defining the closure
# inside the `@export` function itself.
#
# Args:
#   dctx:  device context passed to `elementwise` and then synchronized.
#   p_src: integer address; reinterpreted inside the kernel as a Float32
#          pointer and read at index 0.
#   p_dst: integer address; reinterpreted inside the kernel as a Float32
#          pointer and written at index 0.
#
# Raises: declared `raises`; which of the calls below can raise is not
# visible from this file.
fn internal_gpu_launcher(dctx: DeviceContext, p_src: Int, p_dst: Int) raises:
@parameter
@always_inline
fn kernel[
sw: Int, rank: Int, align: Int
](indices: IndexList[rank]) capturing -> None:
# The closure captures the two integer addresses (not pointer objects) and
# rebuilds the pointers here. `indices` is unused: element 0 is always the
# one copied.
UnsafePointer[Float32, MutAnyOrigin](unsafe_from_address=p_dst)[
0
] = UnsafePointer[Float32, ImmutAnyOrigin](unsafe_from_address=p_src)[0]
# Launch over a shape of 1 on the GPU target. The `1` parameter after
# `kernel` is presumably the SIMD width — TODO confirm against the
# `elementwise` signature.
elementwise[kernel, 1, target="gpu"](1, dctx)
# Wait on the device context before returning to the caller.
dctx.synchronize()
# Exported entry point (looked up by name from main.c via dlsym) for the
# variant that works: the kernel closure and the `elementwise` launch live in
# `internal_gpu_launcher`, not in this `@export` function's own body.
# Takes no arguments and returns nothing; any raised error is caught and
# printed rather than propagated to the caller.
@export
fn run_working_case():
try:
print("Mojo: [WORKING] Launching via internal_gpu_launcher...")
var dctx = DeviceContext()
# Two single-element float32 device buffers. Nothing in this function
# writes to `dev_data` before the launch, so the value that gets copied is
# whatever the buffer holds after creation — NOTE(review): confirm whether
# enqueue_create_buffer initializes its contents.
var dev_data = dctx.enqueue_create_buffer[DType.float32](1)
var dev_out = dctx.enqueue_create_buffer[DType.float32](1)
# Pass the buffers to the launcher as plain integer addresses.
var p_src = Int(dev_data.unsafe_ptr())
var p_dst = Int(dev_out.unsafe_ptr())
internal_gpu_launcher(dctx, p_src, p_dst)
print("Mojo: [WORKING] Success.")
except e:
print("Mojo: [WORKING] Error:", String(e))
# Exported entry point (looked up by name from main.c via dlsym) for the
# variant reported to crash. It performs the same steps as `run_working_case`
# plus `internal_gpu_launcher`, except that the capturing kernel closure and
# the `elementwise` launch are written directly inside this `@export`
# function's body. Per its own log message, a SIGSEGV/fault is expected here.
# Takes no arguments and returns nothing; raised errors are caught and printed.
@export
fn run_failing_case():
try:
print(
"Mojo: [FAILING] Direct launch from @export (SIGSEGV/Fault"
" expected)..."
)
var dctx = DeviceContext()
# Two single-element float32 device buffers; `dev_data` is not written
# before the launch in this function.
var dev_data = dctx.enqueue_create_buffer[DType.float32](1)
var dev_out = dctx.enqueue_create_buffer[DType.float32](1)
# Integer addresses of the buffers. Unlike the working variant, these are
# locals of the exported function, and the closure below captures them
# directly.
var p_src = Int(dev_data.unsafe_ptr())
var p_dst = Int(dev_out.unsafe_ptr())
@parameter
@always_inline
fn kernel[
sw: Int, rank: Int, align: Int
](indices: IndexList[rank]) capturing -> None:
# Same copy as in `internal_gpu_launcher`: rebuild both pointers from the
# captured addresses and copy element 0. `indices` is unused.
UnsafePointer[Float32, MutAnyOrigin](unsafe_from_address=p_dst)[
0
] = UnsafePointer[Float32, ImmutAnyOrigin](
unsafe_from_address=p_src
)[
0
]
# Launch over a shape of 1 on the GPU target, then wait on the context.
elementwise[kernel, 1, target="gpu"](1, dctx)
dctx.synchronize()
# Reached only if the launch did not fault and did not raise.
print("Mojo: [FAILING] Success (Unexpected!)")
except e:
print("Mojo: [FAILING] Error:", String(e))
main.c
#include <stdio.h>
#include <dlfcn.h>
/* Signature shared by both entry points looked up in lib_fault.so:
 * no arguments, no return value. */
typedef void (*launch_fn)(void);

/*
 * Resolve `name` in the library referred to by `handle`.
 *
 * Returns the entry point, or NULL after printing a diagnostic to stderr
 * when the symbol cannot be found.
 */
static launch_fn load_entry(void* handle, const char* name) {
    launch_fn fn;
    const char* err;

    dlerror(); /* clear any stale error so the message below is about this lookup */
    fn = (launch_fn)dlsym(handle, name);
    if (!fn) {
        err = dlerror();
        fprintf(stderr, "dlsym(%s) failed: %s\n", name, err ? err : "unknown error");
    }
    return fn;
}

/*
 * Driver for the reproducer: loads ./lib_fault.so, runs the working entry
 * point, then the failing one.
 *
 * Returns 0 when both calls return, 1 when the library or either symbol
 * cannot be loaded.
 */
int main(void) {
    void* handle = dlopen("./lib_fault.so", RTLD_NOW);
    if (!handle) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    /* Check both lookups before calling anything: a missing symbol would
     * otherwise crash on a NULL function pointer and be mistaken for the
     * fault this program is meant to demonstrate. */
    launch_fn run_fail = load_entry(handle, "run_failing_case");
    launch_fn run_work = load_entry(handle, "run_working_case");
    if (!run_fail || !run_work) {
        dlclose(handle);
        return 1;
    }

    printf("--- Running WORKING Case (Indirection) ---\n");
    /* Flush before entering the library: if it crashes, buffered output
     * would be lost whenever stdout is not a terminal. */
    fflush(stdout);
    run_work();
    printf("C: WORKING case worked.\n\n");

    printf("--- Running FAILING Case (Direct) ---\n");
    fflush(stdout);
    run_fail();
    printf("C: FAILING case worked (Unexpected).\n");

    dlclose(handle);
    return 0;
}
build.sh
#!/bin/bash
# Builds the Mojo shared library and the C driver, then runs the driver.
# Expects lib_fault.mojo and main.c in the current directory.

# Abort at the first command that exits non-zero.
set -o errexit

printf '%s\n' "--- Building Mojo Library ---"
pixi run mojo build lib_fault.mojo --emit shared-lib -o lib_fault.so

printf '%s\n' "--- Building C Driver ---"
clang main.c -Wl,-rpath=. -ldl -o reproduce_crash

# Last step: run the driver. With errexit active, a non-zero exit from the
# driver becomes this script's exit status.
./reproduce_crash