C -> Mojo interop: segfault when using elementwise (and how to resolve it)

I’ve been experimenting with integrating Mojo GPU kernels into a C project (Darktable).
My approach has been to move the implementation from C to Mojo, compile the Mojo code into a shared library, and then load that library from the original C program and call it.

Along the way, I came across a segmentation fault that is both easy to cause and easy to avoid.
After some debugging, I think what is happening is that allocations made within the body of an exported Mojo func happen in the “C world” and are inaccessible to Mojo code (the kernel[] closure in this case).
Given that this problem didn’t come up in the “How to call mojo in c++” thread, I’m assuming the issue is with closures specifically.
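The fix is to move the kernel[] closure and the elementwise call out of the exported func and into a separate, non-exported wrapper func that the exported func calls.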

Please correct me if you know what’s happening.
I’m definitely interested in learning more about this.

Code for working and failing examples below (and how to run it):

lib_fault.mojo
from std.gpu.host import DeviceContext
from std.algorithm.functional import elementwise
from std.utils import IndexList
from std.memory.unsafe_pointer import alloc, UnsafePointer


# Launches a single-element Float32 copy on the GPU from a plain (non-exported)
# Mojo function. This is the indirection that `run_working_case` goes through.
#
# Args:
#   dctx:  Device context handed to `elementwise` and synchronized afterwards.
#   p_src: Raw address (as an Int) that the kernel reinterprets as a Float32
#          pointer and reads element 0 from.
#   p_dst: Raw address (as an Int) that the kernel reinterprets as a Float32
#          pointer and writes element 0 to.
#
# Raises:
#   Declared `raises`; errors presumably originate from `elementwise` and/or
#   `dctx.synchronize()` -- TODO confirm which calls can raise.
fn internal_gpu_launcher(dctx: DeviceContext, p_src: Int, p_dst: Int) raises:
    # The closure captures `p_src` and `p_dst`, which here are *arguments* of
    # the enclosing function (in `run_failing_case` they are local `var`s).
    # `indices` is never read: the body always copies element 0.
    @parameter
    @always_inline
    fn kernel[
        sw: Int, rank: Int, align: Int
    ](indices: IndexList[rank]) capturing -> None:
        UnsafePointer[Float32, MutAnyOrigin](unsafe_from_address=p_dst)[
            0
        ] = UnsafePointer[Float32, ImmutAnyOrigin](unsafe_from_address=p_src)[0]

    # NOTE(review): the compile-time `1` is presumably the SIMD width and the
    # runtime `1` the number of elements to visit -- confirm against the
    # `elementwise` signature of the Mojo version in use.
    elementwise[kernel, 1, target="gpu"](1, dctx)
    # Presumably blocks until the work enqueued on `dctx` has finished, so the
    # caller can treat a normal return as "kernel completed" -- TODO confirm.
    dctx.synchronize()


# C-callable entry point (resolved by name from main.c via dlsym) for the case
# that does NOT crash: the kernel closure and the `elementwise` launch live in
# `internal_gpu_launcher`, not in this exported function's body.
#
# Takes no arguments and returns nothing. Any raised error is caught and
# printed, so no Mojo error propagates to the C caller.
@export
fn run_working_case():
    try:
        print("Mojo: [WORKING] Launching via internal_gpu_launcher...")
        var dctx = DeviceContext()
        # Two one-element float32 device buffers. Nothing in this function
        # writes to `dev_data` before the kernel reads it, so the value copied
        # is whatever the buffer happens to hold.
        var dev_data = dctx.enqueue_create_buffer[DType.float32](1)
        var dev_out = dctx.enqueue_create_buffer[DType.float32](1)

        # The buffer addresses are passed on as plain Ints.
        # NOTE(review): these are the last uses of `dev_data` / `dev_out`.
        # Mojo may destroy values after their last use, so the buffers could
        # in principle be released before the kernel runs -- confirm lifetime.
        var p_src = Int(dev_data.unsafe_ptr())
        var p_dst = Int(dev_out.unsafe_ptr())

        internal_gpu_launcher(dctx, p_src, p_dst)
        print("Mojo: [WORKING] Success.")
    except e:
        print("Mojo: [WORKING] Error:", String(e))


# C-callable entry point (resolved by name from main.c via dlsym) for the case
# reported to crash: the same setup and kernel as the working path, but the
# closure is defined and launched directly inside this exported function.
#
# Takes no arguments and returns nothing. Raised errors are caught and printed;
# a hard fault (e.g. SIGSEGV) is not an exception and is not handled here.
# This block is intentionally left as the failing reproduction.
@export
fn run_failing_case():
    try:
        print(
            "Mojo: [FAILING] Direct launch from @export (SIGSEGV/Fault"
            " expected)..."
        )
        var dctx = DeviceContext()
        # Same two one-element float32 device buffers as in the working case;
        # `dev_data` is likewise never written before it is read.
        var dev_data = dctx.enqueue_create_buffer[DType.float32](1)
        var dev_out = dctx.enqueue_create_buffer[DType.float32](1)

        # NOTE(review): last uses of `dev_data` / `dev_out`; see whether the
        # buffers can be destroyed before the kernel runs -- confirm lifetime.
        var p_src = Int(dev_data.unsafe_ptr())
        var p_dst = Int(dev_out.unsafe_ptr())

        # The only structural difference from the working path: the closure
        # captures `p_src` / `p_dst` as local `var`s of an @export function
        # rather than as arguments of an ordinary Mojo function.
        # `indices` is never read: the body always copies element 0.
        @parameter
        @always_inline
        fn kernel[
            sw: Int, rank: Int, align: Int
        ](indices: IndexList[rank]) capturing -> None:
            UnsafePointer[Float32, MutAnyOrigin](unsafe_from_address=p_dst)[
                0
            ] = UnsafePointer[Float32, ImmutAnyOrigin](
                unsafe_from_address=p_src
            )[
                0
            ]

        # NOTE(review): the compile-time `1` is presumably the SIMD width and
        # the runtime `1` the element count -- confirm against `elementwise`.
        elementwise[kernel, 1, target="gpu"](1, dctx)
        dctx.synchronize()
        # Only reached if the launch above did not fault.
        print("Mojo: [FAILING] Success (Unexpected!)")
    except e:
        print("Mojo: [FAILING] Error:", String(e))


main.c
#include <stdio.h>
#include <dlfcn.h>

/* Signature of the entry points looked up in lib_fault.so: no arguments, no
 * return value. `(void)` makes this a real prototype; an empty `()` leaves
 * the parameter list unspecified in C before C23. */
typedef void (*launch_fn)(void);

/*
 * Resolves `name` in `handle` and returns it as a launch_fn, or NULL after
 * printing a diagnostic to stderr when the symbol cannot be found.
 */
static launch_fn load_entry(void* handle, const char* name) {
    /* Clear any stale error so the dlerror() call below refers to this lookup. */
    (void)dlerror();
    launch_fn fn = (launch_fn)dlsym(handle, name);
    if (!fn) {
        const char* err = dlerror();
        fprintf(stderr, "dlsym(%s) failed: %s\n", name,
                err ? err : "symbol resolved to NULL");
    }
    return fn;
}

/*
 * Driver for the reproduction: loads ./lib_fault.so, runs the working case,
 * then the failing case (which is expected to crash the process).
 *
 * Returns 0 if both cases return normally, 1 if the library or one of its
 * entry points cannot be loaded.
 */
int main(void) {
    void* handle = dlopen("./lib_fault.so", RTLD_NOW);
    if (!handle) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    /* Check both lookups up front: calling a NULL function pointer would
     * itself segfault and be indistinguishable from the crash under test. */
    launch_fn run_fail = load_entry(handle, "run_failing_case");
    launch_fn run_work = load_entry(handle, "run_working_case");
    if (!run_fail || !run_work) {
        dlclose(handle);
        return 1;
    }

    printf("--- Running WORKING Case (Indirection) ---\n");
    /* Flush before handing control to the library: if the process dies inside
     * the call, output still sitting in stdio's buffer (e.g. when stdout is a
     * pipe or file) would otherwise be lost. */
    fflush(stdout);
    run_work();
    printf("C: WORKING case worked.\n\n");

    printf("--- Running FAILING Case (Direct) ---\n");
    fflush(stdout);
    run_fail();
    printf("C: FAILING case worked (Unexpected).\n");

    dlclose(handle);
    return 0;
}
build.sh
#!/bin/bash
# Builds the Mojo shared library and the C driver, then runs the reproduction.
# Abort on the first failing command, on unset variables, and on pipe failures.
set -euo pipefail

# Work from the directory this script lives in: the build inputs and the
# driver's dlopen("./lib_fault.so") are all relative to the current directory,
# so the script would otherwise only work when invoked from that directory.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")"

echo "--- Building Mojo Library ---"
pixi run mojo build lib_fault.mojo --emit shared-lib -o lib_fault.so

echo "--- Building C Driver ---"
# "-Wl,-rpath,." is the portable spelling: GNU ld and lld accept both it and
# "-rpath=.", while Apple's linker only accepts the separate-argument form.
clang main.c -Wl,-rpath,. -ldl -o reproduce_crash

# The failing case is expected to crash, so a non-zero exit status here is the
# reproduction succeeding, not a build problem.
./reproduce_crash