Hi, I know it is very new but I was trying out the new CustomOpLibrary
torch interface. However I cannot get it to work with any example. I always get the same error:
File "/.../.venv/lib/python3.12/site-packages/max/graph/type.py", line 805, in to_mlir
self.shape.to_mlir(), self.dtype, self.device.to_mlir()
^^^^^^^^^^^^^^^^^^^^
File "/.../.venv/lib/python3.12/site-packages/max/graph/type.py", line 494, in to_mlir
shape_type = mosh.ShapeType()
^^^^^^^^^^^^^^^^
RuntimeError: No active MLIR context
As suggested on Discord, I made a simple reproducible example to post here. Here is my setup. The folder structure is:
pyproject.toml
example.py
- kernels
--- __init__.mojo (empty)
--- kernel.mojo
pyproject.toml:
[project]
name = "example"
version = "0.0.0"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"torch>=2.6.0",
"pillow>=11.2.1, <12",
# Pinned to a nightly build of the Modular SDK (max / mojo).
"modular>=25.4.0.dev2025052105",
]
[tool.uv]
# Extra package index so uv can resolve the nightly "modular" wheel.
[[tool.uv.index]]
url = "https://dl.modular.com/public/nightly/python/simple/"
example.py
from pathlib import Path
import torch
from max.torch import CustomOpLibrary
# NOTE(review): TILE_SIZE is never used in this example — likely a leftover.
TILE_SIZE = 16
# Register Mojo kernels in Torch: the library loads every op compiled
# from the .mojo files in the "kernels" directory next to this script.
mojo_kernels = Path(__file__).parent / "kernels"
op_library = CustomOpLibrary(mojo_kernels)
# Bind the "add_const" op with its compile-time parameters; the subscript
# dict maps Mojo parameter names (here `const`) to concrete values.
add_const_kernel = op_library.add_const[
{
"const": 10
}
]
def add_const(x: torch.Tensor) -> torch.Tensor:
    """Add a compile-time constant to ``x`` via the Mojo custom op.

    The op uses destination-passing style: the output buffer is
    allocated here and the kernel fills it in place.
    """
    out = torch.zeros_like(x)
    add_const_kernel(out, x)
    return out
if __name__ == "__main__":
    # Smoke test. The Mojo op only implements a GPU path, so the input
    # tensor must live on the GPU.
    x = torch.randn(10).cuda()
    print(add_const(x))
kernel.mojo
import compiler
from gpu import thread_idx, block_idx, barrier
from layout import Layout, LayoutTensor, UNKNOWN_VALUE
from runtime.asyncrt import DeviceContextPtr
from math import ceildiv
from tensor import InputTensor, OutputTensor
alias BLOCK_SIZE = 32
alias Dyn1DLayout = Layout.row_major(UNKNOWN_VALUE)
alias dtype = DType.float32
@compiler.register("add_const")
struct AddConst:
    """Custom op adding a compile-time integer constant to a 1-D float32
    tensor, writing into a caller-allocated output (destination-passing)."""

    @staticmethod
    fn execute[
        # Compile-time parameter bound at op-subscript time on the Python side.
        const: Int,
        # Device the graph compiler is lowering for ("cpu" or "gpu").
        target: StaticString,
    ](
        # Outputs
        result: OutputTensor[type = DType.float32, rank=1],
        # Inputs
        x: InputTensor[type = DType.float32, rank=1],
        # Context
        ctx: DeviceContextPtr,
    ) raises:
        x_tensor = x.to_layout_tensor()
        result_tensor = result.to_layout_tensor()
        @parameter
        if target == "cpu":
            # BUGFIX: message previously said "Rasterize3DGS", a copy-paste
            # leftover from a different op.
            raise Error("add_const CPU target not implemented yet.")
        elif target == "gpu":
            # Get GPU context
            var gpu_ctx = ctx.get_device_context()
            # Grid rounds up to a whole number of blocks, so the launch may
            # spawn more threads than elements (e.g. 10 elements -> 32
            # threads). NOTE(review): the kernel must bounds-check indices.
            var grid = (ceildiv(x.dim_size(0), BLOCK_SIZE))
            var block = (BLOCK_SIZE)
            gpu_ctx.enqueue_function[add_const_kernel[const]](
                x_tensor,
                result_tensor,
                grid_dim=grid,
                block_dim=block,
            )
        else:
            raise Error("Unsupported target:", target)
fn add_const_kernel[
    const: Int
](
    x: LayoutTensor[dtype, Dyn1DLayout, MutableAnyOrigin],
    result: LayoutTensor[dtype, Dyn1DLayout, MutableAnyOrigin],
):
    # One thread per element; global element index across the 1-D grid.
    i = block_idx.x * BLOCK_SIZE + thread_idx.x
    # BUGFIX: the grid is rounded up to a multiple of BLOCK_SIZE, so the
    # last block contains excess threads (e.g. 10 elements launch 32
    # threads); without this guard they write out of bounds.
    # NOTE(review): assumes LayoutTensor.size() yields the runtime element
    # count for a dynamic 1-D layout — confirm against the Layout API.
    if i < x.size():
        result[i] = x[i] + const
I also tried without using UNKNOWN_VALUE, but I always get the same issue. Is there anything I might be doing wrong here?