Hi all,
I have been working on using MAX to execute models stored in ONNX format via the engine's InferenceSession.load.
I now have a working version: I can take a model trained with PyTorch, export it to ONNX, load it with the MAX engine, and run inference on new inputs. However, running the same model on the same new sample(s) gives different outputs on CPU and GPU.
The source code below has three functions:
- MAX load model, inference session on CPU
- MAX load model, inference session on GPU
- ONNX load model, inference session on CPU
The neural network was trained on tabular data and takes a vector input of shape (batch_size, 2381). (A full overview of the model's layers is included below as well.)
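For reference, the ONNX file was produced with a torch.onnx.export call along these lines (a reconstruction, not my exact script; `model` stands for the trained network called with just the continuous-features tensor):

import torch

model.eval()  # inference mode so the BatchNorm layers use their running statistics
dummy = torch.randn(1, 2381)
torch.onnx.export(
    model,
    dummy,
    "bodmas.onnx",
    input_names=["continuous-features"],
    output_names=["binary-prediction"],
    dynamic_axes={"continuous-features": {0: "batch_size"},
                  "binary-prediction": {0: "batch_size"}},
)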
The issue: the MAX-CPU and ONNX-CPU predictions on a new, randomly generated sample are identical, as you would expect, but the MAX-GPU result for the same sample is not. Rather than being completely different, the first prediction on GPU does match the first prediction on CPU, but the others don't.
For example, when given 4 new samples to predict, the output is:
=== Predictions ===
MAX CPU:
[[1. ]
[1. ]
[1. ]
[0.3713894]]
ONNX CPU:
[[1. ]
[1. ]
[1. ]
[0.3713926]]
MAX GPU:
[[1. ]
[0.31785867]
[0.9791118 ]
[1. ]]
No matter how many new samples are given, the first prediction is always the same on CPU (MAX and ONNX) and on GPU, but the remaining GPU predictions are completely different.
My main issue is that I don't know how to start debugging this. I would like to leverage my GPU, but I have to be able to trust the output.
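For what it's worth, here is the kind of numeric check I can bolt onto compare_predictions (see the full source below) so the mismatch isn't just eyeballed from printed arrays; it's plain numpy, nothing MAX-specific, and report_mismatch is just a helper name I made up:

import numpy as np

def report_mismatch(name_a, a, name_b, b, atol=1e-4):
    """Print the largest absolute difference and the rows that exceed atol."""
    diff = np.abs(a - b)
    bad_rows = np.where(diff.max(axis=1) > atol)[0]
    print(f"{name_a} vs {name_b}: max abs diff = {diff.max():.6g}, "
          f"rows over atol={atol}: {bad_rows.tolist()}")

# e.g. inside compare_predictions:
# report_mismatch("MAX CPU", max_cpu_preds, "ONNX CPU", onnx_cpu_preds)
# report_mismatch("MAX CPU", max_cpu_preds, "MAX GPU", max_gpu_preds)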
A secondary question: does this workflow use the MAX engine well? I'm not directly using the max.graph or max.nn APIs; I could not find layers corresponding to what's in my network, nor was it clear how I would transfer the pretrained weights to the graph if I had found equivalent layers.
The first time I run the MAX functions I do notice a delay, so I assume the engine is working its magic, presumably compiling the model, but I would love a better understanding of what's happening. Is there a good way for MAX users to observe what the engine does during load and execution?
Full source code
import numpy as np
from max import driver, engine
from max.dtype import DType
import onnxruntime
import onnx
def run_max_cpu(sample: np.ndarray) -> np.ndarray:
    """
    Run inference using MAX CPU backend.

    Args:
        sample: Input tensor of shape (batch_size, 2381)
    Returns:
        Predictions as probabilities
    """
    device = driver.CPU()
    print(f"MAX CPU Device: {device}")
    print(f"Device API: {device.api}\n")

    session = engine.InferenceSession(num_threads=16, devices=[device])
    print(f"Session: {session}")

    model = session.load(
        model='./bodmas.onnx',
        input_specs=[engine.TorchInputSpec(shape=sample.shape, dtype=DType.float32)]
    )

    # Print input metadata for debugging
    print("\nModel Input Metadata:")
    for tensor in model.input_metadata:
        print(f'name: {tensor.name}, shape: {tensor.shape}, dtype: {tensor.dtype}')

    # Run inference
    result = model.execute(sample)
    predictions = result[0].to_numpy()

    # The model outputs a logit; apply a sigmoid to get a probability
    probabilities = 1 / (1 + np.exp(-predictions))
    return probabilities
def run_max_gpu(sample: np.ndarray) -> np.ndarray:
    """
    Run inference using MAX GPU backend.

    Args:
        sample: Input tensor of shape (batch_size, 2381)
    Returns:
        Predictions as probabilities
    """
    device = driver.Accelerator(id=0)
    print(f"MAX GPU Device: {device}")
    print(f"Device API: {device.api}\n")

    session = engine.InferenceSession(num_threads=16, devices=[device])
    print(f"Session: {session}")

    model = session.load(
        model='./bodmas.onnx',
        input_specs=[engine.TorchInputSpec(shape=sample.shape, dtype=DType.float32)]
    )

    # Print input metadata for debugging
    print("\nModel Input Metadata:")
    for tensor in model.input_metadata:
        print(f'name: {tensor.name}, shape: {tensor.shape}, dtype: {tensor.dtype}')

    # Run inference
    result = model.execute(sample)
    predictions = result[0].to_numpy()

    # The model outputs a logit; apply a sigmoid to get a probability
    probabilities = 1 / (1 + np.exp(-predictions))
    return probabilities
def run_onnx_cpu(sample: np.ndarray) -> np.ndarray:
    """
    Run inference using ONNX Runtime CPU backend.

    Args:
        sample: Input tensor of shape (batch_size, 2381)
    Returns:
        Predictions as probabilities
    """
    # Load and verify model
    model = onnx.load('./bodmas.onnx')
    print("\nONNX Model Info:")
    print(f"Model IR version: {model.ir_version}")
    print(f"Producer name: {model.producer_name}")

    # Create session
    ort_session = onnxruntime.InferenceSession('./bodmas.onnx')

    # Print input metadata for debugging
    print("\nModel Input Metadata:")
    for inp in ort_session.get_inputs():
        print(f'name: {inp.name}, shape: {inp.shape}, dtype: {inp.type}')

    # Run inference
    ort_inputs = {'continuous-features': sample}
    ort_outs = ort_session.run(None, ort_inputs)
    predictions = ort_outs[0]

    # The model outputs a logit; apply a sigmoid to get a probability
    probabilities = 1 / (1 + np.exp(-predictions))
    return probabilities
def compare_predictions(sample: np.ndarray) -> None:
    """
    Run inference using all backends and compare results.

    Args:
        sample: Input tensor of shape (batch_size, 2381)
    """
    # Run inference using all backends
    print("\n=== Running MAX CPU ===")
    max_cpu_preds = run_max_cpu(sample)

    print("\n=== Running ONNX CPU ===")
    onnx_cpu_preds = run_onnx_cpu(sample)

    print("\n=== Running MAX GPU ===")
    max_gpu_preds = run_max_gpu(sample)

    # Output predictions in specified order
    print("\n=== Predictions ===")
    print("MAX CPU:")
    print(max_cpu_preds)
    print("\nONNX CPU:")
    print(onnx_cpu_preds)
    print("\nMAX GPU:")
    print(max_gpu_preds)
if __name__ == "__main__":
    # Create test sample
    random_sample = np.random.randn(4, 2381).astype(np.float32)
    # Run comparison
    compare_predictions(random_sample)
ONNX graph export details
Exported graph: graph(%continuous-features : Float(*, 2381, strides=[2381, 1], requires_grad=0, device=cpu),
%bn_cont.weight : Float(2381, strides=[1], requires_grad=1, device=cpu),
%bn_cont.bias : Float(2381, strides=[1], requires_grad=1, device=cpu),
%bn_cont.running_mean : Float(2381, strides=[1], requires_grad=0, device=cpu),
%bn_cont.running_var : Float(2381, strides=[1], requires_grad=0, device=cpu),
%layers.0.2.weight : Float(200, strides=[1], requires_grad=1, device=cpu),
%layers.0.2.bias : Float(200, strides=[1], requires_grad=1, device=cpu),
%layers.0.2.running_mean : Float(200, strides=[1], requires_grad=0, device=cpu),
%layers.0.2.running_var : Float(200, strides=[1], requires_grad=0, device=cpu),
%layers.1.2.weight : Float(100, strides=[1], requires_grad=1, device=cpu),
%layers.1.2.bias : Float(100, strides=[1], requires_grad=1, device=cpu),
%layers.1.2.running_mean : Float(100, strides=[1], requires_grad=0, device=cpu),
%layers.1.2.running_var : Float(100, strides=[1], requires_grad=0, device=cpu),
%layers.2.0.weight : Float(1, 100, strides=[100, 1], requires_grad=1, device=cpu),
%layers.2.0.bias : Float(1, strides=[1], requires_grad=1, device=cpu),
%onnx::MatMul_31 : Float(2381, 200, strides=[1, 2381], requires_grad=0, device=cpu),
%onnx::MatMul_32 : Float(200, 100, strides=[1, 200], requires_grad=0, device=cpu)):
%/bn_cont/BatchNormalization_output_0 : Float(*, 2381, strides=[2381, 1], requires_grad=1, device=cpu) = onnx::BatchNormalization[epsilon=1.0000000000000001e-05, momentum=0.90000000000000002, training_mode=0, onnx_name="/bn_cont/BatchNormalization"](%continuous-features, %bn_cont.weight, %bn_cont.bias, %bn_cont.running_mean, %bn_cont.running_var), scope: fastai.tabular.model.TabularModel::/torch.nn.modules.batchnorm.BatchNorm1d::bn_cont # /home/dhoogla/inference/.magic/envs/default/lib/python3.12/site-packages/torch/nn/functional.py:2822:0
%/layers/layers.0/layers.0.0/MatMul_output_0 : Float(*, 200, strides=[200, 1], requires_grad=1, device=cpu) = onnx::MatMul[onnx_name="/layers/layers.0/layers.0.0/MatMul"](%/bn_cont/BatchNormalization_output_0, %onnx::MatMul_31), scope: fastai.tabular.model.TabularModel::/torch.nn.modules.container.Sequential::layers/fastai.layers.LinBnDrop::layers.0/torch.nn.modules.linear.Linear::layers.0.0 # /home/dhoogla/inference/.magic/envs/default/lib/python3.12/site-packages/torch/nn/modules/linear.py:125:0
%/layers/layers.0/layers.0.1/Relu_output_0 : Float(*, 200, strides=[200, 1], requires_grad=1, device=cpu) = onnx::Relu[onnx_name="/layers/layers.0/layers.0.1/Relu"](%/layers/layers.0/layers.0.0/MatMul_output_0), scope: fastai.tabular.model.TabularModel::/torch.nn.modules.container.Sequential::layers/fastai.layers.LinBnDrop::layers.0/torch.nn.modules.activation.ReLU::layers.0.1 # /home/dhoogla/inference/.magic/envs/default/lib/python3.12/site-packages/torch/nn/functional.py:1702:0
%/layers/layers.0/layers.0.2/BatchNormalization_output_0 : Float(*, 200, strides=[200, 1], requires_grad=1, device=cpu) = onnx::BatchNormalization[epsilon=1.0000000000000001e-05, momentum=0.90000000000000002, training_mode=0, onnx_name="/layers/layers.0/layers.0.2/BatchNormalization"](%/layers/layers.0/layers.0.1/Relu_output_0, %layers.0.2.weight, %layers.0.2.bias, %layers.0.2.running_mean, %layers.0.2.running_var), scope: fastai.tabular.model.TabularModel::/torch.nn.modules.container.Sequential::layers/fastai.layers.LinBnDrop::layers.0/torch.nn.modules.batchnorm.BatchNorm1d::layers.0.2 # /home/dhoogla/inference/.magic/envs/default/lib/python3.12/site-packages/torch/nn/functional.py:2822:0
%/layers/layers.1/layers.1.0/MatMul_output_0 : Float(*, 100, strides=[100, 1], requires_grad=1, device=cpu) = onnx::MatMul[onnx_name="/layers/layers.1/layers.1.0/MatMul"](%/layers/layers.0/layers.0.2/BatchNormalization_output_0, %onnx::MatMul_32), scope: fastai.tabular.model.TabularModel::/torch.nn.modules.container.Sequential::layers/fastai.layers.LinBnDrop::layers.1/torch.nn.modules.linear.Linear::layers.1.0 # /home/dhoogla/inference/.magic/envs/default/lib/python3.12/site-packages/torch/nn/modules/linear.py:125:0
%/layers/layers.1/layers.0.1/Relu_output_0 : Float(*, 100, strides=[100, 1], requires_grad=1, device=cpu) = onnx::Relu[onnx_name="/layers/layers.1/layers.0.1/Relu"](%/layers/layers.1/layers.1.0/MatMul_output_0), scope: fastai.tabular.model.TabularModel::/torch.nn.modules.container.Sequential::layers/fastai.layers.LinBnDrop::layers.1/torch.nn.modules.activation.ReLU::layers.0.1 # /home/dhoogla/inference/.magic/envs/default/lib/python3.12/site-packages/torch/nn/functional.py:1702:0
%/layers/layers.1/layers.1.2/BatchNormalization_output_0 : Float(*, 100, strides=[100, 1], requires_grad=1, device=cpu) = onnx::BatchNormalization[epsilon=1.0000000000000001e-05, momentum=0.90000000000000002, training_mode=0, onnx_name="/layers/layers.1/layers.1.2/BatchNormalization"](%/layers/layers.1/layers.0.1/Relu_output_0, %layers.1.2.weight, %layers.1.2.bias, %layers.1.2.running_mean, %layers.1.2.running_var), scope: fastai.tabular.model.TabularModel::/torch.nn.modules.container.Sequential::layers/fastai.layers.LinBnDrop::layers.1/torch.nn.modules.batchnorm.BatchNorm1d::layers.1.2 # /home/dhoogla/inference/.magic/envs/default/lib/python3.12/site-packages/torch/nn/functional.py:2822:0
%binary-prediction : Float(*, 1, strides=[1, 1], requires_grad=1, device=cpu) = onnx::Gemm[alpha=1., beta=1., transB=1, onnx_name="/layers/layers.2/layers.2.0/Gemm"](%/layers/layers.1/layers.1.2/BatchNormalization_output_0, %layers.2.0.weight, %layers.2.0.bias), scope: fastai.tabular.model.TabularModel::/torch.nn.modules.container.Sequential::layers/fastai.layers.LinBnDrop::layers.2/torch.nn.modules.linear.Linear::layers.2.0 # /home/dhoogla/inference/.magic/envs/default/lib/python3.12/site-packages/torch/nn/modules/linear.py:125:0
return (%binary-prediction)
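For readability, my understanding is that the graph above corresponds to roughly this plain PyTorch stack (my reconstruction from the dump; the real model is a fastai TabularModel, and any dropout is folded away in the exported graph):

import torch.nn as nn

# Linear layers followed by BatchNorm appear as bias-free MatMuls in the graph;
# only the final layer is a Gemm with a bias.
equivalent = nn.Sequential(
    nn.BatchNorm1d(2381),              # bn_cont
    nn.Linear(2381, 200, bias=False),  # layers.0.0
    nn.ReLU(),                         # layers.0.1
    nn.BatchNorm1d(200),               # layers.0.2
    nn.Linear(200, 100, bias=False),   # layers.1.0
    nn.ReLU(),                         # layers.1.1
    nn.BatchNorm1d(100),               # layers.1.2
    nn.Linear(100, 1),                 # layers.2.0
)
# The single output ('binary-prediction') is a logit; the sigmoid is applied
# outside the model, as in the Python functions above.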