I’m looking for a Mojo equivalent of the following CUDA kernel, which launches itself recursively using dynamic parallelism:
/**
 * Recursive "hello world" kernel built on CUDA dynamic parallelism.
 *
 * Every thread of the launch prints one greeting line containing the current
 * recursion depth, its thread index and its block index. Thread 0 of each
 * block then launches a child grid of one block with half as many threads,
 * passing the depth incremented by one, and prints the new depth.
 *
 * The recursion ends when iSize is 1 or when the halved thread count is not
 * positive.
 *
 * Build note: device-side kernel launches require compiling with -rdc=true.
 *
 * @param iSize  size value for this level; the child level receives iSize >> 1
 *               both as its thread count and as its iSize.
 * @param iDepth recursion depth reported by this level.
 */
__global__ void nestedHelloWorld(int const iSize, int iDepth) {
    int const lane = threadIdx.x;

    printf("Recursion=%d :Hello World from thread %d block %d\n",
           iDepth, lane, blockIdx.x);

    // Halve the thread count for the next level (right shift by one).
    int const childThreads = iSize >> 1;

    // Nothing more to do for: the last level, every thread other than the
    // block's thread 0, or a level whose child would have no threads.
    if (iSize == 1 || lane != 0 || childThreads <= 0) {
        return;
    }

    int const childDepth = iDepth + 1;

    // Device-side launch (dynamic parallelism); clangd cannot check this syntax.
    nestedHelloWorld<<<1, childThreads>>>(childThreads, childDepth); // clangd-ignore
    printf("--------> nested execution depth: %d\n", childDepth);
}
It’s unclear to me whether Mojo supports this directly via the `gpu` module, or whether it is only possible by dropping down to MAX.