Skip to content

cuda

Types

Name Description
CudaNufftPipeline Full GPU NUFFT pipeline — all intermediates stay on-device.

Functions

Name Description
cvec_axpy C = A + alpha * B (complex axpy)
cvec_cdot Complex conjugate dot product: sum(conj(A[i]) * B[i])
cvec_mul_scalar C = alpha * A (complex scalar multiply)
cvec_norm2sq Squared L2 norm: sum(|A[i]|^2)
finite_diff Finite difference operator: out[i] = x[i] - x[i-offset] for i >= offset, 0 otherwise. Interleaved complex: buffers are 2*n floats for n complex elements.
finite_diff_adj Adjoint finite difference: out[i] = d[i] - d[i+offset] for i < n-offset, boundary handled
get_cublas_handle Get or create process-wide cuBLAS handle (auto-sets stream)
get_stream Get the active CUDA stream (auto-initializes if needed)
gridding_adjoint_2D 2D adjoint gridding: scatter k-space samples onto oversampled grid (atomicAdd)
gridding_adjoint_3D 3D adjoint gridding
gridding_forward_2D 2D forward gridding: gather from grid to k-space samples
gridding_forward_3D 3D forward gridding
init Initialize CUDA device (call once at startup, or auto-inits on first use)
print_device_info Print device info to stdout
rvec_cmul Real vector * complex vector element-wise: C[i] = W[i] * X[i]
vec_sum Real sum: sum(A[i])

Function Details

cvec_axpy

void cvec_axpy(const float* A, const float* B, float* C, float alphaRe, float alphaIm, size_t n, cudaStream_t stream)

C = A + alpha * B (complex axpy)

cvec_cdot

void cvec_cdot(const float* A, const float* B, float* outRe, float* outIm, size_t n, cudaStream_t stream)

Complex conjugate dot product: sum(conj(A[i]) * B[i])

cvec_mul_scalar

void cvec_mul_scalar(const float* A, float alphaRe, float alphaIm, float* C, size_t n, cudaStream_t stream)

C = alpha * A (complex scalar multiply)

cvec_norm2sq

float cvec_norm2sq(const float* A, size_t n, cudaStream_t stream)

Squared L2 norm: sum(|A[i]|^2)

finite_diff

void finite_diff(const float* x, float* out, size_t n, size_t offset, cudaStream_t stream)

Finite difference operator: out[i] = x[i] - x[i-offset] for i >= offset, 0 otherwise. Interleaved complex: buffers are 2*n floats for n complex elements.

finite_diff_adj

void finite_diff_adj(const float* d, float* out, size_t n, size_t offset, cudaStream_t stream)

Adjoint finite difference: out[i] = d[i] - d[i+offset] for i < n-offset, boundary handled

get_cublas_handle

cublasHandle_t get_cublas_handle()

Get or create process-wide cuBLAS handle (auto-sets stream)

get_stream

cudaStream_t get_stream()

Get the active CUDA stream (auto-initializes if needed)

gridding_adjoint_2D

void gridding_adjoint_2D(const Sample* d_samples, int n, const float* d_LUT, int sizeLUT, float* d_gridData, const GridParams& params, cudaStream_t stream)

2D adjoint gridding: scatter k-space samples onto oversampled grid (atomicAdd)

gridding_adjoint_3D

void gridding_adjoint_3D(const Sample* d_samples, int n, const float* d_LUT, int sizeLUT, float* d_gridData, const GridParams& params, cudaStream_t stream)

3D adjoint gridding

gridding_forward_2D

void gridding_forward_2D(const float* d_gridData, int n, const float* d_kx, const float* d_ky, const float* d_LUT, int sizeLUT, float* d_samplesOut, const GridParams& params, cudaStream_t stream)

2D forward gridding: gather from grid to k-space samples

gridding_forward_3D

void gridding_forward_3D(const float* d_gridData, int n, const float* d_kx, const float* d_ky, const float* d_kz, const float* d_LUT, int sizeLUT, float* d_samplesOut, const GridParams& params, cudaStream_t stream)

3D forward gridding

init

void init(int device = 0)

Initialize CUDA device (call once at startup, or auto-inits on first use)

print_device_info

void print_device_info()

Print device info to stdout

rvec_cmul

void rvec_cmul(const float* W, const float* X, float* C, size_t n, cudaStream_t stream)

Real vector * complex vector element-wise: C[i] = W[i] * X[i]

vec_sum

float vec_sum(const float* A, size_t n, cudaStream_t stream)

Real sum: sum(A[i])