cuda¶
Types¶
| Name | Description |
|---|---|
| CudaNufftPipeline | Full GPU NUFFT pipeline — all intermediates stay on-device. |
Functions¶
| Name | Description |
|---|---|
| cvec_axpy | C = A + alpha * B (complex axpy) |
| cvec_cdot | Complex conjugate dot product: sum(conj(A[i]) * B[i]) |
| cvec_mul_scalar | C = alpha * A (complex scalar multiply) |
| cvec_norm2sq | Squared L2 norm: sum(\|A[i]\|^2) |
| finite_diff | Finite difference operator: out[i] = x[i] - x[i-offset] for i >= offset, 0 otherwise. Interleaved complex: buffers are 2*n floats for n complex elements. |
| finite_diff_adj | Adjoint finite difference: out[i] = d[i] - d[i+offset] for i < n-offset, boundary handled |
| get_cublas_handle | Get or create process-wide cuBLAS handle (auto-sets stream) |
| get_stream | Get the active CUDA stream (auto-initializes if needed) |
| gridding_adjoint_2D | 2D adjoint gridding: scatter k-space samples onto oversampled grid (atomicAdd) |
| gridding_adjoint_3D | 3D adjoint gridding |
| gridding_forward_2D | 2D forward gridding: gather from grid to k-space samples |
| gridding_forward_3D | 3D forward gridding |
| init | Initialize CUDA device (call once at startup, or auto-inits on first use) |
| print_device_info | Print device info to stdout |
| rvec_cmul | Real vector * complex vector element-wise: C[i] = W[i] * X[i] |
| vec_sum | Real sum: sum(A[i]) |
Function Details¶
cvec_axpy¶
void cvec_axpy(const float* A, const float* B, float* C, float alphaRe, float alphaIm, size_t n, cudaStream_t stream)
C = A + alpha * B (complex axpy)
cvec_cdot¶
void cvec_cdot(const float* A, const float* B, float* outRe, float* outIm, size_t n, cudaStream_t stream)
Complex conjugate dot product: sum(conj(A[i]) * B[i])
cvec_mul_scalar¶
void cvec_mul_scalar(const float* A, float alphaRe, float alphaIm, float* C, size_t n, cudaStream_t stream)
C = alpha * A (complex scalar multiply)
cvec_norm2sq¶
float cvec_norm2sq(const float* A, size_t n, cudaStream_t stream)
Squared L2 norm: sum(|A[i]|^2)
finite_diff¶
void finite_diff(const float* x, float* out, size_t n, size_t offset, cudaStream_t stream)
Finite difference operator: out[i] = x[i] - x[i-offset] for i >= offset, 0 otherwise. Interleaved complex: buffers are 2*n floats for n complex elements.
finite_diff_adj¶
void finite_diff_adj(const float* d, float* out, size_t n, size_t offset, cudaStream_t stream)
Adjoint finite difference: out[i] = d[i] - d[i+offset] for i < n-offset, boundary handled
get_cublas_handle¶
cublasHandle_t get_cublas_handle()
Get or create process-wide cuBLAS handle (auto-sets stream)
get_stream¶
cudaStream_t get_stream()
Get the active CUDA stream (auto-initializes if needed)
gridding_adjoint_2D¶
void gridding_adjoint_2D(const Sample* d_samples, int n, const float* d_LUT, int sizeLUT, float* d_gridData, const GridParams& params, cudaStream_t stream)
2D adjoint gridding: scatter k-space samples onto oversampled grid (atomicAdd)
gridding_adjoint_3D¶
void gridding_adjoint_3D(const Sample* d_samples, int n, const float* d_LUT, int sizeLUT, float* d_gridData, const GridParams& params, cudaStream_t stream)
3D adjoint gridding
gridding_forward_2D¶
void gridding_forward_2D(const float* d_gridData, int n, const float* d_kx, const float* d_ky, const float* d_LUT, int sizeLUT, float* d_samplesOut, const GridParams& params, cudaStream_t stream)
2D forward gridding: gather from grid to k-space samples
gridding_forward_3D¶
void gridding_forward_3D(const float* d_gridData, int n, const float* d_kx, const float* d_ky, const float* d_kz, const float* d_LUT, int sizeLUT, float* d_samplesOut, const GridParams& params, cudaStream_t stream)
3D forward gridding
init¶
void init(int device = 0)
Initialize CUDA device (call once at startup, or auto-inits on first use)
print_device_info¶
void print_device_info()
Print device info to stdout
rvec_cmul¶
void rvec_cmul(const float* W, const float* X, float* C, size_t n, cudaStream_t stream)
Real vector * complex vector element-wise: C[i] = W[i] * X[i]
vec_sum¶
float vec_sum(const float* A, size_t n, cudaStream_t stream)
Real sum: sum(A[i])