forgeCol¶
template <typename T> class forgeCol
Column vector with transparent CPU/GPU memory management.
Wraps a raw pointer with optional Metal GPU dispatch on Apple platforms.
Provides element access via at(), scalar arithmetic operators, and
conversion to/from Armadillo Col<T> via getArma().
GPU/CPU synchronization semantics:
The isOnGPU flag tracks whether the authoritative copy of the data is
on the GPU device. After GPU operations (via Metal), data is on the
device. The const overload of getArma() creates a zero-copy arma view of the
host buffer without any synchronization: if the authoritative copy still
resides on the GPU, the view can observe stale host data. Call getFromGPU()
first when the data may be on the device.
View safety:
Views are non-owning wrappers around external memory. The parent
allocation must outlive the view. Calling set_size() on a view
breaks the view relationship (allocates fresh memory; the original
memory is NOT freed). Use col_copy() for a safe deep copy.
GPU backends:
When METAL_COMPUTE is defined and T = float, element-wise operations
dispatch to Apple Metal GPU kernels for vectors with n_elem >= 4096.
Double precision and small vectors always use the CPU path. A separate
CUDA backend (CUDA_COMPUTE) adds explicit device-memory management via
putOnGPU(), getFromGPU(), allocateOnGPU(), and zerosOnGPU().
Warning
getArmaComplex() returns a temporary arma view. Never capture
the result with auto — the underlying forgeCol may be destroyed before
the view is used. Always assign to an explicit type:
arma::Col<std::complex<T>> result = expr;
forgeCol<float> v(1024);
v.ones();
forgeCol<float> w = v + v; // element-wise add (Metal if available)
arma::Col<float> a = w.getArma(); // copy back to Armadillo
T : Element type (e.g., float, double, forgeComplex&lt;float&gt;).
See : forgeMat
See : forgeComplex
Types¶
| Name | Description |
|---|---|
| view_tag | Tag type for the private view constructor. |
Variables¶
| Name | Description |
|---|---|
| mem | Pointer to the underlying contiguous data buffer. |
| d_mem | CUDA device pointer (null when data is on host) |
| isPinned_ | True when mem was allocated with cudaMallocHost |
| isInitialized | True after set_size() has been called (memory is allocated). |
| isOnGPU | True when the authoritative data copy resides on the GPU device. |
| isView_ | True when wrapping external memory (non-owning) |
Functions¶
| Name | Description |
|---|---|
| forgeCol | Private view constructor: wraps external memory without allocating. The caller is responsible for ensuring the memory outlives this forgeCol. |
| view | Create a non-owning view that wraps external memory. |
| is_view | Returns true if this forgeCol is a non-owning view. |
| on_gpu | Returns true when the authoritative data copy resides on the GPU device. |
| putOnGPU | Upload host data to CUDA device memory. |
| allocateOnGPU | Allocate device memory without uploading host data. |
| getFromGPU | Download device data to host. |
| zerosOnGPU | Allocate device memory, set to zero, mark as on GPU. |
| device_memptr | Get raw device pointer (nullptr if not on GPU) |
| freeHostMem | Free host memory using the correct allocator (pinned vs regular). |
| memptr | Return a raw pointer to the underlying data buffer. |
Variable Details¶
d_mem¶
T* d_mem
CUDA device pointer (null when data is on host)
isInitialized¶
bool isInitialized
True after set_size() has been called (memory is allocated).
isOnGPU¶
bool isOnGPU
True when the authoritative data copy resides on the GPU device.
isPinned_¶
bool isPinned_
True when mem was allocated with cudaMallocHost
isView_¶
bool isView_
True when wrapping external memory (non-owning)
mem¶
T* mem
Pointer to the underlying contiguous data buffer.
Function Details¶
allocateOnGPU¶
void allocateOnGPU()
Allocate device memory without uploading host data.
Use when device data will be immediately overwritten (e.g. kernel output buffers).
device_memptr¶
T* device_memptr() const
Get raw device pointer (nullptr if not on GPU)
forgeCol¶
forgeCol(T* extMem, uword length, view_tag)
Private view constructor: wraps external memory without allocating.
The caller is responsible for ensuring the memory outlives this forgeCol.
freeHostMem¶
void freeHostMem()
Free host memory using the correct allocator (pinned vs regular).
getFromGPU¶
void getFromGPU()
Download device data to host. Sets isOnGPU = false.
is_view¶
bool is_view() const
Returns true if this forgeCol is a non-owning view.
memptr¶
class forgeCol { private:
    /// Pointer to the underlying contiguous data buffer (host memory).
    T* mem;
#ifdef CUDA_COMPUTE
    T* d_mem = nullptr;     ///< CUDA device pointer (null when data is on host)
    bool isPinned_ = false; ///< True when mem was allocated with cudaMallocHost
#endif
    /// True after set_size() has been called (memory is allocated).
    bool isInitialized;
    /// True when the authoritative data copy resides on the GPU device.
    bool isOnGPU;
    bool isView_; ///< True when wrapping external memory (non-owning)

    /// Tag type for the private view constructor (disambiguates from the
    /// public owning constructors).
    struct view_tag { };

    /// Private view constructor: wraps external memory without allocating.
    /// The caller is responsible for ensuring the memory outlives this forgeCol.
    /// NOTE: members are initialized in declaration order, not initializer-list
    /// order; harmless here because every initializer is an independent value.
    forgeCol(T* extMem, uword length, view_tag)
        : isOnGPU(false)
        , isInitialized(true)
        , mem(extMem)
        , isView_(true)
        , n_elem(length)
    {
    }

public:
    /// Number of elements; const, so only mutable via access::rw (see move ctor).
    const uword n_elem;

    /// Create a non-owning view that wraps external memory.
    /// The returned forgeCol does NOT free the memory on destruction.
    /// Write operations (e.g., operator%=) modify the external memory in-place.
    static forgeCol<T> view(T* extMem, uword length) { return forgeCol<T>(extMem, length, view_tag { }); }

    /// Returns true if this forgeCol is a non-owning view.
    bool is_view() const { return isView_; }

    /// Returns true when the authoritative data copy resides on the GPU device.
    bool on_gpu() const { return isOnGPU; }

#ifdef CUDA_COMPUTE
    /// Upload host data to CUDA device memory. Sets isOnGPU = true.
    /// No-op when already on the device or the vector is empty.
    void putOnGPU()
    {
        if (isOnGPU || n_elem == 0)
            return;
        if (!d_mem) {
            // Lazy, stream-ordered device allocation on the library's stream.
            CUDA_CHECK(cudaMallocAsync(&d_mem, n_elem * sizeof(T), forge::cuda::get_stream()));
        }
        // Synchronous H2D copy: cudaMemcpy blocks the calling thread until done.
        // NOTE(review): ordering after the cudaMallocAsync above relies on legacy
        // default-stream synchronization — confirm the build does not use
        // per-thread default streams (--default-stream per-thread).
        CUDA_CHECK(cudaMemcpy(d_mem, mem, n_elem * sizeof(T), cudaMemcpyHostToDevice));
        isOnGPU = true;
    }

    /// Allocate device memory without uploading host data.
    /// Use when device data will be immediately overwritten (e.g. kernel output buffers).
    void allocateOnGPU()
    {
        if (isOnGPU || n_elem == 0)
            return;
        if (!d_mem) {
            CUDA_CHECK(cudaMallocAsync(&d_mem, n_elem * sizeof(T), forge::cuda::get_stream()));
        }
        // Device buffer contents are indeterminate until a kernel writes them.
        isOnGPU = true;
    }

    /// Download device data to host. Sets isOnGPU = false.
    /// No-op when data is already authoritative on the host, the device
    /// pointer is null, or the vector is empty.
    void getFromGPU()
    {
        if (!isOnGPU || !d_mem || n_elem == 0)
            return;
        CUDA_CHECK(cudaMemcpy(mem, d_mem, n_elem * sizeof(T), cudaMemcpyDeviceToHost));
        isOnGPU = false;
    }

    /// Allocate device memory, set to zero, mark as on GPU.
    /// Equivalent to zeros() + putOnGPU() but avoids the H2D upload.
    void zerosOnGPU()
    {
        if (n_elem == 0)
            return;
        if (!d_mem) {
            CUDA_CHECK(cudaMallocAsync(&d_mem, n_elem * sizeof(T), forge::cuda::get_stream()));
        }
        // Memset is enqueued on the same stream as the allocation, so the
        // stream guarantees it runs after the allocation completes.
        CUDA_CHECK(cudaMemsetAsync(d_mem, 0, n_elem * sizeof(T), forge::cuda::get_stream()));
        isOnGPU = true;
    }

    /// Get raw device pointer (nullptr if not on GPU)
    T* device_memptr() const { return d_mem; }

    /// Free host memory using the correct allocator (pinned vs regular).
    /// Pinned buffers came from cudaMallocHost and must go through cudaFreeHost;
    /// regular buffers came from std::malloc (Metal builds) or new[] — freeing
    /// with the wrong deallocator would be undefined behavior.
    void freeHostMem()
    {
        if (mem == NULL)
            return;
        if (isPinned_) {
            cudaFreeHost(mem);
            isPinned_ = false;
        } else {
#ifdef METAL_COMPUTE
            std::free(mem);
#else
            delete[] mem;
#endif
        }
        mem = NULL;
    }
#endif

    // Constructors

    /// Default constructor: empty, unallocated vector (n_elem == 0).
    forgeCol<T>()
        : isOnGPU(false)
        , isInitialized(false)
        , mem(NULL)
        , isView_(false)
        , n_elem(0)
    {
    }

    /// Sized constructor: allocates host storage for `length` elements
    /// via set_size() (which also updates n_elem and isInitialized).
    forgeCol<T>(uword length)
        : isOnGPU(false)
        , isInitialized(false)
        , mem(NULL)
        , isView_(false)
        , n_elem(0)
    {
        set_size(length);
    }

    // Constructor from arma::Col<T> for non-complex types (float, double, etc.)
    /// Converting constructor from arma::Col<T>, enabled only when T is NOT a
    /// forgeComplex specialization (plain float, double, etc.). Deep-copies the
    /// Armadillo data into freshly allocated host memory.
    template <typename U = T, typename std::enable_if< !std::is_same<U, forgeComplex<float>>::value && !std::is_same<U, forgeComplex<double>>::value, int>::type = 0>
    forgeCol(const arma::Col<T>& cSCplx)
        : isOnGPU(false)
        , isInitialized(false)
        , mem(NULL)
        , isView_(false)
        , n_elem(0)
    {
        set_size(cSCplx.n_elem);
        memcpy(this->mem, cSCplx.memptr(), sizeof(T) * cSCplx.n_elem);
    }

    // Constructor from arma::Col<complex<float>> for forgeComplex<float>
    template <typename U = T, typename std::enable_if<std::is_same<U, forgeComplex<float>>::value, int>::type = 0>
    forgeCol(const arma::Col<std::complex<float>>& cSCplx)
        : isOnGPU(false)
        , isInitialized(false)
        , mem(NULL)
        , isView_(false)
        , n_elem(0)
    {
        set_size(cSCplx.n_elem);
        // std::complex<float> is array-compatible with float[2] per the standard,
        // so viewing the source as raw floats is well-defined; assumes
        // forgeComplex<float> has the same {re, im} layout — TODO confirm.
        memcpy(this->mem, reinterpret_cast<const float*>(cSCplx.memptr()), sizeof(T) * cSCplx.n_elem);
    }

    // Constructor from arma::Col<complex<double>> for forgeComplex<double>
    template <typename U = T, typename std::enable_if<std::is_same<U, forgeComplex<double>>::value, int>::type = 0>
    forgeCol(const arma::Col<std::complex<double>>& cSCplx)
        : isOnGPU(false)
        , isInitialized(false)
        , mem(NULL)
        , isView_(false)
        , n_elem(0)
    {
        set_size(cSCplx.n_elem);
        memcpy(this->mem, reinterpret_cast<const double*>(cSCplx.memptr()), sizeof(T) * cSCplx.n_elem);
    }

    // Copy Constructor — always produces an owning forgeCol (deep copy), even from views.
    forgeCol<T>(const forgeCol<T>& pgA)
        : isOnGPU(false)
        , isInitialized(false)
        , mem(NULL)
        , isView_(false)
        , n_elem(0)
    {
        set_size(pgA.n_elem);
#ifdef CUDA_COMPUTE
        if (pgA.isOnGPU && pgA.d_mem) {
            // Source is on GPU — only copy device data. Host buffer is allocated
            // but left uninitialized (getFromGPU() will populate it on demand).
            CUDA_CHECK(cudaMallocAsync(&d_mem, n_elem * sizeof(T), forge::cuda::get_stream()));
            CUDA_CHECK(cudaMemcpyAsync(d_mem, pgA.d_mem, n_elem * sizeof(T), cudaMemcpyDeviceToDevice, forge::cuda::get_stream()));
            isOnGPU = true;
        } else {
            memcpy(mem, pgA.memptr(), sizeof(T) * pgA.n_elem);
        }
#else
        memcpy(mem, pgA.memptr(), sizeof(T) * pgA.n_elem);
#endif
    }

    // Move Constructor — transfers ownership (or view status) from source.
    // n_elem is const, so it is written through the access::rw cast.
    // NOTE(review): isInitialized is set to true unconditionally — moving from a
    // default-constructed (never-allocated) source yields isInitialized == true
    // with mem == NULL; confirm that is intended.
    forgeCol<T>(forgeCol<T>&& pgA)
        : isOnGPU(false)
        , isInitialized(false)
        , mem(NULL)
        , isView_(pgA.isView_)
        , n_elem(0)
    {
        access::rw(n_elem) = pgA.n_elem;
        mem = pgA.memptr();
        isInitialized = true;
        isOnGPU = pgA.isOnGPU;
#ifdef CUDA_COMPUTE
        // Steal the device buffer and pinned flag; null them in the source so
        // its destructor cannot double-free.
        d_mem = pgA.d_mem;
        isPinned_ = pgA.isPinned_;
        pgA.d_mem = nullptr;
        pgA.isPinned_ = false;
#endif
        // Leaves pgA in an empty, destructible state.
        pgA.reset_mem();
    }

    // Destructor — views do NOT free memory (non-owning).
    ~forgeCol<T>()
    {
        if (!isView_) {
#ifdef CUDA_COMPUTE
            if (d_mem) {
                cudaFreeAsync(d_mem, forge::cuda::get_stream());
                d_mem = nullptr;
            }
            freeHostMem();
#else
            if (mem != NULL) {
#ifdef METAL_COMPUTE
                std::free(mem);
#else
                delete[] mem;
#endif
            }
#endif
        }
    }

    /// Return a raw pointer to the underlying data buffer.
    T* memptr() const
Return a raw pointer to the underlying data buffer.
on_gpu¶
bool on_gpu() const
Returns true when the authoritative data copy resides on the GPU device.
putOnGPU¶
void putOnGPU()
Upload host data to CUDA device memory. Sets isOnGPU = true.
view¶
static forgeCol<T> view(T* extMem, uword length)
Create a non-owning view that wraps external memory.
The returned forgeCol does NOT free the memory on destruction.
Write operations (e.g., operator%=) modify the external memory in-place.
zerosOnGPU¶
void zerosOnGPU()
Allocate device memory, set to zero, mark as on GPU.
Equivalent to zeros() + putOnGPU() but avoids the H2D upload.