Skip to content

forgeCol

template <typename T> class forgeCol

Column vector with transparent CPU/GPU memory management.

Wraps a raw pointer with optional Metal GPU dispatch on Apple platforms. Provides element access via at(), scalar arithmetic operators, and conversion to/from Armadillo Col<T> via getArma().

GPU/CPU synchronization semantics:

The isOnGPU flag tracks whether the authoritative copy of the data is on the GPU device. After GPU operations (via Metal), data is on the device. The const overload of getArma() creates a zero-copy arma view without synchronization.

View safety:

Views are non-owning wrappers around external memory. The parent allocation must outlive the view. Calling set_size() on a view breaks the view relationship (allocates fresh memory; the original memory is NOT freed). Use col_copy() for a safe deep copy.

Metal dispatch:

When METAL_COMPUTE is defined and T = float, element-wise operations dispatch to Apple Metal GPU kernels for vectors with n_elem >= 4096. Double precision and small vectors always use the CPU path.

Warning

getArmaComplex() returns a temporary arma view. Never capture the result with auto — the underlying forgeCol may be destroyed before the view is used. Always assign to an explicit type: arma::Col<std::complex<T>> result = expr;

forgeCol<float> v(1024);
v.ones();
forgeCol<float> w = v + v;        // element-wise add (Metal if available)
arma::Col<float> a = w.getArma(); // copy back to Armadillo

T : Element type (e.g., float, double, forgeComplex<float>).

See : forgeMat

See : forgeComplex

Types

Name Description
view_tag Tag type for the private view constructor.

Variables

Name Description
mem Pointer to the underlying contiguous data buffer.
d_mem CUDA device pointer (null when no device buffer is allocated; only present when CUDA_COMPUTE is defined)
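isInitialized True after set_size() has been called (memory is allocated). Also set to true by the view constructor and the move constructor, which adopt an existing buffer without calling set_size().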
isOnGPU True when the authoritative data copy resides on the GPU device.
isView_ True when wrapping external memory (non-owning)

Functions

Name Description
forgeCol Private view constructor: wraps external memory without allocating. The caller is responsible for ensuring the memory outlives this forgeCol.
view Create a non-owning view that wraps external memory.
is_view Returns true if this forgeCol is a non-owning view.
on_gpu Returns true when the authoritative data copy resides on the GPU device.
putOnGPU Upload host data to CUDA device memory.
getFromGPU Download device data to host.
device_memptr Get the raw CUDA device pointer (nullptr if no device buffer has been allocated). Only available when CUDA_COMPUTE is defined.
memptr Return a raw pointer to the underlying data buffer.

Variable Details

d_mem

T* d_mem

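CUDA device pointer. Null when no device buffer is allocated; the member only exists when CUDA_COMPUTE is defined. Note that getFromGPU() leaves the device buffer allocated, so a non-null d_mem does not by itself mean the authoritative copy is on the device — check isOnGPU for that.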

isInitialized

bool isInitialized

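True after set_size() has been called (memory is allocated). The view constructor and the move constructor also set this flag to true, because they adopt an existing buffer instead of calling set_size().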

isOnGPU

bool isOnGPU

True when the authoritative data copy resides on the GPU device.

isView_

bool isView_

True when wrapping external memory (non-owning)

mem

T* mem

Pointer to the underlying contiguous data buffer.

Function Details

device_memptr

T* device_memptr() const

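Get the raw CUDA device pointer, or nullptr if no device buffer has been allocated. Only available when CUDA_COMPUTE is defined. The pointer can remain non-null after getFromGPU(), which does not free the device buffer; use on_gpu() to check whether the authoritative copy is on the device.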

forgeCol

forgeCol(T* extMem, uword length, view_tag)

Private view constructor: wraps external memory without allocating.
The caller is responsible for ensuring the memory outlives this forgeCol.

getFromGPU

void getFromGPU()

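Download device data to host. Sets isOnGPU = false. Does nothing if the data is not on the GPU, no device buffer exists, or the column is empty. The device buffer is not freed by this call. Only available when CUDA_COMPUTE is defined.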

is_view

bool is_view() const

Returns true if this forgeCol is a non-owning view.

memptr

class forgeCol { private: /// Pointer to the underlying contiguous data buffer. T* mem; #ifdef CUDA_COMPUTE T* d_mem = nullptr; ///< CUDA device pointer (null when data is on host) #endif /// True after set_size() has been called (memory is allocated). bool isInitialized; /// True when the authoritative data copy resides on the GPU device. bool isOnGPU; bool isView_; ///< True when wrapping external memory (non-owning) /// Tag type for the private view constructor. struct view_tag { }; /// Private view constructor: wraps external memory without allocating. /// The caller is responsible for ensuring the memory outlives this forgeCol. forgeCol(T* extMem, uword length, view_tag) : isOnGPU(false) , isInitialized(true) , mem(extMem) , isView_(true) , n_elem(length) { } public: const uword n_elem; /// Create a non-owning view that wraps external memory. /// The returned forgeCol does NOT free the memory on destruction. /// Write operations (e.g., operator%=) modify the external memory in-place. static forgeCol<T> view(T* extMem, uword length) { return forgeCol<T>(extMem, length, view_tag { }); } /// Returns true if this forgeCol is a non-owning view. bool is_view() const { return isView_; } /// Returns true when the authoritative data copy resides on the GPU device. bool on_gpu() const { return isOnGPU; } #ifdef CUDA_COMPUTE /// Upload host data to CUDA device memory. Sets isOnGPU = true. void putOnGPU() { if (isOnGPU || n_elem == 0) return; if (!d_mem) { CUDA_CHECK(cudaMalloc(&d_mem, n_elem * sizeof(T))); } CUDA_CHECK(cudaMemcpy(d_mem, mem, n_elem * sizeof(T), cudaMemcpyHostToDevice)); isOnGPU = true; } /// Download device data to host. Sets isOnGPU = false. 
void getFromGPU() { if (!isOnGPU || !d_mem || n_elem == 0) return; CUDA_CHECK(cudaMemcpy(mem, d_mem, n_elem * sizeof(T), cudaMemcpyDeviceToHost)); isOnGPU = false; } /// Get raw device pointer (nullptr if not on GPU) T* device_memptr() const { return d_mem; } #endif // Constructors forgeCol<T>() : isOnGPU(false) , isInitialized(false) , mem(NULL) , isView_(false) , n_elem(0) { } forgeCol<T>(uword length) : isOnGPU(false) , isInitialized(false) , mem(NULL) , isView_(false) , n_elem(0) { set_size(length); } // Constructor from arma::Col<T> for non-complex types (float, double, etc.) template <typename U = T, typename std::enable_if< !std::is_same<U, forgeComplex<float>>::value && !std::is_same<U, forgeComplex<double>>::value, int>::type = 0> forgeCol(const arma::Col<T>& cSCplx) : isOnGPU(false) , isInitialized(false) , mem(NULL) , isView_(false) , n_elem(0) { set_size(cSCplx.n_elem); memcpy(this->mem, cSCplx.memptr(), sizeof(T) * cSCplx.n_elem); } // Constructor from arma::Col<complex<float>> for forgeComplex<float> template <typename U = T, typename std::enable_if<std::is_same<U, forgeComplex<float>>::value, int>::type = 0> forgeCol(const arma::Col<std::complex<float>>& cSCplx) : isOnGPU(false) , isInitialized(false) , mem(NULL) , isView_(false) , n_elem(0) { set_size(cSCplx.n_elem); memcpy(this->mem, reinterpret_cast<const float*>(cSCplx.memptr()), sizeof(T) * cSCplx.n_elem); } // Constructor from arma::Col<complex<double>> for forgeComplex<double> template <typename U = T, typename std::enable_if<std::is_same<U, forgeComplex<double>>::value, int>::type = 0> forgeCol(const arma::Col<std::complex<double>>& cSCplx) : isOnGPU(false) , isInitialized(false) , mem(NULL) , isView_(false) , n_elem(0) { set_size(cSCplx.n_elem); memcpy(this->mem, reinterpret_cast<const double*>(cSCplx.memptr()), sizeof(T) * cSCplx.n_elem); } // Copy Constructor — always produces an owning forgeCol (deep copy), even from views. 
forgeCol<T>(const forgeCol<T>& pgA) : isOnGPU(false) , isInitialized(false) , mem(NULL) , isView_(false) , n_elem(0) { set_size(pgA.n_elem); size_t bytes = sizeof(T) * pgA.n_elem; memcpy(mem, pgA.memptr(), bytes); #ifdef CUDA_COMPUTE if (pgA.isOnGPU && pgA.d_mem) { CUDA_CHECK(cudaMalloc(&d_mem, n_elem * sizeof(T))); CUDA_CHECK(cudaMemcpy(d_mem, pgA.d_mem, n_elem * sizeof(T), cudaMemcpyDeviceToDevice)); isOnGPU = true; } #endif } // Move Constructor — transfers ownership (or view status) from source. forgeCol<T>(forgeCol<T>&& pgA) : isOnGPU(false) , isInitialized(false) , mem(NULL) , isView_(pgA.isView_) , n_elem(0) { access::rw(n_elem) = pgA.n_elem; mem = pgA.memptr(); isInitialized = true; isOnGPU = pgA.isOnGPU; #ifdef CUDA_COMPUTE d_mem = pgA.d_mem; pgA.d_mem = nullptr; #endif pgA.reset_mem(); } // Destructor — views do NOT free memory (non-owning). ~forgeCol<T>() { if (!isView_) { #ifdef CUDA_COMPUTE if (d_mem) { cudaFree(d_mem); d_mem = nullptr; } #endif if (mem != NULL) { #ifdef METAL_COMPUTE std::free(mem); #else delete[] mem; #endif } } } /// Return a raw pointer to the underlying data buffer. T* memptr() const

Return a raw pointer to the underlying data buffer.

on_gpu

bool on_gpu() const

Returns true when the authoritative data copy resides on the GPU device.

putOnGPU

void putOnGPU()

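Upload host data to CUDA device memory. Sets isOnGPU = true. Does nothing if the data is already on the GPU or the column is empty. A device buffer is allocated only if none exists yet. Only available when CUDA_COMPUTE is defined.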

view

static forgeCol<T> view(T* extMem, uword length)

Create a non-owning view that wraps external memory.
The returned forgeCol does NOT free the memory on destruction.
Write operations (e.g., operator%=) modify the external memory in-place.