2024-06-11 QIS/QCS Simulator in Nvidia CUDA, p.2 (en_US)
Working on a quantum universal gate simulator for educational purposes, non-production ready, designed to run on Nvidia hardware via CUDA.
Nvidia also has the CUDA-Q project, something I’m excited to dig into, but I’m more interested in practicing CUDA, quantum computing, & linear algebra from a more raw perspective.
Source code will be developing on Github, with some of the process documented on this blog.
Said “Q Coo”, like “cuckoo” (will create a cute bird logo later). Is a POC, non-production, quantum universal gate simulator designed to run on Nvidia hardward via CUDA. Mainly to assist in educating myself on quantum computer simulation.
Why Nvidia?
I saw Nvidia surpassed Apple in market cap?! Maybe not a bad time to do linear algebra on Nvidia hardware.
I also really have a ♥ affair with my GPU (RTX 3070), which I aquired during the pandemic & supply chain crisis, from a scalper, to whom I drove 2 hours, & paid way too much money (€1500).
Matrix / vector multiplication (y = A * x)
Made some progress yesterday with the following.
#include <iostream>
#include <cuda_runtime.h>
#define BLOCK_SIZE 16
// CUDA kernel for matrix-vector multiplication
__global__ void matVecMulKernel(float *A, float *x, float *y, int rows, int cols) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
if (row < rows) {
float sum = 0.0f;
for (int col = 0; col < cols; ++col) {
sum += A[row * cols + col] * x[col];
}
y[row] = sum;
}
}
void matVecMul(float *h_A, float *h_x, float *h_y, int rows, int cols) {
float *d_A, *d_x, *d_y;
// Allocate memory on the device
cudaMalloc((void**)&d_A, rows * cols * sizeof(float));
cudaMalloc((void**)&d_x, cols * sizeof(float));
cudaMalloc((void**)&d_y, rows * sizeof(float));
// Copy data from host to device
cudaMemcpy(d_A, h_A, rows * cols * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, h_x, cols * sizeof(float), cudaMemcpyHostToDevice);
// Define the grid and block dimensions
dim3 dimBlock(1, BLOCK_SIZE);
dim3 dimGrid(1, (rows + BLOCK_SIZE - 1) / BLOCK_SIZE);
// Launch the kernel
matVecMulKernel<<<dimGrid, dimBlock>>>(d_A, d_x, d_y, rows, cols);
// Copy the result back to the host
cudaMemcpy(h_y, d_y, rows * sizeof(float), cudaMemcpyDeviceToHost);
// Clean up
cudaFree(d_A);
cudaFree(d_x);
cudaFree(d_y);
}
int main() {
int rows = 4;
int cols = 4;
// Host input matrices
float h_A[] = {
1, 0, 1, 0,
0, 1, 0, 1,
1, 0, -1, 0,
0, 1, 0, -1};
float h_x[] = {1, 0, 0, 0};
float h_y[4];
// Matrix-vector multiplication
matVecMul(h_A, h_x, h_y, rows, cols);
// Print the result
for (int i = 0; i < rows; ++i) {
std::cout << h_y[i] << " ";
}
std::cout << std::endl;
return 0;
}
Tensor Product (A ⊗ B)
#include <cuda_runtime.h>
#include <iostream>
__global__ void tensorProductKernel(float* d_A, float* d_B, float* d_C, int aRows, int aCols, int bRows, int bCols) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
int totalRows = aRows * bRows;
int totalCols = aCols * bCols;
if (i < totalRows && j < totalCols) {
int rowA = i / bRows;
int colA = j / bCols;
int rowB = i % bRows;
int colB = j % bCols;
d_C[i * totalCols + j] = d_A[rowA * aCols + colA] * d_B[rowB * bCols + colB];
}
}
void tensorProduct(float* h_A, float* h_B, float* h_C, int aRows, int aCols, int bRows, int bCols) {
int aSize = aRows * aCols * sizeof(float);
int bSize = bRows * bCols * sizeof(float);
int cSize = aRows * bRows * aCols * bCols * sizeof(float);
float* d_A;
float* d_B;
float* d_C;
cudaMalloc((void**)&d_A, aSize);
cudaMalloc((void**)&d_B, bSize);
cudaMalloc((void**)&d_C, cSize);
cudaMemcpy(d_A, h_A, aSize, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, bSize, cudaMemcpyHostToDevice);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((aCols * bCols + threadsPerBlock.x - 1) / threadsPerBlock.x,
(aRows * bRows + threadsPerBlock.y - 1) / threadsPerBlock.y);
tensorProductKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, aRows, aCols, bRows, bCols);
cudaMemcpy(h_C, d_C, cSize, cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
int main() {
const int aRows = 2;
const int aCols = 2;
const int bRows = 2;
const int bCols = 2;
float h_A[aRows * aCols] = {1, 1, 1, -1};
float h_B[bRows * bCols] = {1, 0, 0, 1};
float h_C[aRows * bRows * aCols * bCols];
tensorProduct(h_A, h_B, h_C, aRows, aCols, bRows, bCols);
for (int i = 0; i < aRows * bRows; ++i) {
for (int j = 0; j < aCols * bCols; ++j) {
std::cout << h_C[i * aCols * bCols + j] << " ";
}
std::cout << "\n";
}
return 0;
}
Checkpoint notes
Neither of these are handling complex numbers, and I would like to pull them into more of a stdlib with a header, and change the prints to “tests”.
Source control
Creating a Github repository for Qcu.
Creating a ed25519 ssh key on Windows
ssh-keygen -t ed25519
Reminder: Windows
version of cat
is type
type .ssh\id_ed25519.pub
Creating a markdown → html pipeline
Ideally I would be able to convert these notes to HTML for my blog.
Created a simple script at Github, but is basically just Pandoc
#!/bin/bash
pandoc -o html/$1.html md/$1.md
cat html/header.html html/$1.html html/footer.html > html/tmp.html && mv html/tmp.html html/$1.html
Measuring quantum state vectors
Something is wrong with how I’m generating random numbers with CUDA. Will need to visit this later.
Maybe also with normalization? Not sure CUDA is handling the imaginary part correctly. Will also check this.
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <cuComplex.h>
#include <iostream>
#include <cmath>
// Error checking macro
#define cudaCheckError() { \
cudaError_t e=cudaGetLastError(); \
if(e!=cudaSuccess) { \
std::cerr << "Cuda failure " << __FILE__ << ":" << __LINE__; \
std::cerr << " '" << cudaGetErrorString(e) << "'\n"; \
exit(EXIT_FAILURE); \
} \
}
__global__ void setup_kernel(curandState *state, unsigned long long seed)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
curand_init(seed, idx, 0, &state[idx]);
}
__global__ void generate_random_numbers(curandState *state, float *randomNumbers)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Generate a random number using the curand_uniform function
float randNum = curand_uniform(&state[idx]);
// Store the random number in the output array
randomNumbers[idx] = randNum;
}
// Kernel to normalize the statevector
__global__ void normalize(cuFloatComplex* statevector, int len) {
__shared__ float norm;
if (threadIdx.x == 0) norm = 0.0f;
__syncthreads();
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < len) {
atomicAdd(&norm, cuCabsf(statevector[idx]) * cuCabsf(statevector[idx]));
}
__syncthreads();
if (threadIdx.x == 0) norm = sqrtf(norm);
__syncthreads();
if (idx < len) {
statevector[idx] = make_cuFloatComplex(cuCrealf(statevector[idx]) / norm, cuCimagf(statevector[idx]) / norm);
}
}
// Kernel to compute the probabilities
__global__ void compute_probabilities(float* probabilities, cuFloatComplex* statevector, int len) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < len) {
probabilities[idx] = cuCabsf(statevector[idx]) * cuCabsf(statevector[idx]);
}
}
// Kernel to measure the statevector
__global__ void measure(float *randomNumbers, int* result, float* probabilities, int len, unsigned long long seed) {
//int idx = blockIdx.x * blockDim.x + threadIdx.x;
//curandState state;
//curand_init(seed + idx, 0, 0, &state);
//float random_number = curand_uniform(&state);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// // Generate a random number using the curand_uniform function
float random_number = randomNumbers[idx];
float cumulative_probability = 0.0f;
for (int i = 0; i < len; ++i) {
cumulative_probability += probabilities[i];
if (random_number < cumulative_probability) {
*result = i;
break;
}
}
}
void checkNormalization(cuFloatComplex* statevector, int len) {
float norm = 0.0;
for (int i = 0; i < len; ++i) {
norm += cuCabsf(statevector[i]) * cuCabsf(statevector[i]);
}
if (fabs(norm - 1.0) > 1e-6) {
std::cerr << "Statevector must be normalized\n";
exit(EXIT_FAILURE);
}
}
int binaryStringToInt(const std::string& binaryStr) {
int result = 0;
for (char bit : binaryStr) {
result <<= 1; // shift left by 1 bit
result += (bit - '0'); // add the current bit
}
return result;
}
int test_measure() {
int num_qubits = 2;
int len = 1 << num_qubits;
cuFloatComplex h_statevector[] = {
make_cuFloatComplex(0.2, 0.0), make_cuFloatComplex(0.2, 0.0),
make_cuFloatComplex(0.6, 0.0), make_cuFloatComplex(0.2, 0.0)
};
// Calculate the norm of the statevector
float norm = 0.0f;
for (int i = 0; i < len; ++i) {
float real_part = cuCrealf(h_statevector[i]);
float imag_part = cuCimagf(h_statevector[i]);
norm += real_part * real_part + imag_part * imag_part;
}
norm = sqrtf(norm);
// Normalize the statevector
for (int i = 0; i < len; ++i) {
float real_part = cuCrealf(h_statevector[i]);
float imag_part = cuCimagf(h_statevector[i]);
h_statevector[i] = make_cuFloatComplex(real_part / norm, imag_part / norm);
}
// Verify normalization
float sum = 0.0f;
for (int i = 0; i < len; ++i) {
float mag = cuCabsf(h_statevector[i]);
sum += mag * mag;
}
// Device memory allocations
cuFloatComplex* d_statevector;
float* d_probabilities;
int* d_result;
// Allocate memory for random number generator states
curandState *d_states;
cudaMalloc(&d_statevector, len * sizeof(cuFloatComplex));
cudaMalloc(&d_probabilities, len * sizeof(float));
cudaMalloc(&d_result, sizeof(int));
cudaMalloc((void **)&d_states, len * sizeof(curandState));
// Copy statevector to device
cudaMemcpy(d_statevector, h_statevector, len * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
// Kernel launches
int blockSize = 256;
int gridSize = (len + blockSize - 1) / blockSize;
// Generate a random seed
unsigned long long seed = time(NULL);
// Setup the kernel with random states
setup_kernel<<<gridSize, blockSize>>>(d_states, seed);
cudaCheckError();
// Allocate memory for the random numbers generated
float *randomNumbers;
cudaMalloc((void **)&randomNumbers, len * sizeof(float));
// Generate random numbers using the initialized states
generate_random_numbers<<<gridSize, blockSize>>>(d_states, randomNumbers);
cudaCheckError();
normalize<<<gridSize, blockSize>>>(d_statevector, len);
cudaCheckError();
compute_probabilities<<<gridSize, blockSize>>>(d_probabilities, d_statevector, len);
cudaCheckError();
measure<<<1, 1>>>(randomNumbers, d_result, d_probabilities, len, time(NULL));
cudaCheckError();
// Copy the generated random numbers back to the host if needed
float *hostRandomNumbers = (float *)malloc(len * sizeof(float));
cudaMemcpy(hostRandomNumbers, randomNumbers, len * sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0; i < len; i++) {
std::cout << i << ": " << hostRandomNumbers[i] << std::endl;
}
// Copy result back to host
int h_result;
cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
// Convert result to bitstring
std::string bitstring;
for (int i = num_qubits - 1; i >= 0; --i) {
bitstring += ((h_result >> i) & 1) ? '1' : '0';
}
// Clean up
cudaFree(d_statevector);
cudaFree(d_probabilities);
cudaFree(d_result);
cudaFree(d_states);
cudaFree(randomNumbers);
free(hostRandomNumbers);
return binaryStringToInt(bitstring);
}
int main() {
int counts[] = {0, 0, 0, 0};
int shots = 1000;
for(int shot = 0; shot < shots; shot++) {
int result = test_measure();
counts[result] += 1;
}
std::cout << "00: " << counts[0] << " " << 1.0f * counts[0] / shots <<std::endl;
std::cout << "01: " << counts[1] << " " << 1.0f * counts[1] / shots <<std::endl;
std::cout << "10: " << counts[2] << " " << 1.0f * counts[2] / shots <<std::endl;
std::cout << "11: " << counts[3] << " " << 1.0f * counts[3] / shots <<std::endl;
}