Personal blog of James Bryan Graves
Computer science (CS), quantum information science (QIS),
& software development & engineering (SDE)

2024-06-11 QIS/QCS Simulator in Nvidia CUDA, p.2 (en_US)

Working on a quantum universal gate simulator for educational purposes, non-production ready, designed to run on Nvidia hardware via CUDA.

Nvidia also has the CUDA-Q project, something I’m excited to dig into, but I’m more interested in practicing CUDA, quantum computing, & linear algebra from a more raw perspective.

Source code will be developed on GitHub, with some of the process documented on this blog.

Pronounced “Q Coo”, like “cuckoo” (will create a cute bird logo later). It is a POC, non-production, quantum universal gate simulator designed to run on Nvidia hardware via CUDA. Mainly to assist in educating myself on quantum computer simulation.

Why Nvidia?

I saw Nvidia surpassed Apple in market cap?! Maybe not a bad time to do linear algebra on Nvidia hardware.

I also really have a ♥ affair with my GPU (RTX 3070), which I acquired during the pandemic & supply chain crisis, from a scalper, to whom I drove 2 hours, & paid way too much money (€1500).

Matrix / vector multiplication (y = A * x)

Made some progress yesterday with the following.


#include <iostream>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

// CUDA kernel for matrix-vector multiplication
// CUDA kernel for the matrix-vector product y = A * x (A row-major).
// One thread per output row; the launch uses the y dimension of the
// grid/block, so the global y index selects the row. Threads past the
// last row return immediately (grid tail guard).
__global__ void matVecMulKernel(float *A, float *x, float *y, int rows, int cols) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    if (r >= rows) return;

    float acc = 0.0f;
    for (int c = 0; c < cols; ++c) {
        acc += A[r * cols + c] * x[c];
    }
    y[r] = acc;
}

// Host wrapper for y = A * x on the device (A row-major, rows x cols).
// Copies A and x to the device, launches matVecMulKernel, and copies y back.
// On any CUDA error it reports to stderr and returns early, leaving h_y
// unmodified; previously every API status was silently discarded.
void matVecMul(float *h_A, float *h_x, float *h_y, int rows, int cols) {
    float *d_A = nullptr, *d_x = nullptr, *d_y = nullptr;

    // Every CUDA API call returns a status; ignoring it hides failures and a
    // sticky error makes every later call fail mysteriously.
    auto ok = [](cudaError_t e, const char *what) -> bool {
        if (e != cudaSuccess) {
            std::cerr << what << ": " << cudaGetErrorString(e) << "\n";
            return false;
        }
        return true;
    };

    // size_t arithmetic so large matrices cannot overflow int byte counts.
    const size_t aBytes = static_cast<size_t>(rows) * cols * sizeof(float);
    const size_t xBytes = static_cast<size_t>(cols) * sizeof(float);
    const size_t yBytes = static_cast<size_t>(rows) * sizeof(float);

    if (!ok(cudaMalloc((void**)&d_A, aBytes), "cudaMalloc d_A") ||
        !ok(cudaMalloc((void**)&d_x, xBytes), "cudaMalloc d_x") ||
        !ok(cudaMalloc((void**)&d_y, yBytes), "cudaMalloc d_y")) {
        // cudaFree(nullptr) is a no-op, so unconditional cleanup is safe.
        cudaFree(d_A); cudaFree(d_x); cudaFree(d_y);
        return;
    }

    if (ok(cudaMemcpy(d_A, h_A, aBytes, cudaMemcpyHostToDevice), "copy A") &&
        ok(cudaMemcpy(d_x, h_x, xBytes, cudaMemcpyHostToDevice), "copy x")) {
        // One thread per row on the y axis; ceil-div covers the grid tail.
        dim3 dimBlock(1, BLOCK_SIZE);
        dim3 dimGrid(1, (rows + BLOCK_SIZE - 1) / BLOCK_SIZE);

        matVecMulKernel<<<dimGrid, dimBlock>>>(d_A, d_x, d_y, rows, cols);
        // Kernel launches do not return a status; bad configs only surface
        // through cudaGetLastError().
        if (ok(cudaGetLastError(), "kernel launch")) {
            // This blocking copy also synchronizes with the kernel.
            ok(cudaMemcpy(h_y, d_y, yBytes, cudaMemcpyDeviceToHost), "copy y");
        }
    }

    cudaFree(d_A);
    cudaFree(d_x);
    cudaFree(d_y);
}

// Demo driver: multiply a fixed 4x4 matrix by the basis vector (1,0,0,0)
// and print the resulting vector on one line.
int main() {
    const int rows = 4;
    const int cols = 4;

    // Row-major 4x4 host matrix.
    float h_A[] = {
        1,  0,  1,  0,
        0,  1,  0,  1,
        1,  0, -1,  0,
        0,  1,  0, -1};
    float h_x[] = {1, 0, 0, 0};  // picks out the first column of A
    float h_y[4];

    // y = A * x, computed on the GPU.
    matVecMul(h_A, h_x, h_y, rows, cols);

    for (int i = 0; i < rows; ++i) {
        std::cout << h_y[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}

Tensor Product (A ⊗ B)


#include <cuda_runtime.h>
#include <iostream>

// Kernel: one thread per element of C = A ⊗ B (Kronecker product).
// 2D launch: the global x index is a column of C, the global y index a row.
// All matrices are row-major; C is (aRows*bRows) x (aCols*bCols).
__global__ void tensorProductKernel(float* d_A, float* d_B, float* d_C, int aRows, int aCols, int bRows, int bCols) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    const int cRows = aRows * bRows;
    const int cCols = aCols * bCols;
    if (row >= cRows || col >= cCols) return;  // grid tail guard

    // C[i][j] = A[i / bRows][j / bCols] * B[i % bRows][j % bCols]
    const float a = d_A[(row / bRows) * aCols + (col / bCols)];
    const float b = d_B[(row % bRows) * bCols + (col % bCols)];
    d_C[row * cCols + col] = a * b;
}

// Host wrapper for the Kronecker product C = A ⊗ B (all row-major).
// C has (aRows*bRows) x (aCols*bCols) elements. On any CUDA error the
// function reports to stderr and returns early, leaving h_C unmodified;
// previously every API status was silently discarded.
void tensorProduct(float* h_A, float* h_B, float* h_C, int aRows, int aCols, int bRows, int bCols) {
    // size_t arithmetic so the four-way product cannot overflow int.
    const size_t aSize = static_cast<size_t>(aRows) * aCols * sizeof(float);
    const size_t bSize = static_cast<size_t>(bRows) * bCols * sizeof(float);
    const size_t cSize = static_cast<size_t>(aRows) * bRows * aCols * bCols * sizeof(float);

    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;

    // Every CUDA API call returns a status; a sticky error makes all later
    // calls fail mysteriously if it goes unchecked.
    auto ok = [](cudaError_t e, const char* what) -> bool {
        if (e != cudaSuccess) {
            std::cerr << what << ": " << cudaGetErrorString(e) << "\n";
            return false;
        }
        return true;
    };

    if (!ok(cudaMalloc((void**)&d_A, aSize), "cudaMalloc d_A") ||
        !ok(cudaMalloc((void**)&d_B, bSize), "cudaMalloc d_B") ||
        !ok(cudaMalloc((void**)&d_C, cSize), "cudaMalloc d_C")) {
        // cudaFree(nullptr) is a no-op, so unconditional cleanup is safe.
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        return;
    }

    if (ok(cudaMemcpy(d_A, h_A, aSize, cudaMemcpyHostToDevice), "copy A") &&
        ok(cudaMemcpy(d_B, h_B, bSize, cudaMemcpyHostToDevice), "copy B")) {
        // One thread per element of C; ceil-div covers the grid tail.
        dim3 threadsPerBlock(16, 16);
        dim3 numBlocks((aCols * bCols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (aRows * bRows + threadsPerBlock.y - 1) / threadsPerBlock.y);

        tensorProductKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, aRows, aCols, bRows, bCols);
        // Launch-config errors only surface via cudaGetLastError().
        if (ok(cudaGetLastError(), "kernel launch")) {
            // Blocking copy doubles as synchronization with the kernel.
            ok(cudaMemcpy(h_C, d_C, cSize, cudaMemcpyDeviceToHost), "copy C");
        }
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

// Demo driver: compute a 2x2 ⊗ 2x2 Kronecker product on the GPU and print
// the resulting 4x4 matrix row by row.
int main() {
    const int aRows = 2;
    const int aCols = 2;
    const int bRows = 2;
    const int bCols = 2;

    float h_A[aRows * aCols] = {1, 1, 1, -1};  // unnormalized Hadamard-style matrix
    float h_B[bRows * bCols] = {1, 0, 0, 1};   // 2x2 identity
    float h_C[aRows * bRows * aCols * bCols];

    tensorProduct(h_A, h_B, h_C, aRows, aCols, bRows, bCols);

    const int cRows = aRows * bRows;
    const int cCols = aCols * bCols;
    for (int r = 0; r < cRows; ++r) {
        for (int c = 0; c < cCols; ++c) {
            std::cout << h_C[r * cCols + c] << " ";
        }
        std::cout << "\n";
    }

    return 0;
}

Checkpoint notes

Neither of these handles complex numbers yet, and I would like to pull them into more of a stdlib with a header, and change the prints into “tests”.

Source control

Creating a Github repository for Qcu.

Creating an ed25519 SSH key on Windows

    ssh-keygen -t ed25519

Reminder: Windows version of cat is type

    type .ssh\id_ed25519.pub

Creating a markdown → html pipeline

Ideally I would be able to convert these notes to HTML for my blog.

Created a simple script on GitHub, but it is basically just Pandoc:


#!/bin/bash
# Convert md/<name>.md to html/<name>.html with Pandoc, then wrap the body
# with the shared header/footer fragments.
# Usage: ./script.sh <name>   (page name, no extension)
set -euo pipefail   # abort on the first failed step or unset argument

pandoc -o "html/$1.html" "md/$1.md"

# Concatenate into a temp file first so a failure cannot clobber the output.
cat html/header.html "html/$1.html" html/footer.html > html/tmp.html && mv html/tmp.html "html/$1.html"

Measuring quantum state vectors

Something is wrong with how I’m generating random numbers with CUDA. Will need to visit this later.

Maybe also with normalization? Not sure CUDA is handling the imaginary part correctly. Will also check this.


#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <cuComplex.h>
#include <iostream>
#include <cmath>

// Error checking macro
// Error checking macro: report and abort if a CUDA error is pending.
// Wrapped in do { } while (0) so the expansion is a single statement; the
// previous bare-block form (`{...}` followed by the call site's `;`) breaks
// inside un-braced if/else. Call sites already end with `;`, so this change
// is drop-in compatible.
// Note: cudaGetLastError() reports launch/config errors immediately, but
// in-kernel faults only surface at the next synchronizing call.
#define cudaCheckError() do {                                        \
    cudaError_t e=cudaGetLastError();                                \
    if(e!=cudaSuccess) {                                             \
        std::cerr << "Cuda failure " << __FILE__ << ":" << __LINE__; \
        std::cerr << " '" << cudaGetErrorString(e) << "'\n";         \
        exit(EXIT_FAILURE);                                          \
    }                                                                \
} while (0)

// Initialize one curand state per thread from a shared seed; each thread
// uses its global id as the subsequence, giving independent streams.
// NOTE(review): no bounds guard — the caller must allocate one curandState
// for EVERY launched thread (gridDim.x * blockDim.x), not just one per
// data element.
__global__ void setup_kernel(curandState *state, unsigned long long seed)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    curand_init(seed, tid, 0, &state[tid]);
}

// Draw one uniform random float per thread from its curand state.
// NOTE(review): no bounds guard — the caller must size both `state` and
// `randomNumbers` for every launched thread (gridDim.x * blockDim.x).
__global__ void generate_random_numbers(curandState *state, float *randomNumbers)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // curand_uniform returns a value in (0, 1]; the state advances in place.
    randomNumbers[tid] = curand_uniform(&state[tid]);
}

// Kernel to normalize the statevector
__global__ void normalize(cuFloatComplex* statevector, int len) {
    __shared__ float norm;
    if (threadIdx.x == 0) norm = 0.0f;
    __syncthreads();

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < len) {
        atomicAdd(&norm, cuCabsf(statevector[idx]) * cuCabsf(statevector[idx]));
    }
    __syncthreads();

    if (threadIdx.x == 0) norm = sqrtf(norm);
    __syncthreads();

    if (idx < len) {
        statevector[idx] = make_cuFloatComplex(cuCrealf(statevector[idx]) / norm, cuCimagf(statevector[idx]) / norm);
    }
}

// Kernel to compute the probabilities
__global__ void compute_probabilities(float* probabilities, cuFloatComplex* statevector, int len) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < len) {
        probabilities[idx] = cuCabsf(statevector[idx]) * cuCabsf(statevector[idx]);
    }
}

// Kernel to measure the statevector
__global__ void measure(float *randomNumbers, int* result, float* probabilities, int len, unsigned long long seed) {
    //int idx = blockIdx.x * blockDim.x + threadIdx.x;
    //curandState state;
    //curand_init(seed + idx, 0, 0, &state);
    //float random_number = curand_uniform(&state);

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    // // Generate a random number using the curand_uniform function
    float random_number = randomNumbers[idx];

    float cumulative_probability = 0.0f;
    for (int i = 0; i < len; ++i) {
        cumulative_probability += probabilities[i];
        if (random_number < cumulative_probability) {
            *result = i;
            break;
        }
    }
}

// Host-side sanity check: the squared magnitudes of the statevector must
// sum to 1 within tolerance; exits the process otherwise.
void checkNormalization(cuFloatComplex* statevector, int len) {
    float total = 0.0;
    for (int i = 0; i < len; ++i) {
        const float mag = cuCabsf(statevector[i]);
        total += mag * mag;
    }
    // Compare in double precision against a 1e-6 tolerance.
    if (fabs(total - 1.0) > 1e-6) {
        std::cerr << "Statevector must be normalized\n";
        exit(EXIT_FAILURE);
    }
}

// Parse a string of '0'/'1' characters (most significant bit first) into
// the integer it encodes, e.g. "101" -> 5.
int binaryStringToInt(const std::string& binaryStr) {
    int value = 0;
    for (std::size_t i = 0; i < binaryStr.size(); ++i) {
        value = (value << 1) + (binaryStr[i] - '0');
    }
    return value;
}

// One complete measurement "shot": build a fixed 2-qubit statevector,
// normalize it, compute Born-rule probabilities on the device, and sample
// one basis state. Returns the sampled index in [0, 2^num_qubits).
//
// Fixes versus the earlier draft (the "something is wrong with my random
// numbers" symptom):
//  * time(NULL) only changes once per second, so every shot within that
//    second re-seeded curand identically and produced the same outcome.
//    The seed now mixes in a static per-call counter.
//  * d_states / the random-number buffer were sized for `len` (= 4)
//    elements, but setup_kernel and generate_random_numbers launch
//    gridSize * blockSize (= 256) threads with no bounds check — an
//    out-of-bounds device write. Both are now sized per launched thread.
//  * The dead host-side "verify normalization" loop (its sum was never
//    read) is replaced with a call to checkNormalization, and the per-shot
//    debug dump of random numbers is removed.
int test_measure() {
    int num_qubits = 2;
    int len = 1 << num_qubits;  // statevector length = 2^num_qubits

    // Unnormalized amplitudes (real parts only).
    cuFloatComplex h_statevector[] = {
        make_cuFloatComplex(0.2, 0.0), make_cuFloatComplex(0.2, 0.0),
        make_cuFloatComplex(0.6, 0.0), make_cuFloatComplex(0.2, 0.0)
    };

    // Normalize on the host so the probabilities sum to 1.
    float norm = 0.0f;
    for (int i = 0; i < len; ++i) {
        float real_part = cuCrealf(h_statevector[i]);
        float imag_part = cuCimagf(h_statevector[i]);
        norm += real_part * real_part + imag_part * imag_part;
    }
    norm = sqrtf(norm);
    for (int i = 0; i < len; ++i) {
        h_statevector[i] = make_cuFloatComplex(cuCrealf(h_statevector[i]) / norm,
                                               cuCimagf(h_statevector[i]) / norm);
    }
    // Exits the process if normalization failed.
    checkNormalization(h_statevector, len);

    // Launch configuration first, so allocations can match the thread count.
    int blockSize = 256;
    int gridSize = (len + blockSize - 1) / blockSize;
    int numThreads = gridSize * blockSize;  // every launched thread writes its slot

    // Device allocations.
    cuFloatComplex* d_statevector;
    float* d_probabilities;
    int* d_result;
    curandState* d_states;
    float* d_randomNumbers;

    cudaMalloc(&d_statevector, len * sizeof(cuFloatComplex));
    cudaMalloc(&d_probabilities, len * sizeof(float));
    cudaMalloc(&d_result, sizeof(int));
    // Sized per launched thread, NOT per amplitude (see header comment).
    cudaMalloc((void **)&d_states, numThreads * sizeof(curandState));
    cudaMalloc((void **)&d_randomNumbers, numThreads * sizeof(float));
    cudaCheckError();

    cudaMemcpy(d_statevector, h_statevector, len * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
    cudaCheckError();

    // time(NULL) has 1-second resolution; mix in a per-call counter so each
    // shot gets a distinct curand seed.
    static unsigned long long shot_counter = 0;
    unsigned long long seed =
        (unsigned long long)time(NULL) * 1000003ULL + (++shot_counter);

    setup_kernel<<<gridSize, blockSize>>>(d_states, seed);
    cudaCheckError();

    generate_random_numbers<<<gridSize, blockSize>>>(d_states, d_randomNumbers);
    cudaCheckError();

    normalize<<<gridSize, blockSize>>>(d_statevector, len);
    cudaCheckError();

    compute_probabilities<<<gridSize, blockSize>>>(d_probabilities, d_statevector, len);
    cudaCheckError();

    // Single thread scans the cumulative distribution (len is tiny).
    measure<<<1, 1>>>(d_randomNumbers, d_result, d_probabilities, len, seed);
    cudaCheckError();

    // Blocking copy also synchronizes with the measure kernel.
    int h_result;
    cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckError();

    // Round-trip through the bitstring form (kept from the original draft;
    // a future "counts by bitstring" API will want it anyway).
    std::string bitstring;
    for (int i = num_qubits - 1; i >= 0; --i) {
        bitstring += ((h_result >> i) & 1) ? '1' : '0';
    }

    cudaFree(d_statevector);
    cudaFree(d_probabilities);
    cudaFree(d_result);
    cudaFree(d_states);
    cudaFree(d_randomNumbers);

    return binaryStringToInt(bitstring);
}


// Run many measurement shots and print the empirical frequency of each
// 2-qubit outcome; these should approach the statevector's probabilities.
int main() {
    const int shots = 1000;
    int counts[] = {0, 0, 0, 0};

    for (int s = 0; s < shots; ++s) {
        ++counts[test_measure()];
    }

    const char* labels[] = {"00", "01", "10", "11"};
    for (int k = 0; k < 4; ++k) {
        std::cout << labels[k] << ": " << counts[k] << " "
                  << 1.0f * counts[k] / shots << std::endl;
    }
}