From 5e65253e4134bafa78a04c65429a72c4ffa98cda Mon Sep 17 00:00:00 2001 From: Grigore Lupescu <grigore.lupescu@gmail.com> Date: Mon, 15 May 2023 22:04:40 +0300 Subject: [PATCH] Add 3-parallel-hashtable assignment --- assignments/3-parallel-hashtable/Makefile | 11 + assignments/3-parallel-hashtable/README.md | 1 + assignments/3-parallel-hashtable/bench.py | 94 +++++++ .../3-parallel-hashtable/src/gpu_hashtable.cu | 57 ++++ .../src/gpu_hashtable.hpp | 19 ++ assignments/3-parallel-hashtable/test_map.cu | 256 ++++++++++++++++++ assignments/3-parallel-hashtable/test_map.hpp | 36 +++ 7 files changed, 474 insertions(+) create mode 100644 assignments/3-parallel-hashtable/Makefile create mode 100644 assignments/3-parallel-hashtable/README.md create mode 100644 assignments/3-parallel-hashtable/bench.py create mode 100644 assignments/3-parallel-hashtable/src/gpu_hashtable.cu create mode 100644 assignments/3-parallel-hashtable/src/gpu_hashtable.hpp create mode 100644 assignments/3-parallel-hashtable/test_map.cu create mode 100644 assignments/3-parallel-hashtable/test_map.hpp diff --git a/assignments/3-parallel-hashtable/Makefile b/assignments/3-parallel-hashtable/Makefile new file mode 100644 index 0000000..7fe1e1c --- /dev/null +++ b/assignments/3-parallel-hashtable/Makefile @@ -0,0 +1,11 @@ +all: gpu_hashtable + +gpu_hashtable: src/*.cu + nvcc -O2 -g -std=c++11 -I. 
-I./src *.cu ./src/*.cu -o gpu_hashtable + +run: + python bench.py + +clean: + rm -rf gpu_hashtable + rm -rf slurm-* diff --git a/assignments/3-parallel-hashtable/README.md b/assignments/3-parallel-hashtable/README.md new file mode 100644 index 0000000..1333ed7 --- /dev/null +++ b/assignments/3-parallel-hashtable/README.md @@ -0,0 +1 @@ +TODO diff --git a/assignments/3-parallel-hashtable/bench.py b/assignments/3-parallel-hashtable/bench.py new file mode 100644 index 0000000..0dd1705 --- /dev/null +++ b/assignments/3-parallel-hashtable/bench.py @@ -0,0 +1,94 @@ +import traceback +import sys, os, subprocess, signal +import subprocess as sp + +testConfig = [ + { + "name" : "T1", + "entries" : 10 ** 6, + "chunks" : 2, + "speed" : 0, + "points" : 15 + }, + { + "name" : "T2", + "entries" : 2 * 10 ** 6, + "chunks" : 2, + "speed" : 20, + "points" : 15 + }, + { + "name" : "T3", + "entries" : 4 * 10 ** 6, + "chunks" : 4, + "speed" : 40, + "points" : 15 + }, + { + "name" : "T4", + "entries" : 8 * 10 ** 7, + "chunks" : 4, + "speed" : 50, + "points" : 15 + }, + { + "name" : "T5", + "entries" : 10 ** 8, + "chunks" : 2, + "speed" : 50, + "points" : 10 + }, + { + "name" : "T6", + "entries" : 10 ** 8, + "chunks" : 10, + "speed" : 50, + "points" : 10 + }, +] + +outfile = open('output', 'w') + +if len(sys.argv) > 1: + benchtypes = sys.argv[1:] +else: + benchtypes = ('0', '1') + +hwMaxPoints = 0 +hwPoints = 0 + +for testEntry in testConfig: + + testName = testEntry["name"] + testEntries = str(testEntry["entries"]) + testChunks = str(testEntry["chunks"]) + testSpeed = str(testEntry["speed"]) + testPoints = testEntry["points"] + hwMaxPoints += testPoints + + print "\n\n\n------- Test", testName, "START\t----------\n" + + try: + child = sp.Popen( ['./gpu_hashtable', testEntries, testChunks, testSpeed], stdout=sp.PIPE ) + output = child.communicate()[0] + lines = str(output).split("\n") + print( output ) + rc = child.poll() + if rc == 1: + print "------- Test", testName, "END\t---------- 
\t [ FAILED ]"
+            continue
+
+    except Exception:
+
+        print "------- Test", testName, "END\t---------- \t [ FAILED ]"
+        traceback.print_exc()
+        print "Error with", str(['./gpu_hashtable', testEntries, testChunks, testSpeed])
+        continue
+
+    print "------- Test ", testName, "END\t---------- \t [ OK RESULT: ", testPoints, " pts ]"
+    hwPoints = hwPoints + testPoints
+
+    print "\nTotal so far: ", hwPoints, "/", 80
+
+print "\nTotal:", hwPoints, "/", 80
+
diff --git a/assignments/3-parallel-hashtable/src/gpu_hashtable.cu b/assignments/3-parallel-hashtable/src/gpu_hashtable.cu
new file mode 100644
index 0000000..375aadc
--- /dev/null
+++ b/assignments/3-parallel-hashtable/src/gpu_hashtable.cu
@@ -0,0 +1,57 @@
+#include <iostream>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctime>
+#include <sstream>
+#include <string>
+#include "test_map.hpp"
+#include "gpu_hashtable.hpp"
+
+using namespace std;
+
+/*
+Allocate CUDA memory only through glbGpuAllocator
+cudaMalloc -> glbGpuAllocator->_cudaMalloc
+cudaMallocManaged -> glbGpuAllocator->_cudaMallocManaged
+cudaFree -> glbGpuAllocator->_cudaFree
+*/
+
+/**
+ * Function constructor GpuHashTable
+ * Performs init
+ * Example on using wrapper allocators _cudaMalloc and _cudaFree
+ */
+GpuHashTable::GpuHashTable(int size) {
+}
+
+/**
+ * Function destructor GpuHashTable
+ */
+GpuHashTable::~GpuHashTable() {
+}
+
+/**
+ * Function reshape
+ * Performs resize of the hashtable based on load factor
+ */
+void GpuHashTable::reshape(int numBucketsReshape) {
+	return;
+}
+
+/**
+ * Function insertBatch
+ * Inserts a batch of key:value, using GPU and wrapper allocators
+ */
+bool GpuHashTable::insertBatch(int *keys, int* values, int numKeys) {
+	return true;
+}
+
+/**
+ * Function getBatch
+ * Gets a batch of key:value, using GPU
+ */
+int* GpuHashTable::getBatch(int* keys, int numKeys) {
+	return 0;
+}
diff --git a/assignments/3-parallel-hashtable/src/gpu_hashtable.hpp
b/assignments/3-parallel-hashtable/src/gpu_hashtable.hpp new file mode 100644 index 0000000..f75e014 --- /dev/null +++ b/assignments/3-parallel-hashtable/src/gpu_hashtable.hpp @@ -0,0 +1,19 @@ +#ifndef _HASHCPU_ +#define _HASHCPU_ + +/** + * Class GpuHashTable to implement functions + */ +class GpuHashTable +{ + public: + GpuHashTable(int size); + void reshape(int sizeReshape); + + bool insertBatch(int *keys, int* values, int numKeys); + int* getBatch(int* key, int numItems); + + ~GpuHashTable(); +}; + +#endif diff --git a/assignments/3-parallel-hashtable/test_map.cu b/assignments/3-parallel-hashtable/test_map.cu new file mode 100644 index 0000000..18da244 --- /dev/null +++ b/assignments/3-parallel-hashtable/test_map.cu @@ -0,0 +1,256 @@ +#include <algorithm> +#include <array> +#include <chrono> +#include <errno.h> +#include <fstream> +#include <inttypes.h> +#include <iomanip> +#include <iostream> +#include <math.h> +#include <random> +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <vector> + +#include <cuda_runtime_api.h> +#include <cuda.h> + +#include "gpu_hashtable.hpp" +#include "test_map.hpp" + +using namespace std; +using namespace chrono; + +GpuAllocator *glbGpuAllocator = nullptr; + +GpuAllocator::GpuAllocator(long long int allocMax) { + this->allocMax = allocMax; + this->allocCurrent = 0; +} + +/** + * Wrapper _cudaMalloc + * @return cudaMalloc allocation if within bounds/limits + */ +cudaError_t GpuAllocator::_cudaMalloc( void** devPtr, size_t size ) { + + if ( (this->allocCurrent + size) <= this->allocMax ) { + cudaError_t rt = cudaMalloc(devPtr, size); + this->allocCurrent += size; + this->allocMap[ *devPtr ] = size; + return rt; + } else { + DIE(true, "cudaMalloc would exceed allowed max alloc size" ); + } +} + +/** + * Wrapper _cudaMallocManaged + * @return cudaMallocManaged allocation if within bounds/limits + */ +cudaError_t GpuAllocator::_cudaMallocManaged( void** devPtr, size_t 
size ) { + + if ( (this->allocCurrent + size) <= this->allocMax ) { + cudaError_t rt = cudaMallocManaged(devPtr, size); + this->allocCurrent += size; + this->allocMap[ *devPtr ] = size; + return rt; + } else { + DIE(true, "cudaMallocManaged would exceed allowed max alloc size" ); + } +} + +/** + * Wrapper _cudaFree + * @return cudaFree allocation if ptr found (allocated via wrapper) + */ +cudaError_t GpuAllocator::_cudaFree( void* devPtr ) { + + if (this->allocMap.find(devPtr) == this->allocMap.end()) { + return cudaErrorInvalidValue; + } else { + this->allocCurrent -= this->allocMap.find(devPtr)->second; + this->allocMap.erase(devPtr); + return cudaFree(devPtr); + } +} + +/** + * Wrapper _used + * @return current allocated memory via wrappers + */ +long long int GpuAllocator::_used( void ) { + return this->allocCurrent; +} + +void fillRandom(vector<int> &vecKeys, vector<int> &vecValues, int numEntries) { + vecKeys.reserve(numEntries); + vecValues.reserve(numEntries); + + int interval = (numeric_limits<int>::max() / numEntries) - 1; + default_random_engine generator; + uniform_int_distribution<int> distribution(1, interval); + + for(int i = 0; i < numEntries; i++) { + vecKeys.push_back(interval * i + distribution(generator)); + vecValues.push_back(interval * i + distribution(generator)); + } + + random_shuffle(vecKeys.begin(), vecKeys.end()); + random_shuffle(vecValues.begin(), vecValues.end()); +} + +/** + * Main description. 
+ * @param argv1 is total number of hash inserts/gets + * @param argv2 is number of sets, (total/nrsets) is a chunk + * @param argv3 is min performance measured in M/sec for INSERT/GET + * @return different than 0 means run has failed + */ +int main(int argc, char **argv) +{ + clock_t begin; + double elapsedTime; + + int numKeys = 0; + int numChunks = 0; + int minSpeed = 100; + vector<int> vecKeys; + vector<int> vecValues; + int *valuesGot = NULL; + + DIE(argc != 4, + "ERR, args num, call ./bin test_numKeys test_numChunks hash_speed"); + + numKeys = stoll(argv[1]); + DIE((numKeys < 1) || (numKeys >= numeric_limits<int>::max()), + "ERR, numKeys should be greater or equal to 1 and less than maxint"); + + numChunks = stoll(argv[2]); + DIE((numChunks < 1) || (numChunks >= numKeys), + "ERR, numChunks should be greater or equal to 1"); + + minSpeed = stoll(argv[3]); + DIE((minSpeed < 0) || (minSpeed >= 500), + "ERR, minSpeed should be between 0 and 500"); + + float loadFactorMin = 0.5f; + float loadFactorMax = 1.0f; + + float speedGet = 0.f; + float speedInsert = 0.f; + + fillRandom(vecKeys, vecValues, numKeys); + + glbGpuAllocator = new GpuAllocator(numKeys * sizeof(int) * 2 * 4); + + GpuHashTable gHashTable(1); + + int inserted = 0; + int chunkSize = numKeys / numChunks; + gHashTable.reshape(chunkSize); + + // perform INSERT and test performance + for(int chunkStart = 0; chunkStart < numKeys; chunkStart += chunkSize) { + + int *keysStart = &vecKeys[chunkStart]; + int *valuesStart = &vecValues[chunkStart]; + + auto start = high_resolution_clock::now(); + + // INSERT stage + gHashTable.insertBatch(keysStart, valuesStart, chunkSize); + inserted += chunkSize; + + auto stop = high_resolution_clock::now(); + elapsedTime = duration_cast<microseconds>(stop - start).count(); + + float speed = chunkSize / elapsedTime; + float hashLoadFactor = (float) inserted * sizeof(int) * 2.f / glbGpuAllocator->_used(); + + // check load factor + DIE( loadFactorMin > hashLoadFactor, 
"loadFactorMin > hashLoadFactor");
+		DIE( loadFactorMax < hashLoadFactor, "loadFactorMax < hashLoadFactor");
+
+		cout << setw(20) << left << "HASH_BATCH_INSERT"
+			<< setw(24) << left << "count: " + to_string(chunkSize)
+			<< setw(24) << left << "speed: " + to_string( (int)speed ) + "M/sec"
+			<< setw(24) << left
+			<< "loadfactor: " + to_string( (int)(hashLoadFactor * 100.f) ) + "%" << endl;
+
+		speedInsert += speed;
+	}
+
+
+	// perform INSERT for update validation
+	int chunkSizeUpdate = min(64, numKeys);
+	for(int chunkStart = 0; chunkStart < chunkSizeUpdate; chunkStart++) {
+		vecValues[chunkStart] += 1111111 + chunkStart;
+	}
+
+	gHashTable.insertBatch(&vecKeys[0], &vecValues[0], chunkSizeUpdate);
+
+	// perform GET and test performance
+	for(int chunkStart = 0; chunkStart < numKeys; chunkStart += chunkSize) {
+
+		int *keysStart = &vecKeys[chunkStart];
+
+		auto start = high_resolution_clock::now();
+
+		// GET stage
+		valuesGot = gHashTable.getBatch(keysStart, chunkSize);
+
+		auto stop = high_resolution_clock::now();
+		elapsedTime = duration_cast<microseconds>(stop - start).count();
+
+		float speed = chunkSize / elapsedTime;
+		float hashLoadFactor = (float) inserted * sizeof(int) * 2.f / glbGpuAllocator->_used();
+
+		cout << setw(20) << left << "HASH_BATCH_GET"
+			<< setw(24) << left << "count: " + to_string(chunkSize)
+			<< setw(24) << left << "speed: " + to_string( (int)speed ) + "M/sec"
+			<< setw(24) << left
+			<< "loadfactor: " + to_string( (int)(hashLoadFactor * 100.f) ) + "%" << endl;
+
+		// check load factor
+		DIE( loadFactorMin > hashLoadFactor, "loadFactorMin > hashLoadFactor" );
+		DIE( loadFactorMax < hashLoadFactor, "loadFactorMax < hashLoadFactor" );
+		DIE( valuesGot == NULL, "ERR, ptr valuesCheck cannot be NULL" );
+
+		speedGet += speed;
+
+		int mismatches = 0;
+		for(int i = 0; i < chunkSize; i++) {
+			if(vecValues[chunkStart + i] != valuesGot[i]) {
+				mismatches++;
+				if(mismatches < 32) {
+					cout << "Expected " << vecValues[chunkStart + i]
+						<< ", but got " << valuesGot[i] << " for key:" << keysStart[i] << endl;
+				}
+			}
+		}
+
+		if(mismatches > 0) {
+			cout << "ERR, mismatches: " << mismatches << " / " << numKeys << endl;
+			exit(1);
+		}
+	}
+
+	float avgSpeedInsert = speedInsert / numChunks;
+	float avgSpeedGet = speedGet / numChunks;
+
+	cout << "----------------------------------------------" << endl;
+	cout << setw(24) << left << "AVG_INSERT: " + to_string( (int)avgSpeedInsert ) + " M/sec,"
+		<< setw(24) << left<< "AVG_GET: " + to_string( (int)avgSpeedGet ) + " M/sec,"
+		<< setw(24) << left<< "MIN_SPEED_REQ: " + to_string(minSpeed) + " M/sec" << endl;
+
+	DIE( minSpeed > avgSpeedGet, "minSpeed > avgSpeedGet" );
+	DIE( minSpeed > avgSpeedInsert, "minSpeed > avgSpeedInsert" );
+
+	cout << endl;
+
+	return 0;
+}
diff --git a/assignments/3-parallel-hashtable/test_map.hpp b/assignments/3-parallel-hashtable/test_map.hpp
new file mode 100644
index 0000000..b1110d2
--- /dev/null
+++ b/assignments/3-parallel-hashtable/test_map.hpp
@@ -0,0 +1,36 @@
+#include <unordered_map>
+
+using namespace std;
+
+#define MB_1 (1024 * 1024)
+
+#define KEY_INVALID 0
+#define DIE(assertion, call_description) \
+	do { \
+		if (assertion) { \
+			fprintf(stderr, "(%s, %d): ", \
+					__FILE__, __LINE__); \
+			perror(call_description); \
+			exit(1); \
+		} \
+} while (0)
+
+class GpuAllocator
+{
+	long long int allocMax;
+	long long int allocCurrent;
+
+	std::unordered_map<void*, int> allocMap;
+
+public:
+	GpuAllocator(long long int allocMax);
+
+	cudaError_t _cudaMalloc( void** devPtr, size_t size );
+	cudaError_t _cudaMallocManaged( void** devPtr, size_t size );
+	cudaError_t _cudaFree( void* devPtr );
+
+	float _loadFactor( void );
+	long long int _used( void );
+};
+
+extern GpuAllocator *glbGpuAllocator;
-- 
GitLab