From 5e65253e4134bafa78a04c65429a72c4ffa98cda Mon Sep 17 00:00:00 2001
From: Grigore Lupescu <grigore.lupescu@gmail.com>
Date: Mon, 15 May 2023 22:04:40 +0300
Subject: [PATCH] Add 3-parallel-hashtable assignment

---
 assignments/3-parallel-hashtable/Makefile     |  11 +
 assignments/3-parallel-hashtable/README.md    |   1 +
 assignments/3-parallel-hashtable/bench.py     |  94 +++++++
 .../3-parallel-hashtable/src/gpu_hashtable.cu |  57 ++++
 .../src/gpu_hashtable.hpp                     |  19 ++
 assignments/3-parallel-hashtable/test_map.cu  | 256 ++++++++++++++++++
 assignments/3-parallel-hashtable/test_map.hpp |  36 +++
 7 files changed, 474 insertions(+)
 create mode 100644 assignments/3-parallel-hashtable/Makefile
 create mode 100644 assignments/3-parallel-hashtable/README.md
 create mode 100644 assignments/3-parallel-hashtable/bench.py
 create mode 100644 assignments/3-parallel-hashtable/src/gpu_hashtable.cu
 create mode 100644 assignments/3-parallel-hashtable/src/gpu_hashtable.hpp
 create mode 100644 assignments/3-parallel-hashtable/test_map.cu
 create mode 100644 assignments/3-parallel-hashtable/test_map.hpp

diff --git a/assignments/3-parallel-hashtable/Makefile b/assignments/3-parallel-hashtable/Makefile
new file mode 100644
index 0000000..7fe1e1c
--- /dev/null
+++ b/assignments/3-parallel-hashtable/Makefile
@@ -0,0 +1,11 @@
+all: gpu_hashtable
+
+gpu_hashtable: src/*.cu src/*.hpp test_map.cu test_map.hpp
+	nvcc -O2 -g -std=c++11 -I. -I./src *.cu ./src/*.cu -o gpu_hashtable
+
+run:
+	python bench.py
+
+clean:
+	rm -f gpu_hashtable
+	rm -f slurm-*
diff --git a/assignments/3-parallel-hashtable/README.md b/assignments/3-parallel-hashtable/README.md
new file mode 100644
index 0000000..1333ed7
--- /dev/null
+++ b/assignments/3-parallel-hashtable/README.md
@@ -0,0 +1 @@
+TODO
diff --git a/assignments/3-parallel-hashtable/bench.py b/assignments/3-parallel-hashtable/bench.py
new file mode 100644
index 0000000..0dd1705
--- /dev/null
+++ b/assignments/3-parallel-hashtable/bench.py
@@ -0,0 +1,94 @@
+import traceback
+import sys, os, subprocess, signal
+import subprocess as sp
+
+testConfig = [
+    {
+        "name" : "T1",
+        "entries" : 10 ** 6,
+        "chunks" : 2,
+        "speed" : 0,
+        "points" : 15
+    },
+    {
+        "name" : "T2",
+        "entries" : 2 * 10 ** 6,
+        "chunks" : 2,
+        "speed" : 20,
+        "points" : 15
+    },
+    {
+        "name" : "T3",
+        "entries" : 4 * 10 ** 6,
+        "chunks" : 4,
+        "speed" : 40,
+        "points" : 15
+    },
+    {
+        "name" : "T4",
+        "entries" : 8 * 10 ** 7,
+        "chunks" : 4,
+        "speed" : 50,
+        "points" : 15
+     },
+     {
+        "name" : "T5",
+        "entries" : 10 ** 8,
+        "chunks" : 2,
+        "speed" : 50,
+        "points" : 10
+     },
+     {
+        "name" : "T6",
+        "entries" : 10 ** 8,
+        "chunks" : 10,
+        "speed" : 50,
+        "points" : 10
+     },
+]
+
+outfile = open('output', 'w')
+
+if len(sys.argv) > 1:
+    benchtypes = sys.argv[1:]
+else:
+    benchtypes = ('0', '1')
+
+hwMaxPoints = 0
+hwPoints = 0
+
+for testEntry in testConfig:
+
+    testName = testEntry["name"]
+    testEntries = str(testEntry["entries"])
+    testChunks = str(testEntry["chunks"])
+    testSpeed = str(testEntry["speed"])
+    testPoints = testEntry["points"]
+    hwMaxPoints += testPoints
+
+    print "\n\n\n------- Test", testName, "START\t----------\n" 
+
+    try:
+        child = sp.Popen( ['./gpu_hashtable', testEntries, testChunks, testSpeed], stdout=sp.PIPE )
+        output = child.communicate()[0]
+        lines = str(output).split("\n")
+        print( output )
+        rc = child.poll()
+        if rc == 1:
+            print "------- Test", testName, "END\t---------- \t [ FAILED ]"
+            continue
+
+    except Exception:
+
+        print "------- Test", testName, "END\t---------- \t [ FAILED ]"
+        traceback.print_exc()
+        print "Error with",  str(['./src/gpu_hashtable', testEntries, testChunks, testSpeed])
+        continue
+
+    print "------- Test ", testName, "END\t---------- \t [ OK RESULT: ", testPoints, " pts ]"
+    hwPoints = hwPoints + testPoints
+
+    print "\nTotal so far: ", hwPoints, "/",  80
+
+print "\nTotal:", hwPoints, "/",  80
+
diff --git a/assignments/3-parallel-hashtable/src/gpu_hashtable.cu b/assignments/3-parallel-hashtable/src/gpu_hashtable.cu
new file mode 100644
index 0000000..375aadc
--- /dev/null
+++ b/assignments/3-parallel-hashtable/src/gpu_hashtable.cu
@@ -0,0 +1,57 @@
+#include <iostream>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctime>
+#include <sstream>
+#include <string>
+#include "test_map.hpp"
+#include "gpu_hashtable.hpp"
+
+using namespace std;
+
+/*
+Allocate CUDA memory only through glbGpuAllocator
+cudaMalloc -> glbGpuAllocator->_cudaMalloc
+cudaMallocManaged -> glbGpuAllocator->_cudaMallocManaged
+cudaFree -> glbGpuAllocator->_cudaFree
+*/
+
+/**
+ * Function constructor GpuHashTable
+ * Performs init (stub in this skeleton: allocates nothing yet)
+ * Example on using wrapper allocators _cudaMalloc and _cudaFree
+ */
+GpuHashTable::GpuHashTable(int size) {
+}
+
+/**
+ * Function destructor GpuHashTable (stub: nothing to release yet)
+ */
+GpuHashTable::~GpuHashTable() {
+}
+
+/**
+ * Function reshape
+ * Performs resize of the hashtable based on load factor (stub: no-op)
+ */
+void GpuHashTable::reshape(int numBucketsReshape) {
+	return;
+}
+
+/**
+ * Function insertBatch
+ * Inserts a batch of key:value pairs (stub: stores nothing, reports success)
+ */
+bool GpuHashTable::insertBatch(int *keys, int* values, int numKeys) {
+	return true;
+}
+
+/**
+ * Function getBatch
+ * Gets a batch of key:value, using GPU (stub: returns 0/NULL)
+ */
+int* GpuHashTable::getBatch(int* keys, int numKeys) {
+	return 0;
+}
diff --git a/assignments/3-parallel-hashtable/src/gpu_hashtable.hpp b/assignments/3-parallel-hashtable/src/gpu_hashtable.hpp
new file mode 100644
index 0000000..f75e014
--- /dev/null
+++ b/assignments/3-parallel-hashtable/src/gpu_hashtable.hpp
@@ -0,0 +1,19 @@
+#ifndef GPU_HASHTABLE_HPP_
+#define GPU_HASHTABLE_HPP_
+
+/**
+ * Class GpuHashTable to implement functions
+ */
+class GpuHashTable
+{
+	public:
+		GpuHashTable(int size);
+		void reshape(int sizeReshape);
+
+		bool insertBatch(int *keys, int* values, int numKeys);
+		int* getBatch(int *keys, int numKeys);
+
+		~GpuHashTable();
+};
+
+#endif // GPU_HASHTABLE_HPP_
diff --git a/assignments/3-parallel-hashtable/test_map.cu b/assignments/3-parallel-hashtable/test_map.cu
new file mode 100644
index 0000000..18da244
--- /dev/null
+++ b/assignments/3-parallel-hashtable/test_map.cu
@@ -0,0 +1,256 @@
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <errno.h>
+#include <fstream>
+#include <inttypes.h>
+#include <iomanip>
+#include <iostream>
+#include <math.h>
+#include <random>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <vector>
+
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+
+#include "gpu_hashtable.hpp"
+#include "test_map.hpp"
+
+using namespace std;
+using namespace chrono;
+
+GpuAllocator *glbGpuAllocator = nullptr;
+
+// Budgeted allocator: allocCurrent tracks bytes handed out via the wrappers.
+GpuAllocator::GpuAllocator(long long int allocMax)
+	: allocMax(allocMax), allocCurrent(0) {
+}
+
+/**
+ * Wrapper _cudaMalloc
+ * @return cudaMalloc result if within the allocation budget; aborts otherwise
+ */
+cudaError_t GpuAllocator::_cudaMalloc( void** devPtr, size_t size ) {
+
+	// guard first so every control path returns a value (no fallthrough)
+	DIE( (this->allocCurrent + size) > this->allocMax,
+		"cudaMalloc would exceed allowed max alloc size" );
+
+	cudaError_t rt = cudaMalloc(devPtr, size);
+	this->allocCurrent += size;
+	this->allocMap[ *devPtr ] = size;
+	return rt;
+}
+
+/**
+ * Wrapper _cudaMallocManaged
+ * @return cudaMallocManaged result if within the budget; aborts otherwise
+ */
+cudaError_t GpuAllocator::_cudaMallocManaged( void** devPtr, size_t size ) {
+
+	// guard first so every control path returns a value (no fallthrough)
+	DIE( (this->allocCurrent + size) > this->allocMax,
+		"cudaMallocManaged would exceed allowed max alloc size" );
+
+	cudaError_t rt = cudaMallocManaged(devPtr, size);
+	this->allocCurrent += size;
+	this->allocMap[ *devPtr ] = size;
+	return rt;
+}
+
+/**
+ * Wrapper _cudaFree
+ * @return cudaFree result if ptr came from a wrapper; cudaErrorInvalidValue otherwise
+ */
+cudaError_t GpuAllocator::_cudaFree( void* devPtr ) {
+
+	auto entry = this->allocMap.find(devPtr);	// one lookup (was find + find + erase-by-key)
+	if (entry == this->allocMap.end())
+		return cudaErrorInvalidValue;
+
+	this->allocCurrent -= entry->second;
+	this->allocMap.erase(entry);
+	return cudaFree(devPtr);
+}
+
+/**
+ * Wrapper _used
+ * @return bytes currently allocated through the wrapper allocators
+ */
+long long int GpuAllocator::_used( void ) {
+	return this->allocCurrent;
+}
+
+void fillRandom(vector<int> &vecKeys, vector<int> &vecValues, int numEntries) {
+	vecKeys.reserve(numEntries);
+	vecValues.reserve(numEntries);
+
+	// split [1, INT_MAX) into numEntries disjoint buckets; one draw per bucket
+	int step = (numeric_limits<int>::max() / numEntries) - 1;
+	default_random_engine engine;
+	uniform_int_distribution<int> pick(1, step);
+	for (int idx = 0; idx < numEntries; idx++) {
+		// keys land in (step*idx, step*idx + step] => unique and non-zero
+		vecKeys.push_back(step * idx + pick(engine));
+		vecValues.push_back(step * idx + pick(engine));
+	}
+
+	random_shuffle(vecKeys.begin(), vecKeys.end());
+	random_shuffle(vecValues.begin(), vecValues.end());
+}
+
+/**
+ * Main description.
+ * @param argv1 is total number of hash inserts/gets
+ * @param argv2 is number of sets, (total/nrsets) is a chunk
+ * @param argv3 is min performance measured in M/sec for INSERT/GET
+ * @return different than 0 means run has failed
+ */
+int main(int argc, char **argv)
+{
+	clock_t begin;
+	double elapsedTime;
+
+	int numKeys = 0;
+	int numChunks = 0;
+	int minSpeed = 100;
+	vector<int> vecKeys;
+	vector<int> vecValues;
+	int *valuesGot = NULL;
+
+	DIE(argc != 4,
+		"ERR, args num, call ./bin test_numKeys test_numChunks hash_speed");
+
+	numKeys = stoll(argv[1]);
+	DIE((numKeys < 1) || (numKeys >= numeric_limits<int>::max()),
+		"ERR, numKeys should be greater or equal to 1 and less than maxint");
+
+	numChunks = stoll(argv[2]);
+	DIE((numChunks < 1) || (numChunks >= numKeys),
+		"ERR, numChunks should be between 1 and numKeys - 1");
+
+	minSpeed = stoll(argv[3]);
+	DIE((minSpeed < 0) || (minSpeed >= 500),
+		"ERR, minSpeed should be between 0 and 500");
+
+	float loadFactorMin = 0.5f;
+	float loadFactorMax = 1.0f;
+
+	float speedGet = 0.f;
+	float speedInsert = 0.f;
+
+	fillRandom(vecKeys, vecValues, numKeys);
+
+	glbGpuAllocator = new GpuAllocator(numKeys * sizeof(int) * 2 * 4);
+
+	GpuHashTable gHashTable(1);
+
+	int inserted = 0;
+	int chunkSize = numKeys / numChunks;
+	gHashTable.reshape(chunkSize);
+
+	// perform INSERT and test performance
+	for(int chunkStart = 0; chunkStart < numKeys; chunkStart += chunkSize) {
+
+		int *keysStart = &vecKeys[chunkStart];
+		int *valuesStart = &vecValues[chunkStart];
+
+		auto start = high_resolution_clock::now();
+
+		// INSERT stage
+		gHashTable.insertBatch(keysStart, valuesStart, chunkSize);
+		inserted += chunkSize;
+
+		auto stop = high_resolution_clock::now();
+		elapsedTime = duration_cast<microseconds>(stop - start).count();
+
+		float speed = chunkSize / elapsedTime;
+		float hashLoadFactor = (float) inserted * sizeof(int) * 2.f / glbGpuAllocator->_used();
+
+		// check load factor
+		DIE( loadFactorMin > hashLoadFactor, "loadFactorMin > hashLoadFactor");
+		DIE( loadFactorMax < hashLoadFactor, "loadFactorMax < hashLoadFactor");
+
+		cout << setw(20) << left << "HASH_BATCH_INSERT"
+		<< setw(24) << left << "count: " + to_string(chunkSize)
+		<< setw(24) << left << "speed: " + to_string( (int)speed ) + "M/sec"
+		<< setw(24) << left
+		<< "loadfactor: " + to_string( (int)(hashLoadFactor * 100.f) ) + "%" << endl;
+
+		speedInsert += speed;
+	}
+
+
+	// perform INSERT for update validation
+	int chunkSizeUpdate = min(64, numKeys);
+	for(int chunkStart = 0; chunkStart < chunkSizeUpdate; chunkStart++) {
+		vecValues[chunkStart] += 1111111 + chunkStart;
+	}
+
+	gHashTable.insertBatch(&vecKeys[0], &vecValues[0], chunkSizeUpdate);
+
+	// perform GET and test performance
+	for(int chunkStart = 0; chunkStart < numKeys; chunkStart += chunkSize) {
+
+		int *keysStart = &vecKeys[chunkStart];
+
+		auto start = high_resolution_clock::now();
+
+		// GET stage
+		valuesGot = gHashTable.getBatch(keysStart, chunkSize);
+
+		auto stop = high_resolution_clock::now();
+		elapsedTime = duration_cast<microseconds>(stop - start).count();
+
+		float speed = chunkSize / elapsedTime;
+		float hashLoadFactor = (float) inserted * sizeof(int) * 2.f / glbGpuAllocator->_used();
+
+		cout << setw(20) << left << "HASH_BATCH_GET"
+		<< setw(24) << left << "count: " + to_string(chunkSize)
+		<< setw(24) << left << "speed: " + to_string( (int)speed ) + "M/sec"
+		<< setw(24) << left
+		<< "loadfactor: " + to_string( (int)(hashLoadFactor * 100.f) ) + "%" << endl;
+
+		// check load factor
+		DIE( loadFactorMin > hashLoadFactor, "loadFactorMin > hashLoadFactor" );
+		DIE( loadFactorMax < hashLoadFactor, "loadFactorMax < hashLoadFactor" );
+		DIE( valuesGot == NULL, "ERR, ptr valuesCheck cannot be NULL" );
+
+		speedGet += speed;
+
+		int mismatches = 0;
+		for(int i = 0; i < chunkSize; i++) {
+			if(vecValues[chunkStart + i] != valuesGot[i]) {
+				mismatches++;
+				if(mismatches < 32) {
+					cout << "Expected " << vecValues[chunkStart + i]
+					<< ", but got " << valuesGot[i] << " for key:" << keysStart[i] << endl;
+				}
+			}
+		}
+
+		if(mismatches > 0) {
+			cout << "ERR, mismatches: " << mismatches << " / " << numKeys << endl;
+			exit(1);
+		}
+	}
+
+	float avgSpeedInsert = speedInsert / numChunks;
+	float avgSpeedGet = speedGet / numChunks;
+
+	cout << "----------------------------------------------" << endl;
+	cout << setw(24) << left << "AVG_INSERT: " + to_string( (int)avgSpeedInsert ) + " M/sec,"
+		<< setw(24) << left<< "AVG_GET: " + to_string( (int)avgSpeedGet ) + " M/sec,"
+		<< setw(24) << left<< "MIN_SPEED_REQ: " +  to_string(minSpeed) + " M/sec" << endl;
+
+	DIE( minSpeed > avgSpeedGet, "minSpeed > avgSpeedGet" );
+	DIE( minSpeed > avgSpeedInsert, "minSpeed > avgSpeedInsert" );
+
+	cout << endl;
+
+	return 0;
+}
diff --git a/assignments/3-parallel-hashtable/test_map.hpp b/assignments/3-parallel-hashtable/test_map.hpp
new file mode 100644
index 0000000..b1110d2
--- /dev/null
+++ b/assignments/3-parallel-hashtable/test_map.hpp
@@ -0,0 +1,36 @@
+#include <unordered_map>
+
+using namespace std;
+
+#define MB_1	(1024 * 1024)
+
+#define	KEY_INVALID		0
+#define DIE(assertion, call_description) \
+	do {	\
+		if (assertion) {	\
+		fprintf(stderr, "(%s, %d): ",	\
+		__FILE__, __LINE__);	\
+		perror(call_description);	\
+		exit(1);	\
+	}	\
+} while (0)
+
+class GpuAllocator
+{
+    long long int allocMax;
+    long long int allocCurrent;
+
+	std::unordered_map<void*, size_t> allocMap;	// size_t: allocation sizes may exceed INT_MAX
+
+public:
+    GpuAllocator(long long int allocMax);
+
+    cudaError_t _cudaMalloc( void** devPtr, size_t size );
+    cudaError_t _cudaMallocManaged( void** devPtr, size_t size );
+    cudaError_t _cudaFree( void* devPtr );
+
+    float _loadFactor( void );	// NOTE(review): declared but never defined in this patch
+    long long int _used( void );
+};
+
+extern GpuAllocator *glbGpuAllocator;
-- 
GitLab