c++ skeleton

1e55b867 · Zoltan Karsa · 8c0233d5 · 1e55b867 · 1e55b867 · 1e55b867
Commit 1e55b867 authored Jan 23, 2023 by Zoltan Karsa
Hide whitespace changes
Inline Side-by-side

Showing with 135 additions and 66 deletions

epgpu.cu
+110 -0

gpu.py
+12 -54

stress.sh
+11 -11

tetrarun.py
+2 -1

No files found.
--- a/epgpu.cu
+++ b/epgpu.cu
+//--------------------------
+struct vec3 {
+//--------------------------
+	double x, y, z;
+	__device__ vec3(double x0 = 0, double y0 = 0, double z0 = 0) { x = x0; y = y0; z = z0; }
+	__device__ vec3 operator*(double a) const { return vec3(x * a, y * a, z * a); }
+	__device__ vec3 operator/(double a) const { return vec3(x / a, y / a, z / a); }
+	__device__ vec3 operator+(const vec3& v) const { return vec3(x + v.x, y + v.y, z + v.z); }
+	__device__ vec3 operator-(const vec3& v) const { return vec3(x - v.x, y - v.y, z - v.z); }
+	__device__ vec3 operator*(const vec3& v) const { return vec3(x * v.x, y * v.y, z * v.z); }
+	__device__ vec3 operator-()  const { return vec3(-x, -y, -z); }
+    __device__ double& operator[](int i) { return *(&x + i); }
+};
+__device__ inline double dot(const vec3& v1, const vec3& v2) { return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z); }
+__device__ inline double length(const vec3& v) { return sqrtf(dot(v, v)); }
+__device__ inline vec3 normalize(const vec3& v) { return v * (1 / length(v)); }
+__device__ inline vec3 cross(const vec3& v1, const vec3& v2) {
+	return vec3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
+}
+__device__ inline vec3 operator*(double a, const vec3& v) { return vec3(v.x * a, v.y * a, v.z * a); }
+//---------------------------
+struct mat3 { // row-major matrix 4x4
+//---------------------------
+	vec3 rows[4];
+public:
+	__device__ mat3() {}
+	__device__ mat3(double m00, double m01, double m02,
+		double m10, double m11, double m12,
+		double m20, double m21, double m22,
+		double m30, double m31, double m32) {
+		rows[0][0] = m00; rows[0][1] = m01; rows[0][2] = m02; 
+		rows[1][0] = m10; rows[1][1] = m11; rows[1][2] = m12; 
+		rows[2][0] = m20; rows[2][1] = m21; rows[2][2] = m22; 
+		rows[3][0] = m30; rows[3][1] = m31; rows[3][2] = m32;
+	}
+	__device__ mat3(vec3 it, vec3 jt, vec3 kt) {
+		rows[0] = it; rows[1] = jt; rows[2] = kt;
+	}
+	__device__ vec3& operator[](int i) { return rows[i]; }
+	__device__ vec3 operator[](int i) const { return rows[i]; }
+	__device__ operator double*() const { return (double*)this; }
+};
+__device__ inline vec3 operator*(const vec3& v, const mat3& mat) {
+	return v.x * mat[0] + v.y * mat[1] + v.z * mat[2];
+}
+__device__ inline mat3 operator*(const mat3& left, const mat3& right) {
+	mat3 result;
+	for (int i = 0; i < 4; i++) result.rows[i] = left.rows[i] * right;
+	return result;
+}
+__device__ inline int stabil_ep(const vec3& S, const vec3& C, const vec3& D) {
+    // ABC oldalon
+    vec3 A(0, 0, 0), B(0, 0, 1.0);
+    vec3 planeN = normalize(cross(B, C));
+    double distance = dot(planeN, C);
+    double rate = dot(planeN, planeN);
+    double t = -(distance + dot(S, planeN)) / rate;
+    vec3 projABC = S + t*planeN;
+}
+__device__ void ABC_oldal(int v, int w, double* Cx_arr, double* Cy_arr, 
+                double* Dx_arr, double* Dy_arr, double* Dz_arr, int size, int* egysulyi_mtx) {
+    int pos = blockDim.x * blockIdx.x + threadIdx.x;
+    vec3 C(Cx_arr[pos], Cy_arr[pos], 0.0);
+    vec3 D(Dx_arr[pos], Dy_arr[pos], Dz_arr[pos]);
+    vec3 AB(1.0/v, 0.0, 0.0);
+    vec3 AC = C/v;
+    for (double i = 0; i < v + 1; i++)
+    {
+        for (double j = 0; j < v + 1; j++)
+        {
+            vec3 K = i*AB + j * AC;
+            vec3 L = (D - K)/w;
+            for (double k = 0; k < w + 1; k++)
+            {
+                vec3 Sv = K + L*k;
+                int S = 0;
+                int U = 0;
+                int H = 0;
+                egysulyi_mtx[pos*16+S*4+U] = 1;
+            }
+        }
+    }
+}
+__global__ void gpu_egyensulyi(int v, int w, double* Cx_arr, double* Cy_arr, 
+                double* Dx_arr, double* Dy_arr, double* Dz_arr, int size, int* egysulyi_mtx) {
+    int pos = blockDim.x * blockIdx.x + threadIdx.x;
+    if (pos >= size)
+        return;
+    ABC_oldal(v, w, Cx_arr, Cy_arr, Dx_arr, Dy_arr, Dz_arr, size, egysulyi_mtx);
+}
\ No newline at end of file
--- a/gpu.py
+++ b/gpu.py
 from numba import cuda
-import numpy
+import cupy as cp
-def start_kernel(Cx, Cy, Dx, Dy, Dz, v, w):
+with open('epgpu.cu') as f:
-    Cx_global = cuda.to_device(Cx)
+    code = f.read()
-    Cy_global = cuda.to_device(Cy)
-    Dx_global = cuda.to_device(Dx)
-    Dy_global = cuda.to_device(Dy)
-    Dz_global = cuda.to_device(Dz)
-    egyensulyi_mtx = cuda.device_array((Dz.size, 4, 4))
-    gpu_egyensuly[256, 256](v, w, Cx_global, Cy_global, Dx_global, Dy_global, Dz_global, egyensulyi_mtx)
-    return egyensulyi_mtx.copy_to_host()
-@cuda.jit
-def gpu_egyensuly(v, w, Cx, Cy, Dx, Dy, Dz, egyensulyi_mtx):
-    ABC_oldal(v, w, Cx, Cy, Dx, Dy, Dz, egyensulyi_mtx)
+kers = ('gpu_egyensulyi', )
+ep_pontok_module = cp.RawModule(code=code, options=('--std=c++11',), name_expressions=kers)
+fun = ep_pontok_module.get_function(kers[0])
-@cuda.jit(device=True)
+def start_kernel(Cx, Cy, Dx, Dy, Dz, v, w):
-def ABC_oldal(v, w, Cx_arr, Cy_arr, Dx_arr, Dy_arr, Dz_arr, egyensulyi_mtx):
-    pos = cuda.grid(1)
-    if pos >= Dx_arr.size:
-        return
-    Cx = Cx_arr[pos]
-    Cy = Cy_arr[pos]
-    Dx = Dx_arr[pos]
-    Dy = Dy_arr[pos]
-    Dz = Dz_arr[pos]
-    ABx = 1.0/v
-    ABy = 0.0
-    ABz = 0.0
-    ACx = (Cx - 0.0)/v
-    ACy = (Cy - 0.0)/v
-    ACz = 0.0            # (0.0 - 0.0)/v
-    for i in range(v+1):
-        for j in range(v+1):
-            oKx = i * ABx + j * ACx
-            oKy = i * ABy + j * ACy
-            oKz = i * ABz + j * ACz
-            Lx = (Dx - oKx)/w
-            Ly = (Dy - oKy)/w
-            Lz = (Dz - oKz)/w
-            for k in range(w+1):
+    egyensulyi_mtx = cp.zeros((Cx.size, 4, 4), dtype=cp.int32)
-                Sx = oKx + Lx*k
-                Sy = oKy + Ly*k
-                Sz = oKz + Lz*k
-                S = stabil_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz)
+    fun((1024,), (256,), (v, w, Cx, Cy, Dx, Dy, Dz, Cx.size, egyensulyi_mtx))
-                U = instabil_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz)
-                H = nyereg_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz)
-                if S + U - H == 2:
+    return egyensulyi_mtx
-                    egyensulyi_mtx[pos, S, U] = 1
@@ -67,6 +23,8 @@ def ABC_oldal(v, w, Cx_arr, Cy_arr, Dx_arr, Dy_arr, Dz_arr, egyensulyi_mtx):
 # return stabil sp száma S súlypontnál
 @cuda.jit(device=True)
 def stabil_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz):
+    # B és C helyvektorok
+    # ABC oldal normálvektora:
    return 0
 # Sxyz - a skp pontja, (0,0,0), (0,0,1), C(x,y,0), D(x,y,z) a tetraéderhez tartozó csúcspontok

--- a/stress.sh
+++ b/stress.sh
 #bin/bash
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
-time python tetrarun.py -n 321 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
\ No newline at end of file
--- a/tetrarun.py
+++ b/tetrarun.py
@@ -39,7 +39,7 @@ def main(argv):
   res = start_kernel(Cx, Cy, Dx, Dy, Dz, v, w)
-   #print(res)
+   # print(res)
 if __name__ == "__main__":
   main(sys.argv[1:])
\ No newline at end of file