Merge branch 'cupy' into 'master'

Cupy See merge request !1

Merge branch 'cupy' into 'master'
Cupy See merge request !1
ae7ef0d9 · Karsa Zoltán István · e9a4645a · c274dad0 · ae7ef0d9 · ae7ef0d9
Commit ae7ef0d9 authored Jan 25, 2023 by Karsa Zoltán István
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 466 additions and 139 deletions

.gitignore
+2 -0

epgpu.cu
+307 -0

genax.py
+95 -56

gpu.py
+13 -78

measure.sh
+15 -0

measure2.sh
+15 -0

stress.sh
+13 -0

tetrarun.py
+3 -3

utils.py
+3 -2

No files found.
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,4 @@ coverage.xml
 # Sphinx documentation
 docs/_build/
+.vscode/
\ No newline at end of file
--- a/epgpu.cu
+++ b/epgpu.cu
--- a/genax.py
+++ b/genax.py
-import numpy, matplotlib.pyplot as plt
+import cupy as cp, matplotlib.pyplot as plt
 from numba import cuda
 from utils import expSpace
@@ -6,46 +6,87 @@ def gen_angels_to_pick(n, plot = False):
    if n % 2 == 0:
        raise "n%2==0"
    N = int((n + 3) / 2)
-    X1 = expSpace(0.0, numpy.pi/2.0, N)
+    X1 = expSpace(0.0, cp.pi/2.0, N)
-    X3 = expSpace(numpy.pi/2.0, numpy.pi, N)
+    X3 = expSpace(cp.pi/2.0, cp.pi, N)
-    X3 = -1*X3 + 3.0*numpy.pi/2.0
+    X3 = -1*X3 + 3.0*cp.pi/2.0
-    anglestopick = numpy.concatenate((X1, X3), axis=None)
+    anglestopick = cp.concatenate((X1, X3), axis=None)
-    anglestopick = numpy.unique(anglestopick) # Vigyázni vele!
+    anglestopick = cp.unique(anglestopick) # Vigyázni vele!
    anglestopick = anglestopick[1:-1]
    if plot:
-        Y = numpy.zeros(n)
+        Y = cp.zeros(n)
        plt.plot(anglestopick, Y, '|')
        plt.show()
    return anglestopick
+parosit = cp.RawKernel(r'''
+extern "C" 
+__global__ 
+void parosit(const double* x1, const double* x2, double* a, double* b, const int m, const double PI) {
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (m <= tid || m*m <= tid*m+m-1)
+        return;
+    float alpha = x1[tid];
+    if (m*m <= tid*m+m-1)
+        return;
+    for (int i = 0; i < m; i++) {
+        float betha = x2[i];
+        if ((alpha + betha) < PI && betha >= alpha && alpha > 0.0) {
+            a[tid*m+i] = alpha;
+            b[tid*m+i] = betha;
+        } else {
+            a[tid*m+i] = -1.0;
+            b[tid*m+i] = -1.0;
+        }
+    }
+}
+''', 'parosit')
+parosit2 = cp.RawKernel(r'''
+extern "C" 
+__global__ 
+void parosit2(const double* x1, const double* x2, double* a, double* b, const int m, const double PI) {
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (m <= tid || m*m <= tid*m+m-1)
+        return;
+    float alpha = x1[tid];
+    for (int i = 0; i < m; i++) {
+        float betha = x2[i];
+        if ((alpha + betha) < PI && alpha > 0.0) {
+            a[tid*m+i] = alpha;
+            b[tid*m+i] = betha;
+        } else {
+            a[tid*m+i] = -1.0;
+            b[tid*m+i] = -1.0;
+        }
+    }
+}
+''', 'parosit2')
 def angles_alap(anglestopick, plot = False):
-    alpha_arr = numpy.array([])
+    m = anglestopick.size
-    beta_arr = numpy.array([])
+    alpha_arr = cp.zeros((m, m), dtype=cp.float64)
+    beta_arr = cp.zeros((m, m), dtype=cp.float64)
-    for alpha in anglestopick:
+    blocksize = int((m + 64 - 1) / 64)
-        for beta in anglestopick:
+    parosit((blocksize,), (64,), (anglestopick, anglestopick, alpha_arr, beta_arr, m, cp.pi))
-            if alpha + beta < numpy.pi and beta >= alpha and alpha != 0.0: # vigyázni
-                alpha_arr = numpy.insert(alpha_arr, 0, alpha)
+    tompa_beta_arr = beta_arr[beta_arr > cp.pi]
-                beta_arr = numpy.insert(beta_arr, 0, beta)
+    tompa_beta_mpi_arr = tompa_beta_arr - cp.pi/2
+    hegyes_beta_arr = beta_arr[(beta_arr <= cp.pi) & (beta_arr > 0.0)]
-    tompa_beta_arr = beta_arr[beta_arr > numpy.pi]
+    tompa_alpha_arr = alpha_arr[beta_arr > cp.pi]
-    tompa_beta_mpi_arr = tompa_beta_arr - numpy.pi/2
+    hegyes_alpha_arr = alpha_arr[(beta_arr <= cp.pi) & (beta_arr > 0.0)]
-    hegyes_beta_arr = beta_arr[beta_arr <= numpy.pi]
-    tompa_alpha_arr = alpha_arr[beta_arr > numpy.pi]
+    tghB = cp.tan(hegyes_beta_arr)
-    hegyes_alpha_arr = alpha_arr[beta_arr <= numpy.pi]
+    tghA = cp.tan(hegyes_alpha_arr)
+    tgtB_mpi = cp.tan(tompa_beta_mpi_arr)
-    tghB = numpy.tan(hegyes_beta_arr)
+    tgtA = cp.tan(tompa_alpha_arr)
-    tghA = numpy.tan(hegyes_alpha_arr)
-    tgtB_mpi = numpy.tan(tompa_beta_mpi_arr)
-    tgtA = numpy.tan(tompa_alpha_arr)
    # hegyes
-    sztgh = tghB*tghA
+    sztgh = tghB * tghA
    otgh = tghA + tghB
-    hCy = sztgh/otgh
+    hCy = sztgh / otgh
    hCx = hCy / tghA
    # tompa
@@ -54,32 +95,29 @@ def angles_alap(anglestopick, plot = False):
    tCy = tgtA / mtgt
    tCx = tCy / tgtA
-    Cy = numpy.concatenate((hCy, tCy), axis=None)
+    Cy = cp.concatenate((hCy, tCy), axis=None)
-    Cx = numpy.concatenate((hCx, tCx), axis=None)
+    Cx = cp.concatenate((hCx, tCx), axis=None)
    return Cx, Cy
 def angles_ratet(anglestopick, plot = False):
-    alpha_arr = numpy.array([])
+    m = anglestopick.size
-    beta_arr = numpy.array([])
+    alpha_arr = cp.zeros((m, m), dtype=cp.float64)
+    beta_arr = cp.zeros((m, m), dtype=cp.float64)
-    for alpha in anglestopick:
+    blocksize = int((m + 64 - 1) / 64)
-        for beta in anglestopick:
+    parosit2((blocksize,), (m,), (anglestopick, anglestopick, alpha_arr, beta_arr, m, cp.pi))
-            if alpha + beta < numpy.pi and alpha != 0.0: # vigyázni
-                alpha_arr = numpy.insert(alpha_arr, 0, alpha)
+    tompa_beta_arr = beta_arr[beta_arr > cp.pi]
-                beta_arr = numpy.insert(beta_arr, 0, beta)
+    tompa_beta_mpi_arr = tompa_beta_arr - cp.pi/2
+    hegyes_beta_arr = beta_arr[(beta_arr <= cp.pi) & (beta_arr > 0.0)]
-    tompa_beta_arr = beta_arr[beta_arr > numpy.pi]
+    tompa_alpha_arr = alpha_arr[beta_arr > cp.pi]
-    tompa_beta_mpi_arr = tompa_beta_arr - numpy.pi/2
+    hegyes_alpha_arr = alpha_arr[(beta_arr <= cp.pi) & (beta_arr > 0.0)]
-    hegyes_beta_arr = beta_arr[beta_arr <= numpy.pi]
-    tompa_alpha_arr = alpha_arr[beta_arr > numpy.pi]
+    tghB = cp.tan(hegyes_beta_arr)
-    hegyes_alpha_arr = alpha_arr[beta_arr <= numpy.pi]
+    tghA = cp.tan(hegyes_alpha_arr)
+    tgtB_mpi = cp.tan(tompa_beta_mpi_arr)
-    tghB = numpy.tan(hegyes_beta_arr)
+    tgtA = cp.tan(tompa_alpha_arr)
-    tghA = numpy.tan(hegyes_alpha_arr)
-    tgtB_mpi = numpy.tan(tompa_beta_mpi_arr)
-    tgtA = numpy.tan(tompa_alpha_arr)
    # hegyes
    sztgh = tghB*tghA
@@ -93,14 +131,14 @@ def angles_ratet(anglestopick, plot = False):
    tCy = tgtA / mtgt
    tCx = tCy / tgtA
-    Ey = numpy.concatenate((hCy, tCy), axis=None)
+    Ey = cp.concatenate((hCy, tCy), axis=None)
-    Ex = numpy.concatenate((hCx, tCx), axis=None)
+    Ex = cp.concatenate((hCx, tCx), axis=None)
-    cos = numpy.cos(anglestopick)
+    cos = cp.cos(anglestopick)
-    sin = numpy.sin(anglestopick)
+    sin = cp.sin(anglestopick)
-    Dx = numpy.outer(numpy.full(anglestopick.size, 1.0), Ex).flatten()
+    Dx = cp.outer(cp.full(anglestopick.size, 1.0, dtype=cp.float64), Ex).flatten()
-    Dy = numpy.outer(cos, Ey).flatten()
+    Dy = cp.outer(cos, Ey).flatten()
-    Dz = numpy.outer(sin, Ey).flatten()
+    Dz = cp.outer(sin, Ey).flatten()
    return Dx, Dy, Dz
\ No newline at end of file
--- a/gpu.py
+++ b/gpu.py
 from numba import cuda
-import numpy
+import cupy as cp
-def start_kernel(Cx, Cy, Dx, Dy, Dz, v, w):
+with open('epgpu.cu') as f:
-    Cx_global = cuda.to_device(Cx)
+    code = f.read()
-    Cy_global = cuda.to_device(Cy)
-    Dx_global = cuda.to_device(Dx)
-    Dy_global = cuda.to_device(Dy)
-    Dz_global = cuda.to_device(Dz)
-    egyensulyi_mtx = cuda.device_array((Dz.size, 4, 4))
-    gpu_egyensuly[256, 256](v, w, Cx_global, Cy_global, Dx_global, Dy_global, Dz_global, egyensulyi_mtx)
-    return egyensulyi_mtx.copy_to_host()
-@cuda.jit
-def gpu_egyensuly(v, w, Cx, Cy, Dx, Dy, Dz, egyensulyi_mtx):
-    ABC_oldal(v, w, Cx, Cy, Dx, Dy, Dz, egyensulyi_mtx)
-@cuda.jit(device=True)
-def ABC_oldal(v, w, Cx_arr, Cy_arr, Dx_arr, Dy_arr, Dz_arr, egyensulyi_mtx):
-    pos = cuda.grid(1)
-    if pos >= Dx_arr.size:
-        return
-    Cx = Cx_arr[pos]
-    Cy = Cy_arr[pos]
-    Dx = Dx_arr[pos]
-    Dy = Dy_arr[pos]
-    Dz = Dz_arr[pos]
-    ABx = 1.0/v
-    ABy = 0.0
-    ABz = 0.0
-    ACx = (Cx - 0.0)/v
+kers = ('gpu_egyensulyi', )
-    ACy = (Cy - 0.0)/v
+ep_pontok_module = cp.RawModule(code=code, options=('--std=c++11',), name_expressions=kers)
-    ACz = 0.0            # (0.0 - 0.0)/v
+fun = ep_pontok_module.get_function(kers[0])
-    for i in range(v+1):
+def start_kernel(Cx, Cy, Dx, Dy, Dz, v, w):
-        for j in range(v+1):
+    print("Res size (byte): ", Cx.size*Dx.size*4*4)
-            oKx = i * ABx + j * ACx
+    print(Cx.size, ",", Cy.size, ",", Dx.size, ",", Dy.size, ",", Dz.size)
-            oKy = i * ABy + j * ACy
+    egyensulyi_mtx = cp.zeros((Cx.size*Dx.size, 4, 4), dtype=cp.int8)
-            oKz = i * ABz + j * ACz
+    numBlock = int((Cx.size*Dx.size + 256 - 1) / 256)
-            Lx = (Dx - oKx)/w
+    fun((numBlock,), (256,), (v, w, Cx, Cy, Dx, Dy, Dz, Cx.size, Dx.size, egyensulyi_mtx))
-            Ly = (Dy - oKy)/w
-            Lz = (Dz - oKz)/w
-            for k in range(w+1):
-                Sx = oKx + Lx*k
-                Sy = oKy + Ly*k
-                Sz = oKz + Lz*k
-                S = stabil_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz)
-                U = instabil_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz)
-                H = nyereg_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz)
-                if S + U - H == 2:
-                    egyensulyi_mtx[pos, S, U] = 1
-# Sxyz - a skp pontja, (0,0,0), (0,0,1), C(x,y,0), D(x,y,z) a tetraéderhez tartozó csúcspontok
-# return stabil sp száma S súlypontnál
-@cuda.jit(device=True)
-def stabil_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz):
-    return 0
-# Sxyz - a skp pontja, (0,0,0), (0,0,1), C(x,y,0), D(x,y,z) a tetraéderhez tartozó csúcspontok
-# return instabil sp száma S a testre nézve
-@cuda.jit(device=True)
-def instabil_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz):
-    return 0
-# Sxyz - a skp pontja, (0,0,0), (0,0,1), C(x,y,0), D(x,y,z) a tetraéderhez tartozó csúcspontok
+    return egyensulyi_mtx
-# return nyereg sp száma S az IJKL testre nézve
-@cuda.jit(device=True)
-def nyereg_ep(Sx, Sy, Sz, Cx, Cy, Dx, Dy, Dz):
-    return 0
\ No newline at end of file
--- a/measure.sh
+++ b/measure.sh
+#!/bin/bash
+#SBATCH --job-name=gpgpu     # a job neve
+#SBATCH -N 1                 # hány node-ot szeretnénk használni
+#SBATCH -p gpu               # melyik partícióból
+#SBATCH --gres gpu               # melyik partícióból
+#SBATCH --time=20:00:00      # maximális idő
+#SBATCH -o politopok.out       # kimeneti fájl
+module load anaconda
+for n in {40..55}
+do
+    echo "python tetrarun.py -n $((n*2 + 1)) -v 10 -w 10"
+    time python tetrarun.py -n $((n*2 + 1)) -v 10 -w 10
+done
--- a/measure2.sh
+++ b/measure2.sh
+#!/bin/bash
+#SBATCH --job-name=gpgpu     # a job neve
+#SBATCH -N 1                 # hány node-ot szeretnénk használni
+#SBATCH -p gpu               # melyik partícióból
+#SBATCH --gres gpu               # melyik partícióból
+#SBATCH --time=20:00:00      # maximális idő
+#SBATCH -o politopok.out       # kimeneti fájl
+module load anaconda
+for n in {20..30}
+do
+    echo "python tetrarun.py -n $((n*2 + 1)) -v 10 -w 10"
+    time python tetrarun.py -n $((n*2 + 1)) -v 100 -w 100
+done
--- a/stress.sh
+++ b/stress.sh
+#bin/bash
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
+time python tetrarun.py -n 521 -v 100 -w 100
\ No newline at end of file
--- a/tetrarun.py
+++ b/tetrarun.py
@@ -32,16 +32,15 @@ def main(argv):
         outputfile = arg
      elif opt in ("-p", "--plot"):
         PLOT = True
-   print('Output file is: ', outputfile)
-   print('Conf: ', [n, v, w])
   space = gen_angels_to_pick(n, PLOT)
   Cx, Cy = angles_alap(space, PLOT)
   Dx, Dy, Dz = angles_ratet(space)
   res = start_kernel(Cx, Cy, Dx, Dy, Dz, v, w)
-   print(res)
+   #print(res)
 if __name__ == "__main__":
   main(sys.argv[1:])
\ No newline at end of file
--- a/utils.py
+++ b/utils.py
-import numpy
+import cupy as cp
 def expSpace(min, max, N, exponentialliness = 20.0):
-    LinVec = numpy.linspace(0, numpy.log10(exponentialliness+1),N)
+    LinVec = cp.linspace(0, cp.log10(exponentialliness+1, dtype=cp.float64),N, dtype=cp.float64)
    return (max-min)/exponentialliness * (10.0**LinVec - 1) + min
\ No newline at end of file