Merge branch 'testing' into 'master'

Testing See merge request !2

Merge branch 'testing' into 'master'
Testing See merge request !2
3524218d · Karsa Zoltán István · ae7ef0d9 · 1c755e5b · 3524218d · 3524218d
Commit 3524218d authored Jan 31, 2023 by Karsa Zoltán István
Hide whitespace changes
Inline Side-by-side

Showing with 114 additions and 26 deletions

cores.py
+32 -0

epgpu.cu
+14 -14

genax.py
+13 -5

gpu.py
+5 -2

tetrarun.py
+5 -3

utils.py
+45 -2

No files found.
--- a/cores.py
+++ b/cores.py
+"""Report the CUDA core count of the current GPU.
+
+Running or importing this module queries the current CUDA device through
+numba and prints its compute capability, its number of streaming
+multiprocessors (SMs) and the resulting total core count.
+"""
+from numba import cuda
+
+
+# Cores per SM, keyed by (major, minor) compute capability.
+cc_cores_per_SM_dict = {
+    (2,0) : 32,
+    (2,1) : 48,
+    (3,0) : 192,
+    (3,5) : 192,
+    (3,7) : 192,
+    (5,0) : 128,
+    (5,2) : 128,
+    (6,0) : 64,
+    (6,1) : 128,
+    (7,0) : 64,
+    (7,5) : 64,
+    (8,0) : 64,
+    (8,6) : 128,
+    (8,9) : 128,
+    (9,0) : 128
+    }
+# Looking up a compute capability that is not listed above yields None.
+# The dictionary needs to be extended as new devices become available, and
+# it currently does not account for all Jetson devices.
+device = cuda.get_current_device()
+my_sms = device.MULTIPROCESSOR_COUNT
+my_cc = device.compute_capability
+cores_per_sm = cc_cores_per_SM_dict.get(my_cc)
+# Multiplying None by the SM count would raise TypeError, so an unknown
+# compute capability leaves the total as None instead of crashing.
+total_cores = None if cores_per_sm is None else cores_per_sm * my_sms
+print("GPU compute capability: " , my_cc)
+print("GPU total number of SMs: " , my_sms)
+if total_cores is None:
+    print("total cores: unknown (compute capability not in table)")
+else:
+    print("total cores: " , total_cores)
\ No newline at end of file
--- a/epgpu.cu
+++ b/epgpu.cu
@@ -190,13 +190,13 @@ __device__ void ABC_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
    vec3 AB(1.0/v, 0.0, 0.0);
    vec3 AC = C/v;

-    for (double i = 1.0; i < v; i++)
+    for (double i = 0.0001; i < v; i++)
    {
-        for (double j = 1.0; j < v; j++)
+        for (double j = 0.0001; j < v; j++)
        {
            vec3 K = i*AB + j * AC;
            vec3 L = (D - K)/w;
-            for (double k = 1.0; k < w; k++)
+            for (double k = 0.0001; k < w; k++)
            {
                vec3 Sv = K + L*k;
                int S = stabil_ep(Sv, C, D);
@@ -217,13 +217,13 @@ __device__ void BCD_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
    vec3 BC = (C - B) / v;
    vec3 BD = (D - B) / v;

-    for (double i = 1.0; i < v; i++)
+    for (double i = 0.0001; i < v; i++)
    {
-        for (double j = 1.0; j < v; j++)
+        for (double j = 0.0001; j < v; j++)
        {
            vec3 K = B + i * BC + j * BD;
            vec3 L = (A - K)/w;
-            for (double k = 1.0; k < w; k++)
+            for (double k = 0.0001; k < w; k++)
            {
                vec3 Sv = K + L*k;
                int S = stabil_ep(Sv, C, D);
@@ -244,13 +244,13 @@ __device__ void CDA_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
    vec3 CA = (A - C) / v;
    vec3 CD = (D - C) / v;

-    for (double i = 1.0; i < v; i++)
+    for (double i = 0.0001; i < v; i++)
    {
-        for (double j = 1.0; j < v; j++)
+        for (double j = 0.0001; j < v; j++)
        {
            vec3 K = C + i * CA + j * CD;
            vec3 L = (B - K)/w;
-            for (double k = 1.0; k < w; k++)
+            for (double k = 0.0001; k < w; k++)
            {
                vec3 Sv = K + L*k;
                int S = stabil_ep(Sv, C, D);
@@ -271,13 +271,13 @@ __device__ void DAB_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
    vec3 DA = (A - D) / v;
    vec3 DB = (B - D) / v;

-    for (double i = 1.0; i < v; i++)
+    for (double i = 0.0001; i < v; i++)
    {
-        for (double j = 1.0; j < v; j++)
+        for (double j = 0.0001; j < v; j++)
        {
            vec3 K = D + i * DA + j * DB;
            vec3 L = (C - K)/w;
-            for (double k = 1.0; k < w; k++)
+            for (double k = 0.0001; k < w; k++)
            {
                vec3 Sv = K + L*k;
                int S = stabil_ep(Sv, C, D);
@@ -292,12 +292,12 @@ __device__ void DAB_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
 }

 __global__ void gpu_egyensulyi(int v, int w, double* Cx_arr, double* Cy_arr, 
-                double* Dx_arr, double* Dy_arr, double* Dz_arr, int size_C, int size_D, char* egysulyi_mtx) {
+                double* Dx_arr, double* Dy_arr, double* Dz_arr, int size_C, int size_D, int lcm, char* egysulyi_mtx) {
    int pos = blockDim.x * blockIdx.x + threadIdx.x;
    if (pos >= size_C*size_D)
        return;
    vec3 C(Cx_arr[pos % size_C], Cy_arr[pos % size_C], 0.0);
-    vec3 D(Dx_arr[pos % size_D], Dy_arr[pos % size_D], Dz_arr[pos % size_D]);
+    vec3 D(Dx_arr[(pos + pos / lcm) % size_D], Dy_arr[(pos + pos / lcm) % size_D], Dz_arr[(pos + pos / lcm) % size_D]);

    ABC_oldal(v, w, C, D, egysulyi_mtx);
    BCD_oldal(v, w, C, D, egysulyi_mtx);

--- a/genax.py
+++ b/genax.py
@@ -28,11 +28,11 @@ void parosit(const double* x1, const double* x2, double* a, double* b, const int
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (m <= tid || m*m <= tid*m+m-1)
        return;
-    float alpha = x1[tid];
+    double alpha = x1[tid];
    if (m*m <= tid*m+m-1)
        return;
    for (int i = 0; i < m; i++) {
-        float betha = x2[i];
+        double betha = x2[i];
        if ((alpha + betha) < PI && betha >= alpha && alpha > 0.0) {
            a[tid*m+i] = alpha;
            b[tid*m+i] = betha;
@@ -51,9 +51,11 @@ void parosit2(const double* x1, const double* x2, double* a, double* b, const in
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (m <= tid || m*m <= tid*m+m-1)
        return;
-    float alpha = x1[tid];
+    double alpha = x1[tid];
+    if (m*m <= tid*m+m-1)
+        return;
    for (int i = 0; i < m; i++) {
-        float betha = x2[i];
+        double betha = x2[i];
        if ((alpha + betha) < PI && alpha > 0.0) {
            a[tid*m+i] = alpha;
            b[tid*m+i] = betha;
@@ -95,8 +97,10 @@ def angles_alap(anglestopick, plot = False):
    tCy = tgtA / mtgt
    tCx = tCy / tgtA

-    Cy = cp.concatenate((hCy, tCy), axis=None)
    Cx = cp.concatenate((hCx, tCx), axis=None)
+    Cx = cp.append(Cx, [0.5], axis=False)
+    Cy = cp.concatenate((hCy, tCy), axis=None)
+    Cy = cp.append(Cy, [cp.sqrt(3.0)/2.0], axis=False)

    return Cx, Cy

@@ -138,7 +142,10 @@ def angles_ratet(anglestopick, plot = False):
    sin = cp.sin(anglestopick)

    Dx = cp.outer(cp.full(anglestopick.size, 1.0, dtype=cp.float64), Ex).flatten()
+    Dx = cp.append(Dx, [0.5], axis=False)
    Dy = cp.outer(cos, Ey).flatten()
+    Dy = cp.append(Dy, [cp.sqrt(3.0)/6.0], axis=False)
    Dz = cp.outer(sin, Ey).flatten()
+    Dz = cp.append(Dz, [cp.sqrt(2.0/3.0)], axis=False)

    return Dx, Dy, Dz
\ No newline at end of file
--- a/gpu.py
+++ b/gpu.py
 from numba import cuda
 import cupy as cp
+from utils import compute_lcm

 with open('epgpu.cu') as f:
    code = f.read()
@@ -9,10 +10,12 @@ ep_pontok_module = cp.RawModule(code=code, options=('--std=c++11',), name_expres
 fun = ep_pontok_module.get_function(kers[0])

 def start_kernel(Cx, Cy, Dx, Dy, Dz, v, w):
+    print(f"Cnt: {Cx.size}x{Dx.size}={Cx.size*Dx.size}")
    print("Res size (byte): ", Cx.size*Dx.size*4*4)
-    print(Cx.size, ",", Cy.size, ",", Dx.size, ",", Dy.size, ",", Dz.size)
+    lcm = compute_lcm(Cx.size, Dx.size)
+    #print(Cx.size, ",", Cy.size, ",", Dx.size, ",", Dy.size, ",", Dz.size)
    egyensulyi_mtx = cp.zeros((Cx.size*Dx.size, 4, 4), dtype=cp.int8)
    numBlock = int((Cx.size*Dx.size + 256 - 1) / 256)
-    fun((numBlock,), (256,), (v, w, Cx, Cy, Dx, Dy, Dz, Cx.size, Dx.size, egyensulyi_mtx))
+    fun((numBlock,), (256,), (v, w, Cx, Cy, Dx, Dy, Dz, Cx.size, Dx.size, lcm, egyensulyi_mtx))

    return egyensulyi_mtx
--- a/tetrarun.py
+++ b/tetrarun.py
@@ -2,6 +2,7 @@ import sys, getopt

 from genax import gen_angels_to_pick, angles_alap, angles_ratet
 from gpu import start_kernel
+from utils import convert, printresults

 def main(argv):
   outputfile = 'out.txt'
@@ -34,13 +35,13 @@ def main(argv):
         PLOT = True

   space = gen_angels_to_pick(n, PLOT)
+
   Cx, Cy = angles_alap(space, PLOT)
-   
+
   Dx, Dy, Dz = angles_ratet(space)

   res = start_kernel(Cx, Cy, Dx, Dy, Dz, v, w)
-
-   #print(res)
+   printresults(res)

 if __name__ == "__main__":
   main(sys.argv[1:])
\ No newline at end of file
--- a/utils.py
+++ b/utils.py
 import cupy as cp
+import numpy as np

 def expSpace(min, max, N, exponentialliness = 20.0):
    LinVec = cp.linspace(0, cp.log10(exponentialliness+1, dtype=cp.float64),N, dtype=cp.float64)
-    return (max-min)/exponentialliness * (10.0**LinVec - 1) + min
\ No newline at end of file
+    return (max-min)/exponentialliness * (10.0**LinVec - 1) + min
+
+
+def convert(egyensulyi_mtx):
+    """Flatten a stack of equilibrium matrices into one table of pairs.
+
+    For every matrix, each cell ``[S][U]`` (S and U in 0..3) that equals 1
+    contributes the 1-based row ``[S + 1, U + 1]``, visited in row-major
+    order. Each matrix is then closed with a ``[0, 0]`` separator row.
+
+    Args:
+        egyensulyi_mtx: Sequence of 4x4 matrices. It may be a NumPy array,
+            nested lists, or an array object exposing ``.get()`` that
+            returns a host copy (as CuPy arrays do).
+
+    Returns:
+        ``np.int8`` array of shape ``(n_rows, 2)``; ``(0, 2)`` when the
+        input holds no matrices.
+    """
+    # Copy device data to the host once instead of reading it cell by cell.
+    if hasattr(egyensulyi_mtx, "get"):
+        egyensulyi_mtx = egyensulyi_mtx.get()
+    host = np.asarray(egyensulyi_mtx)
+
+    separator = np.zeros((1, 2), dtype=np.int8)
+    chunks = []
+    for mtx in host:
+        # argwhere lists matching cells in row-major order, which is the
+        # order a nested S-then-U loop would visit them.
+        hits = np.argwhere(mtx[:4, :4] == 1) + 1
+        chunks.append(hits.astype(np.int8))
+        chunks.append(separator)
+
+    if not chunks:
+        return np.empty((0, 2), dtype=np.int8)
+    return np.concatenate(chunks, axis=0)
+
+
+def printresults(egyensulyi_mtx):
+    """Print the 1-based (S, U) pairs found in every equilibrium matrix.
+
+    For each matrix this writes three things to standard output: a header
+    ``"<count>x2"``, the ``count x 2`` array of pairs ``[S + 1, U + 1]`` for
+    every cell ``[S][U]`` (S and U in 0..3) equal to 1, and a blank line.
+
+    Args:
+        egyensulyi_mtx: Sequence of 4x4 matrices. It may be a NumPy array,
+            nested lists, or an array object exposing ``.get()`` that
+            returns a host copy (as CuPy arrays do).
+    """
+    # Copy device data to the host once instead of reading it cell by cell.
+    if hasattr(egyensulyi_mtx, "get"):
+        egyensulyi_mtx = egyensulyi_mtx.get()
+
+    for mtx in np.asarray(egyensulyi_mtx):
+        # argwhere lists matching cells in row-major order, which is the
+        # order a nested S-then-U loop would visit them.
+        parok = np.argwhere(mtx[:4, :4] == 1) + 1
+        print(f"{len(parok)}x2")
+        print(parok)
+        print()
+
+
+def compute_lcm(x, y):
+    """Return the least common multiple of two integers.
+
+    Uses the identity ``lcm(x, y) = |x * y| / gcd(x, y)`` rather than
+    stepping through every candidate, so the cost does not grow with the
+    size of the result.
+
+    Args:
+        x: First integer.
+        y: Second integer.
+
+    Returns:
+        The smallest non-negative integer divisible by both ``x`` and
+        ``y``; 0 when either argument is 0.
+    """
+    # Function-scope import keeps this function self-contained.
+    from math import gcd
+
+    if x == 0 or y == 0:
+        # gcd(0, 0) is 0, so guard the division explicitly.
+        return 0
+    return abs(x * y) // gcd(x, y)
\ No newline at end of file