Commit 3524218d by Karsa Zoltán István

Merge branch 'testing' into 'master'

Testing

See merge request !2
parents ae7ef0d9 1c755e5b
from numba import cuda
# Number of CUDA cores per streaming multiprocessor (SM), keyed by the
# device compute capability as a (major, minor) tuple.
cc_cores_per_SM_dict = {
    (2, 0): 32,
    (2, 1): 48,
    (3, 0): 192,
    (3, 5): 192,
    (3, 7): 192,
    (5, 0): 128,
    (5, 2): 128,
    (6, 0): 64,
    (6, 1): 128,
    (7, 0): 64,
    (7, 5): 64,
    (8, 0): 64,
    (8, 6): 128,
    (8, 9): 128,
    (9, 0): 128,
}
# The lookup below yields None when the compute capability is not listed.
# The table needs to be extended as new devices become available, and it
# currently does not account for all Jetson devices.
device = cuda.get_current_device()
my_sms = device.MULTIPROCESSOR_COUNT
my_cc = device.compute_capability
cores_per_sm = cc_cores_per_SM_dict.get(my_cc)
# Multiplying None by the SM count would raise TypeError, so an unknown
# compute capability is propagated as None instead of crashing the script.
total_cores = None if cores_per_sm is None else cores_per_sm * my_sms
print("GPU compute capability: " , my_cc)
print("GPU total number of SMs: " , my_sms)
if total_cores is None:
    print("total cores: unknown (compute capability not in cc_cores_per_SM_dict)")
else:
    print("total cores: " , total_cores)
\ No newline at end of file
...@@ -190,13 +190,13 @@ __device__ void ABC_oldal(int v, int w, const vec3& C, const vec3& D, char* egys ...@@ -190,13 +190,13 @@ __device__ void ABC_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
vec3 AB(1.0/v, 0.0, 0.0); vec3 AB(1.0/v, 0.0, 0.0);
vec3 AC = C/v; vec3 AC = C/v;
for (double i = 1.0; i < v; i++) for (double i = 0.0001; i < v; i++)
{ {
for (double j = 1.0; j < v; j++) for (double j = 0.0001; j < v; j++)
{ {
vec3 K = i*AB + j * AC; vec3 K = i*AB + j * AC;
vec3 L = (D - K)/w; vec3 L = (D - K)/w;
for (double k = 1.0; k < w; k++) for (double k = 0.0001; k < w; k++)
{ {
vec3 Sv = K + L*k; vec3 Sv = K + L*k;
int S = stabil_ep(Sv, C, D); int S = stabil_ep(Sv, C, D);
...@@ -217,13 +217,13 @@ __device__ void BCD_oldal(int v, int w, const vec3& C, const vec3& D, char* egys ...@@ -217,13 +217,13 @@ __device__ void BCD_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
vec3 BC = (C - B) / v; vec3 BC = (C - B) / v;
vec3 BD = (D - B) / v; vec3 BD = (D - B) / v;
for (double i = 1.0; i < v; i++) for (double i = 0.0001; i < v; i++)
{ {
for (double j = 1.0; j < v; j++) for (double j = 0.0001; j < v; j++)
{ {
vec3 K = B + i * BC + j * BD; vec3 K = B + i * BC + j * BD;
vec3 L = (A - K)/w; vec3 L = (A - K)/w;
for (double k = 1.0; k < w; k++) for (double k = 0.0001; k < w; k++)
{ {
vec3 Sv = K + L*k; vec3 Sv = K + L*k;
int S = stabil_ep(Sv, C, D); int S = stabil_ep(Sv, C, D);
...@@ -244,13 +244,13 @@ __device__ void CDA_oldal(int v, int w, const vec3& C, const vec3& D, char* egys ...@@ -244,13 +244,13 @@ __device__ void CDA_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
vec3 CA = (A - C) / v; vec3 CA = (A - C) / v;
vec3 CD = (D - C) / v; vec3 CD = (D - C) / v;
for (double i = 1.0; i < v; i++) for (double i = 0.0001; i < v; i++)
{ {
for (double j = 1.0; j < v; j++) for (double j = 0.0001; j < v; j++)
{ {
vec3 K = C + i * CA + j * CD; vec3 K = C + i * CA + j * CD;
vec3 L = (B - K)/w; vec3 L = (B - K)/w;
for (double k = 1.0; k < w; k++) for (double k = 0.0001; k < w; k++)
{ {
vec3 Sv = K + L*k; vec3 Sv = K + L*k;
int S = stabil_ep(Sv, C, D); int S = stabil_ep(Sv, C, D);
...@@ -271,13 +271,13 @@ __device__ void DAB_oldal(int v, int w, const vec3& C, const vec3& D, char* egys ...@@ -271,13 +271,13 @@ __device__ void DAB_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
vec3 DA = (A - D) / v; vec3 DA = (A - D) / v;
vec3 DB = (B - D) / v; vec3 DB = (B - D) / v;
for (double i = 1.0; i < v; i++) for (double i = 0.0001; i < v; i++)
{ {
for (double j = 1.0; j < v; j++) for (double j = 0.0001; j < v; j++)
{ {
vec3 K = D + i * DA + j * DB; vec3 K = D + i * DA + j * DB;
vec3 L = (C - K)/w; vec3 L = (C - K)/w;
for (double k = 1.0; k < w; k++) for (double k = 0.0001; k < w; k++)
{ {
vec3 Sv = K + L*k; vec3 Sv = K + L*k;
int S = stabil_ep(Sv, C, D); int S = stabil_ep(Sv, C, D);
...@@ -292,12 +292,12 @@ __device__ void DAB_oldal(int v, int w, const vec3& C, const vec3& D, char* egys ...@@ -292,12 +292,12 @@ __device__ void DAB_oldal(int v, int w, const vec3& C, const vec3& D, char* egys
} }
__global__ void gpu_egyensulyi(int v, int w, double* Cx_arr, double* Cy_arr, __global__ void gpu_egyensulyi(int v, int w, double* Cx_arr, double* Cy_arr,
double* Dx_arr, double* Dy_arr, double* Dz_arr, int size_C, int size_D, char* egysulyi_mtx) { double* Dx_arr, double* Dy_arr, double* Dz_arr, int size_C, int size_D, int lcm, char* egysulyi_mtx) {
int pos = blockDim.x * blockIdx.x + threadIdx.x; int pos = blockDim.x * blockIdx.x + threadIdx.x;
if (pos >= size_C*size_D) if (pos >= size_C*size_D)
return; return;
vec3 C(Cx_arr[pos % size_C], Cy_arr[pos % size_C], 0.0); vec3 C(Cx_arr[pos % size_C], Cy_arr[pos % size_C], 0.0);
vec3 D(Dx_arr[pos % size_D], Dy_arr[pos % size_D], Dz_arr[pos % size_D]); vec3 D(Dx_arr[(pos + pos / lcm) % size_D], Dy_arr[(pos + pos / lcm) % size_D], Dz_arr[(pos + pos / lcm) % size_D]);
ABC_oldal(v, w, C, D, egysulyi_mtx); ABC_oldal(v, w, C, D, egysulyi_mtx);
BCD_oldal(v, w, C, D, egysulyi_mtx); BCD_oldal(v, w, C, D, egysulyi_mtx);
......
...@@ -28,11 +28,11 @@ void parosit(const double* x1, const double* x2, double* a, double* b, const int ...@@ -28,11 +28,11 @@ void parosit(const double* x1, const double* x2, double* a, double* b, const int
int tid = blockDim.x * blockIdx.x + threadIdx.x; int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (m <= tid || m*m <= tid*m+m-1) if (m <= tid || m*m <= tid*m+m-1)
return; return;
float alpha = x1[tid]; double alpha = x1[tid];
if (m*m <= tid*m+m-1) if (m*m <= tid*m+m-1)
return; return;
for (int i = 0; i < m; i++) { for (int i = 0; i < m; i++) {
float betha = x2[i]; double betha = x2[i];
if ((alpha + betha) < PI && betha >= alpha && alpha > 0.0) { if ((alpha + betha) < PI && betha >= alpha && alpha > 0.0) {
a[tid*m+i] = alpha; a[tid*m+i] = alpha;
b[tid*m+i] = betha; b[tid*m+i] = betha;
...@@ -51,9 +51,11 @@ void parosit2(const double* x1, const double* x2, double* a, double* b, const in ...@@ -51,9 +51,11 @@ void parosit2(const double* x1, const double* x2, double* a, double* b, const in
int tid = blockDim.x * blockIdx.x + threadIdx.x; int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (m <= tid || m*m <= tid*m+m-1) if (m <= tid || m*m <= tid*m+m-1)
return; return;
float alpha = x1[tid]; double alpha = x1[tid];
if (m*m <= tid*m+m-1)
return;
for (int i = 0; i < m; i++) { for (int i = 0; i < m; i++) {
float betha = x2[i]; double betha = x2[i];
if ((alpha + betha) < PI && alpha > 0.0) { if ((alpha + betha) < PI && alpha > 0.0) {
a[tid*m+i] = alpha; a[tid*m+i] = alpha;
b[tid*m+i] = betha; b[tid*m+i] = betha;
...@@ -95,8 +97,10 @@ def angles_alap(anglestopick, plot = False): ...@@ -95,8 +97,10 @@ def angles_alap(anglestopick, plot = False):
tCy = tgtA / mtgt tCy = tgtA / mtgt
tCx = tCy / tgtA tCx = tCy / tgtA
Cy = cp.concatenate((hCy, tCy), axis=None)
Cx = cp.concatenate((hCx, tCx), axis=None) Cx = cp.concatenate((hCx, tCx), axis=None)
Cx = cp.append(Cx, [0.5], axis=False)
Cy = cp.concatenate((hCy, tCy), axis=None)
Cy = cp.append(Cy, [cp.sqrt(3.0)/2.0], axis=False)
return Cx, Cy return Cx, Cy
...@@ -138,7 +142,10 @@ def angles_ratet(anglestopick, plot = False): ...@@ -138,7 +142,10 @@ def angles_ratet(anglestopick, plot = False):
sin = cp.sin(anglestopick) sin = cp.sin(anglestopick)
Dx = cp.outer(cp.full(anglestopick.size, 1.0, dtype=cp.float64), Ex).flatten() Dx = cp.outer(cp.full(anglestopick.size, 1.0, dtype=cp.float64), Ex).flatten()
Dx = cp.append(Dx, [0.5], axis=False)
Dy = cp.outer(cos, Ey).flatten() Dy = cp.outer(cos, Ey).flatten()
Dy = cp.append(Dy, [cp.sqrt(3.0)/6.0], axis=False)
Dz = cp.outer(sin, Ey).flatten() Dz = cp.outer(sin, Ey).flatten()
Dz = cp.append(Dz, [cp.sqrt(2.0/3.0)], axis=False)
return Dx, Dy, Dz return Dx, Dy, Dz
\ No newline at end of file
from numba import cuda from numba import cuda
import cupy as cp import cupy as cp
from utils import compute_lcm
with open('epgpu.cu') as f: with open('epgpu.cu') as f:
code = f.read() code = f.read()
...@@ -9,10 +10,12 @@ ep_pontok_module = cp.RawModule(code=code, options=('--std=c++11',), name_expres ...@@ -9,10 +10,12 @@ ep_pontok_module = cp.RawModule(code=code, options=('--std=c++11',), name_expres
fun = ep_pontok_module.get_function(kers[0]) fun = ep_pontok_module.get_function(kers[0])
def start_kernel(Cx, Cy, Dx, Dy, Dz, v, w):
    """Launch the equilibrium kernel over every (C, D) combination.

    Args:
        Cx, Cy: arrays holding the coordinates of the C points (same length).
        Dx, Dy, Dz: arrays holding the coordinates of the D points
            (same length).
        v, w: subdivision counts forwarded unchanged to the kernel.

    Returns:
        An int8 array of shape ``(Cx.size * Dx.size, 4, 4)`` that the kernel
        fills in, one 4x4 matrix per launched thread position.
    """
    pair_count = Cx.size * Dx.size
    print(f"Cnt: {Cx.size}x{Dx.size}={Cx.size*Dx.size}")
    # One int8 per cell of a 4x4 matrix -> 16 bytes per combination.
    print("Res size (byte): ", pair_count*4*4)
    lcm = compute_lcm(Cx.size, Dx.size)
    egyensulyi_mtx = cp.zeros((pair_count, 4, 4), dtype=cp.int8)
    threads_per_block = 256
    # Integer ceil-division; avoids the float round trip of int(a / b).
    numBlock = (pair_count + threads_per_block - 1) // threads_per_block
    # NOTE(review): Python ints are handed to parameters the kernel declares
    # as `int`; confirm against the CuPy raw-kernel argument rules whether an
    # explicit 32-bit cast is wanted here.
    fun((numBlock,), (threads_per_block,),
        (v, w, Cx, Cy, Dx, Dy, Dz, Cx.size, Dx.size, lcm, egyensulyi_mtx))
    return egyensulyi_mtx
...@@ -2,6 +2,7 @@ import sys, getopt ...@@ -2,6 +2,7 @@ import sys, getopt
from genax import gen_angels_to_pick, angles_alap, angles_ratet from genax import gen_angels_to_pick, angles_alap, angles_ratet
from gpu import start_kernel from gpu import start_kernel
from utils import convert, printresults
def main(argv): def main(argv):
outputfile = 'out.txt' outputfile = 'out.txt'
...@@ -34,13 +35,13 @@ def main(argv): ...@@ -34,13 +35,13 @@ def main(argv):
PLOT = True PLOT = True
space = gen_angels_to_pick(n, PLOT) space = gen_angels_to_pick(n, PLOT)
Cx, Cy = angles_alap(space, PLOT) Cx, Cy = angles_alap(space, PLOT)
Dx, Dy, Dz = angles_ratet(space) Dx, Dy, Dz = angles_ratet(space)
res = start_kernel(Cx, Cy, Dx, Dy, Dz, v, w) res = start_kernel(Cx, Cy, Dx, Dy, Dz, v, w)
printresults(res)
#print(res)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1:]) main(sys.argv[1:])
\ No newline at end of file
import cupy as cp import cupy as cp
import numpy as np
def expSpace(min, max, N, exponentialliness = 20.0):
    """Return N float64 samples spaced exponentially between min and max.

    The exponents are taken linearly from 0 to log10(exponentialliness + 1),
    so ``10**exponent - 1`` runs from 0 to ``exponentialliness``; that range
    is then rescaled onto ``[min, max]``.
    """
    log_stop = cp.log10(exponentialliness+1, dtype=cp.float64)
    exponents = cp.linspace(0, log_stop, N, dtype=cp.float64)
    scale = (max-min)/exponentialliness
    return scale * (10.0**exponents - 1) + min
\ No newline at end of file
def convert(egyensulyi_mtx):
    """Flatten a sequence of 4x4 flag matrices into a two-column pair table.

    For every matrix, each cell ``[S][U]`` equal to 1 contributes the row
    ``(S + 1, U + 1)``, visited in row-major order. A ``(0, 0)`` row is
    appended after each matrix as a separator.

    NOTE(review): SOURCE lost its indentation; this assumes the separator is
    appended once per matrix -- confirm against the original file.

    Args:
        egyensulyi_mtx: iterable of 4x4 indexable matrices.

    Returns:
        A 2-D array with two columns; shape ``(0, 2)`` with dtype int8 when
        the input contains no matrices, default integer dtype otherwise.
    """
    # Collect rows in a list and build the array once; calling np.append per
    # value copies the whole array each time (quadratic in the output size).
    rows = []
    for matrix in egyensulyi_mtx:
        for S in range(4):
            for U in range(4):
                if matrix[S][U] == 1:
                    rows.append((S + 1, U + 1))
        rows.append((0, 0))
    if not rows:
        return np.empty((0, 2), dtype=np.int8)
    return np.array(rows)
def printresults(egyensulyi_mtx):
    """Print, for each 4x4 flag matrix, the (S, U) pairs whose cell equals 1.

    For every matrix three things are written to stdout: a ``"<count>x2"``
    header, the two-column table of 1-based ``(S + 1, U + 1)`` pairs in
    row-major order, and an empty line.

    NOTE(review): SOURCE lost its indentation; this assumes the header, table
    and empty line are printed once per matrix -- confirm against the
    original file.

    Args:
        egyensulyi_mtx: iterable of 4x4 indexable matrices.
    """
    for matrix in egyensulyi_mtx:
        pairs = [
            (S + 1, U + 1)
            for S in range(4)
            for U in range(4)
            if matrix[S][U] == 1
        ]
        # An empty list would become a 1-D array; keep the (0, 2) shape so the
        # table is always two-column.
        if pairs:
            table = np.array(pairs)
        else:
            table = np.empty((0, 2), dtype=np.int8)
        print(f"{len(pairs)}x2")
        print(table)
        print()
def compute_lcm(x, y):
    """Return the least common multiple of two positive integers.

    Args:
        x: first positive integer.
        y: second positive integer.

    Returns:
        The smallest positive integer divisible by both ``x`` and ``y``.

    Raises:
        ZeroDivisionError: if ``x`` or ``y`` is zero (same exception the
            previous trial-division implementation raised).
    """
    import math  # local import keeps this block self-contained

    if x == 0 or y == 0:
        raise ZeroDivisionError("compute_lcm() is undefined for zero")
    # lcm(x, y) * gcd(x, y) == |x * y|; this avoids stepping through every
    # candidate from max(x, y) upwards, which takes O(lcm) iterations.
    return abs(x * y) // math.gcd(x, y)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment