Bug repair (#60)
* Better memory management on smaller GPUs
* Correct an issue where non-NumPy array patterns (such as Dask arrays) are sent to be indexed.
Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
drowenhorst-nrl authored May 30, 2024
1 parent 8c09e6b commit ee544d9
Showing 5 changed files with 23 additions and 7 deletions.
2 changes: 2 additions & 0 deletions pyebsdindex/opencl/band_detect_cl.py
@@ -225,6 +225,8 @@ def find_bands(self, patternsIn, verbose=0, clparams=None, chunksize=528, useCPU
def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, background = None, returnBuff = True, clparams=None ):
# this function executes the Radon summations on the GPU
tic = timer()
+ image = np.asarray(image)

# make sure we have an OpenCL environment
if clparams is not None:
if clparams.queue is None:
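The np.asarray(image) guard above is the fix for the second bullet of the commit message: np.asarray is a no-op for a NumPy input, but it materializes array-likes such as Dask arrays through their __array__ protocol before the data is staged for the GPU. A minimal sketch of the failure mode it prevents (assumes dask is installed; not part of this commit):

import numpy as np
import dask.array as da

# A lazy, chunked stand-in for patterns loaded out of core.
patterns = da.ones((10, 60, 60), chunks=(5, 60, 60), dtype=np.float32)

# A Dask array has no contiguous host buffer, so downstream code that
# slices and reshapes for a GPU transfer can fail or mis-index.
image = np.asarray(patterns)   # computes the graph, returns a plain ndarray
assert isinstance(image, np.ndarray)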
10 changes: 8 additions & 2 deletions pyebsdindex/opencl/nlpar_cl.py
@@ -264,6 +264,7 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa


sigmachunk_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, size=sigmachunk.nbytes)

cl.enqueue_barrier(queue)
prg.calcsigma(queue, (np.uint32(ncolchunk), np.uint32(nrowchunk)), None,
datapad_gpu, mask_gpu,sigmachunk_gpu,
@@ -404,7 +405,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
clvectlen = 16



# print("target mem:", target_mem)
chunks = self._calcchunks( [pwidth, pheight], ncols, nrows, target_bytes=target_mem,
col_overlap=sr, row_overlap=sr)
#print(chunks[2], chunks[3])
@@ -426,10 +427,14 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
nchunks = chunksize.size
#return chunks, chunksize
mxchunk = int(chunksize.max())
# print("max chunk:" , mxchunk)

npadmx = clvectlen * int(np.ceil(float(mxchunk)*npat_point/ clvectlen))

datapad_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
datapadout_gpu = cl.Buffer(ctx, mf.READ_WRITE, size=int(npadmx) * int(4))
# print("data pad", datapad_gpu.size)
# print("data out", datapadout_gpu.size)

nnn = int((2 * sr + 1) ** 2)

@@ -469,14 +474,15 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation

sigmachunk = np.ascontiguousarray(sigma[rstart:rend, cstart:cend].astype(np.float32))
sigmachunk_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=sigmachunk)
# print("sigma", sigmachunk_gpu.size)
szdata = data.size
npad = clvectlen * int(np.ceil(szdata / clvectlen))

#datapad = np.zeros((npad), dtype=np.float32) + np.float32(mxval + 10)
#datapad[0:szdata] = data.reshape(-1)

data_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=data)

# print("data", data_gpu.size)
if data.dtype.type is np.float32:
prg.nlloadpat32flt(queue, (np.uint64(data.size),1), None, data_gpu, datapad_gpu, wait_for=[filldatain])
if data.dtype.type is np.ubyte:
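For readers following the buffer sizing in calcnlpar_cl above: npadmx rounds the largest chunk's float count up to a whole multiple of clvectlen, so the kernels can read and write 16-wide vectors without a ragged tail. A worked example with assumed pattern dimensions (illustrative numbers only):

import numpy as np

clvectlen = 16           # kernel vector width, as in the code above
mxchunk = 528            # patterns in the largest chunk (assumed)
npat_point = 60 * 60     # pixels per pattern (assumed)

# Round up to a whole number of vectors; each float32 is 4 bytes.
npadmx = clvectlen * int(np.ceil(float(mxchunk) * npat_point / clvectlen))
print(npadmx % clvectlen == 0)   # True
print(npadmx * 4)                # buffer size in bytes: 7603200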
13 changes: 9 additions & 4 deletions pyebsdindex/opencl/nlpar_clray.py
@@ -119,9 +119,10 @@ def calcsigma_clray(self, nn=1, saturation_protect=True, automask=True, normaliz
normalize_d=normalize_d,
gpu_id=gpu_id, **kwargs)

- target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 3
- max_mem = clparams.gpu[gpu_id].global_mem_size * 0.75
+ target_mem = clparams.gpu[gpu_id].max_mem_alloc_size // 2
+ max_mem = clparams.gpu[gpu_id].global_mem_size * 0.5
if target_mem * ngpuwrker > max_mem:
#print('revisemem:')
target_mem = max_mem / ngpuwrker

patternfile = self.getinfileobj()
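These are the "better memory management on smaller GPUs" numbers: each GPU worker now budgets half (was a third) of the device's largest single allocation, and all workers together are capped at half (was three quarters) of global memory, 0.4 in calcnlpar_clray below. A worked sketch under assumed device limits (not real device queries):

# Assumed limits: a 4 GiB GPU that allows 2 GiB per allocation,
# shared by four GPU workers.
ngpuwrker = 4
max_mem_alloc_size = 2 * 2**30   # stands in for CL_DEVICE_MAX_MEM_ALLOC_SIZE
global_mem_size = 4 * 2**30      # stands in for CL_DEVICE_GLOBAL_MEM_SIZE

target_mem = max_mem_alloc_size // 2    # per-worker budget: 1 GiB
max_mem = global_mem_size * 0.5         # combined cap: 2 GiB
if target_mem * ngpuwrker > max_mem:    # 4 GiB requested > 2 GiB cap
    target_mem = max_mem / ngpuwrker    # shrink to 512 MiB per worker
print(target_mem / 2**20, 'MiB per worker')   # 512.0 MiB per worker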
@@ -479,7 +480,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat
gpu_id= gpu_id)

target_mem = clparams.gpu[gpu_id].max_mem_alloc_size//3
- max_mem = clparams.gpu[gpu_id].global_mem_size*0.75
+ max_mem = clparams.gpu[gpu_id].global_mem_size*0.4
if target_mem*ngpuwrker > max_mem:
target_mem = max_mem/ngpuwrker
#print(target_mem/1.0e9)
@@ -545,7 +546,7 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat
if len(jobqueue) > 0:
if len(idlewrker) > 0:
wrker = idlewrker.pop()
- job = jobqueue.pop()
+ job = jobqueue.pop(0)

tasks.append(wrker.runnlpar_chunk.remote(job, nlparobj=nlpar_remote))
busywrker.append(wrker)
@@ -561,6 +562,10 @@ def calcnlpar_clray(self, searchradius=None, lam = None, dthresh = None, saturat
ndone += 1
if verbose >= 2:
print("tiles complete: ", ndone, "/", njobs, sep='', end='\r')
+ else:  # an error has occurred; hopefully the chunk just needs a re-process
+ jobqueue.append(job)
+ print(message)

if verbose >= 2:
print('\n', end='')
return str(self.patternfileout.filepath)
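Two fixes meet in the scheduling loop above: jobqueue.pop(0) hands out chunks first-in-first-out (a bare pop() would take the newest tile first), and a chunk whose worker reports an error is appended back onto the queue for another attempt. A simplified, Ray-free sketch of that pattern; every name here is illustrative rather than the module's API:

def run_jobs(jobqueue, workers, run_chunk):
    '''Dispatch chunks to workers FIFO, re-queueing failed chunks.'''
    idle = list(workers)
    njobs, ndone = len(jobqueue), 0
    while ndone < njobs:
        wrker = idle.pop()
        job = jobqueue.pop(0)       # FIFO keeps tiles in scan order
        ok, message = run_chunk(wrker, job)
        idle.append(wrker)
        if ok:
            ndone += 1
        else:                       # error: hopefully transient, so retry
            jobqueue.append(job)
            print(message)

In the real method the chunks run asynchronously through Ray remote calls, so the idle/busy worker bookkeeping is what keeps every GPU fed while results trickle back.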
2 changes: 1 addition & 1 deletion pyebsdindex/opencl/openclparam.py
@@ -25,7 +25,7 @@
from os import path
import pyopencl as cl
from os import environ
- environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
+ environ['PYOPENCL_COMPILER_OUTPUT'] = '0'

RADDEG = 180.0/np.pi
DEGRAD = np.pi/180.0
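Setting PYOPENCL_COMPILER_OUTPUT to '0' silences the device compiler log that PyOpenCL otherwise echoes as warnings on every kernel build; the old value of '1' was a debugging aid. A sketch of flipping it back on in user code (assumes a working OpenCL platform):

from os import environ
environ['PYOPENCL_COMPILER_OUTPUT'] = '1'   # surface compiler warnings again

import pyopencl as cl

ctx = cl.create_some_context()
prg = cl.Program(ctx, '__kernel void noop(__global float *x) {}').build()
# With '1', .build() emits the compiler log even on success; with '0'
# it stays quiet unless the build actually fails.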
3 changes: 3 additions & 0 deletions pyebsdindex/radon_fast.py
@@ -235,6 +235,7 @@ def radon_fast(self, imageIn, padding = np.array([0,0]), fixArtifacts = False,

def radon_faster(self,imageIn,padding = np.array([0,0]), fixArtifacts = False, background = None, normalization=True):
tic = timer()

shapeIm = np.shape(imageIn)
if imageIn.ndim == 2:
nIm = 1
@@ -244,11 +245,13 @@ def radon_faster(self,imageIn,padding = np.array([0,0]), fixArtifacts = False, b
nIm = shapeIm[0]
# reform = False


if background is None:
image = (imageIn.reshape(-1)).astype(np.float32)
else:
image = imageIn - background
image = (image.reshape(-1)).astype(np.float32)
+ image = np.asarray(image)

nPx = shapeIm[-1]*shapeIm[-2]
indxDim = np.asarray(self.indexPlan.shape)
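This is the same Dask guard as in band_detect_cl.py, but it must sit after the background subtraction: subtracting a NumPy background from a lazy array yields another lazy array, so coercing imageIn alone would not be enough. A short sketch (assumes dask; shapes are illustrative):

import numpy as np
import dask.array as da

imageIn = da.ones((4, 60, 60), chunks=(2, 60, 60), dtype=np.float32)
background = np.zeros((60, 60), dtype=np.float32)

image = imageIn - background   # dask in, dask out: laziness propagates
image = np.asarray(image)      # materialize before flattening and indexing
image = image.reshape(-1).astype(np.float32)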
