diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a5a0d15..62acd3c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,16 @@ Changelog All notable changes to PyEBSDIndex will be documented in this file. The format is based on `Keep a Changelog `_. +0.3.5 (2024-06-07) +================== + +Fixed +----- +- Further tweaking of NLPAR GPU memory limits for Apple-ARM. +- Many small type fixes for numpy 2.0 compatibility. +- Corrected GPU detection for distributed indexing. +- Fixed issue where slower machines would erroneously detect a GPU timeout. + 0.3.4 (2024-06-07) ================== diff --git a/pyebsdindex/__init__.py b/pyebsdindex/__init__.py index 53da16a..b8a23bf 100644 --- a/pyebsdindex/__init__.py +++ b/pyebsdindex/__init__.py @@ -7,7 +7,7 @@ ] __description__ = "Python based tool for Radon based EBSD indexing" __name__ = "pyebsdindex" -__version__ = "0.3.4" +__version__ = "0.3.5" # Try to import only once - also will perform check that at least one GPU is found. diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py index 2be1994..34e6798 100644 --- a/pyebsdindex/_ebsd_index_parallel.py +++ b/pyebsdindex/_ebsd_index_parallel.py @@ -48,6 +48,7 @@ from pyebsdindex import band_detect as band_detect RAYIPADDRESS = '127.0.0.1' +#RAYIPADDRESS = '0.0.0.0' OSPLATFORM = platform.system() #if OSPLATFORM == 'Darwin': # RAYIPADDRESS = '0.0.0.0' # the localhost address does not work on macOS when on a VPN @@ -309,10 +310,13 @@ def index_pats_distributed( if ngpu > 0: - ngpupro = max(12, ngpu*8) # number of processes that will serve data to the gpu + gpuratio = (12, ngpu*4) + if (platform.machine(), platform.system()) == ('x86_64', 'Darwin'): + gpuratio = (6, ngpu*6) + ngpupro = min(max(gpuratio), 12) # number of processes that will serve data to the gpu #ngpupro = 8 if n_cpu_nodes < 8: - ngpupro = min(ngpupro,8) + ngpupro = min(ngpupro, n_cpu_nodes) if n_cpu_nodes < 2: ngpupro = 2 #if OSPLATFORM == 'Linux': @@ -369,7 +373,7 @@ def 
index_pats_distributed( # fall back to CPU only calculation. clparamfunction = band_detect.getopenclparam # Set up the jobs - njobs = (np.ceil(npats / chunksize)).astype(np.compat.long) + njobs = (np.ceil(npats / chunksize)).astype(np.int64) p_indx_start_end = [ [i * chunksize + patstart, (i + 1) * chunksize + patstart, chunksize] @@ -458,7 +462,7 @@ def index_pats_distributed( #gpu_launched += 1 - gpuwrker_cycles = 0 + gpuwrker_cycles = -500 cpuwrker_cycles = 0 while ncpudone < njobs: @@ -765,6 +769,7 @@ def __optimizegpuchunk__(indexer, ngpupro, gpu_id, clparam): @ray.remote(num_cpus=1, num_gpus=1) class GPUWorker: def __init__(self, actorid=0, clparammodule=None, gpu_id=None, cudavis = '0'): + #del os.environ['CUDA_VISIBLE_DEVICES'] # sys.path.append(path.dirname(path.dirname(__file__))) # do this to help Ray find the program files # import openclparam # do this to help Ray find the program files # device, context, queue, program, mf diff --git a/pyebsdindex/band_detect.py b/pyebsdindex/band_detect.py index af5abb9..f1b222a 100644 --- a/pyebsdindex/band_detect.py +++ b/pyebsdindex/band_detect.py @@ -396,7 +396,7 @@ def find_bands(self, patternsIn, verbose=0, chunksize=-1, **kwargs): chunksize = nPats chunk_start_end = [[0,nPats]] else: - nchunks = (np.ceil(nPats / chunksize)).astype(np.compat.long) + nchunks = (np.ceil(nPats / chunksize)).astype(np.int64) chunk_start_end = [[i * chunksize, (i + 1) * chunksize] for i in range(nchunks)] chunk_start_end[-1][1] = nPats diff --git a/pyebsdindex/ebsd_pattern.py b/pyebsdindex/ebsd_pattern.py index 54b84a5..a8bf20c 100644 --- a/pyebsdindex/ebsd_pattern.py +++ b/pyebsdindex/ebsd_pattern.py @@ -502,8 +502,10 @@ def pat_reader(self, patStart=0, nPatToRead=1): typeread = self.filedatatype typebyte = self.filedatatype(0).nbytes - f.seek(int(nPerPat * patStart * typebyte),1) - readpats = np.fromfile(f,dtype=typeread,count=int(nPatToRead * nPerPat)) + + f.seek(int(np.int64(nPerPat) * np.int64(patStart) * typebyte),1) + 
readpats = np.fromfile(f,dtype=typeread,count=np.int64(np.int64(nPatToRead) * np.int64(nPerPat))) + readpats = readpats.reshape(nPatToRead,self.patternH,self.patternW) f.close() yx = np.unravel_index(np.arange(int(patStart), int(patStart+nPatToRead), dtype = np.uint64), diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py index 77f79a2..8830f97 100644 --- a/pyebsdindex/opencl/band_detect_cl.py +++ b/pyebsdindex/opencl/band_detect_cl.py @@ -87,7 +87,7 @@ def find_bands(self, patternsIn, verbose=0, clparams=None, chunksize=528, useCPU nchunks = 1 chunksize = nPats else: - nchunks = (np.ceil(nPats / chunksize)).astype(np.compat.long) + nchunks = (np.ceil(nPats / chunksize)).astype(np.int64) chunk_start_end = [[i * chunksize,(i + 1) * chunksize] for i in range(nchunks)] chunk_start_end[-1][1] = nPats @@ -270,7 +270,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=radon.nbytes) #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE | mf.COPY_HOST_PTR,hostbuf=radon) image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image) - imstep = np.uint64(np.product(shapeIm[-2:])) + imstep = np.uint64(np.prod(shapeIm[-2:], dtype=int)) tic = timer() nImChunk = np.uint64(nImCL/clvtypesize) diff --git a/pyebsdindex/opencl/clkernels.cl b/pyebsdindex/opencl/clkernels.cl index b712d2f..b045f90 100644 --- a/pyebsdindex/opencl/clkernels.cl +++ b/pyebsdindex/opencl/clkernels.cl @@ -70,7 +70,7 @@ __kernel void loaduint16( const __global ushort *im1, __global float *im1flt, co // simple program to convert a float to float and transpose array -__kernel void loaduufloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL) +__kernel void loadfloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL) { const unsigned long int x = get_global_id(0); const unsigned long int y = get_global_id(1); diff --git 
a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py index a3307a2..28a239c 100644 --- a/pyebsdindex/opencl/nlpar_cl.py +++ b/pyebsdindex/opencl/nlpar_cl.py @@ -163,7 +163,7 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa #print(gpu_id) clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl') clparams.get_queue() - target_mem = clparams.queue.device.max_mem_alloc_size//2 + target_mem = min(clparams.queue.device.max_mem_alloc_size//2, np.int64(4e9)) ctx = clparams.ctx prg = clparams.prg queue = clparams.queue @@ -400,7 +400,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation #print(gpu_id) clparams.get_context(gpu_id=gpu_id, kfile ='clnlpar.cl') clparams.get_queue() - target_mem = clparams.queue.device.max_mem_alloc_size//2 + target_mem = min(clparams.queue.device.max_mem_alloc_size//4, int(2e9)) ctx = clparams.ctx prg = clparams.prg queue = clparams.queue diff --git a/pyebsdindex/opencl/openclparam.py b/pyebsdindex/opencl/openclparam.py index 28f77f8..fa0206d 100644 --- a/pyebsdindex/opencl/openclparam.py +++ b/pyebsdindex/opencl/openclparam.py @@ -88,6 +88,7 @@ def get_gpu(self): else: pass self.gpu = gpu + self.ngpu = len(gpu) return self.gpu diff --git a/pyebsdindex/opencl/radon_fast_cl.py b/pyebsdindex/opencl/radon_fast_cl.py index 5fc8b75..fd8b3b9 100644 --- a/pyebsdindex/opencl/radon_fast_cl.py +++ b/pyebsdindex/opencl/radon_fast_cl.py @@ -98,7 +98,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image_align) rdnIndx_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=self.indexPlan) - imstep = np.uint64(np.product(shapeIm[-2:])) + imstep = np.uint64(np.prod(shapeIm[-2:], dtype=int)) indxstep = np.uint64(self.indexPlan.shape[-1]) rdnstep = np.uint64(self.nRho * self.nTheta) diff --git a/pyebsdindex/pcopt.py b/pyebsdindex/pcopt.py index 009f7b7..2320e19 100644 --- 
a/pyebsdindex/pcopt.py +++ b/pyebsdindex/pcopt.py @@ -435,9 +435,9 @@ def initializeswarm(self, start=None, bounds=None): self.vellimit = 4*np.mean(np.sqrt(np.sum(self.vel**2, axis=1))) - self.pbest = np.zeros(self.n_particles) + np.infty + self.pbest = np.zeros(self.n_particles) + np.inf self.pbest_loc = np.copy(self.pos) - self.gbest = np.infty + self.gbest = np.inf self.gbest_loc = start diff --git a/pyebsdindex/radon_fast.py b/pyebsdindex/radon_fast.py index 7ce1534..300d4fc 100644 --- a/pyebsdindex/radon_fast.py +++ b/pyebsdindex/radon_fast.py @@ -134,9 +134,9 @@ def radon_plan_setup(self, image=None, imageDim=None, nTheta=None, nRho=None, rh #else: #indx_x = np.ceil(a[i] * n + b1).astype(np.int64) indx_x = np.round(a[i] * n + b1).astype(np.int64) - indx_x = np.where(indx_x < 0, outofbounds, indx_x) - indx_x = np.where(indx_x >= self.imDim[1], outofbounds, indx_x) - indx1D = np.clip(indx_x+self.imDim[1]*n, 0, outofbounds) + indx_x = np.where(indx_x < 0, outofbounds, indx_x).astype(np.int64) + indx_x = np.where(indx_x >= self.imDim[1], outofbounds, indx_x).astype(np.int64) + indx1D = np.clip(indx_x+self.imDim[1]*n, 0, outofbounds).astype(np.int64) # for j in range(self.nRho): # indx_good = indx1D[j,:].flatten() # whgood = np.nonzero(indx_good < outofbounds)[0] @@ -151,10 +151,10 @@ def radon_plan_setup(self, image=None, imageDim=None, nTheta=None, nRho=None, rh # indx1D[j, 0:whmask.size] = newindex[whmask] self.indexPlan[:, i, 0:self.imDim[0]] = indx1D - tempindx = self.indexPlan.flatten() - mask = np.concatenate( (self.mask.flatten(), np.array([0,0]))) + tempindx = self.indexPlan.flatten().astype(np.int64) + mask = np.concatenate( (self.mask.flatten().astype(np.int64), np.array([0,0], dtype=np.int64))) tempindx = np.where(mask[tempindx] > 0, tempindx, outofbounds) - maskindex = np.concatenate((self.maskindex.flatten(), np.array([-1,-1]))) + maskindex = np.concatenate((self.maskindex.flatten(), np.array([-1,-1]))).astype(np.int64) tempindx = 
np.where(maskindex[tempindx] >= 0, maskindex[tempindx], outofbounds) self.indexPlan = tempindx.reshape([self.nRho,self.nTheta,self.imDim.max()]) self.indexPlan.sort(axis = -1) @@ -331,7 +331,7 @@ def radon2pole(self,bandData,PC=None,vendor='EDAX'): stheta = np.sin(theta) ctheta = np.cos(theta) - pctemp = np.asfarray(PC).copy() + pctemp = np.asarray(PC, dtype=np.float32).copy() shapet = pctemp.shape if ven != 'EMSOFT': if len(shapet) < 2: diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py index 67fcbd1..1b39eb4 100644 --- a/pyebsdindex/tripletvote.py +++ b/pyebsdindex/tripletvote.py @@ -407,7 +407,7 @@ def build_trip_lib(self): #print(indx0FID) #This completely over previsions the arrays, this is essentially #N Choose K with N = number of angles and K = 3 - nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))/np.compat.long(math.factorial(3)) + nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))//np.int64(math.factorial(3)) nlib = nlib.astype(int) libANG = np.zeros((nlib, 3)) @@ -792,7 +792,7 @@ def _refine_orientation(self, bandnorms, whGood, polematch): tic = timer() poles = self.tripLib.completelib['polesCart'] nGood = whGood.size - n2Fit = np.int64(np.product(np.arange(2)+(nGood-2+1))/np.int64(2)) + n2Fit = np.int64(np.prod(np.arange(2)+(nGood-2+1), dtype=int)//np.int64(2)) whGood = np.asarray(whGood,dtype=np.int64) #AB, ABgood = self.orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit) # tic = timer()