diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index a5a0d15..62acd3c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -5,6 +5,16 @@ Changelog
 All notable changes to PyEBSDIndex will be documented in this file. The format is based
 on `Keep a Changelog <https://keepachangelog.com/en/1.1.0>`_.
 
+0.3.5 (2024-06-07)
+==================
+
+Fixed
+-----
+- Further tweaking of NLPAR GPU memory limits for Apple-ARM.
+- Many small type fixes for numpy 2.0 compatibility.
+- Corrected GPU detection for distributed indexing.
+- Fixed issue where slower machines would erroneously detect a GPU timeout.
+
 
 0.3.4 (2024-06-07)
 ==================
diff --git a/pyebsdindex/__init__.py b/pyebsdindex/__init__.py
index 53da16a..b8a23bf 100644
--- a/pyebsdindex/__init__.py
+++ b/pyebsdindex/__init__.py
@@ -7,7 +7,7 @@
 ]
 __description__ = "Python based tool for Radon based EBSD indexing"
 __name__ = "pyebsdindex"
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 
 
 # Try to import only once - also will perform check that at least one GPU is found.
diff --git a/pyebsdindex/_ebsd_index_parallel.py b/pyebsdindex/_ebsd_index_parallel.py
index 2be1994..34e6798 100644
--- a/pyebsdindex/_ebsd_index_parallel.py
+++ b/pyebsdindex/_ebsd_index_parallel.py
@@ -48,6 +48,7 @@
     from pyebsdindex import band_detect as band_detect
 
 RAYIPADDRESS = '127.0.0.1'
+#RAYIPADDRESS = '0.0.0.0'
 OSPLATFORM  = platform.system()
 #if OSPLATFORM  == 'Darwin':
 #    RAYIPADDRESS = '0.0.0.0'  # the localhost address does not work on macOS when on a VPN
@@ -309,10 +310,13 @@ def index_pats_distributed(
 
 
     if ngpu > 0:
-        ngpupro = max(12, ngpu*8)  # number of processes that will serve data to the gpu
+        gpuratio = (12, ngpu*4)
+        if (platform.machine(), platform.system()) == ('x86_64', 'Darwin'):
+            gpuratio = (6, ngpu*6)
+        ngpupro = min(max(gpuratio), 12)  # number of processes that will serve data to the gpu
         #ngpupro = 8
         if n_cpu_nodes < 8:
-            ngpupro = min(ngpupro,8)
+            ngpupro = min(ngpupro, n_cpu_nodes)
         if n_cpu_nodes < 2:
             ngpupro = 2
         #if OSPLATFORM == 'Linux':
@@ -369,7 +373,7 @@ def index_pats_distributed(
     # fall back to CPU only calculation.
     clparamfunction = band_detect.getopenclparam
     # Set up the jobs
-    njobs = (np.ceil(npats / chunksize)).astype(np.compat.long)
+    njobs = (np.ceil(npats / chunksize)).astype(np.int64)
 
     p_indx_start_end = [
         [i * chunksize + patstart, (i + 1) * chunksize + patstart, chunksize]
@@ -458,7 +462,7 @@ def index_pats_distributed(
 
         #gpu_launched += 1
 
-    gpuwrker_cycles = 0
+    gpuwrker_cycles = -500
     cpuwrker_cycles = 0
 
     while ncpudone < njobs:
@@ -765,6 +769,7 @@ def __optimizegpuchunk__(indexer, ngpupro, gpu_id, clparam):
 @ray.remote(num_cpus=1, num_gpus=1)
 class GPUWorker:
     def __init__(self, actorid=0, clparammodule=None, gpu_id=None, cudavis = '0'):
+        #del os.environ['CUDA_VISIBLE_DEVICES']
         # sys.path.append(path.dirname(path.dirname(__file__)))  # do this to help Ray find the program files
         # import openclparam # do this to help Ray find the program files
         # device, context, queue, program, mf
diff --git a/pyebsdindex/band_detect.py b/pyebsdindex/band_detect.py
index af5abb9..f1b222a 100644
--- a/pyebsdindex/band_detect.py
+++ b/pyebsdindex/band_detect.py
@@ -396,7 +396,7 @@ def find_bands(self, patternsIn, verbose=0, chunksize=-1,  **kwargs):
       chunksize = nPats
       chunk_start_end = [[0,nPats]]
     else:
-      nchunks = (np.ceil(nPats / chunksize)).astype(np.compat.long)
+      nchunks = (np.ceil(nPats / chunksize)).astype(np.int64)
       chunk_start_end = [[i * chunksize, (i + 1) * chunksize] for i in range(nchunks)]
       chunk_start_end[-1][1] = nPats
 
diff --git a/pyebsdindex/ebsd_pattern.py b/pyebsdindex/ebsd_pattern.py
index 54b84a5..a8bf20c 100644
--- a/pyebsdindex/ebsd_pattern.py
+++ b/pyebsdindex/ebsd_pattern.py
@@ -502,8 +502,10 @@ def pat_reader(self, patStart=0, nPatToRead=1):
     typeread = self.filedatatype
     typebyte = self.filedatatype(0).nbytes
 
-    f.seek(int(nPerPat * patStart * typebyte),1)
-    readpats = np.fromfile(f,dtype=typeread,count=int(nPatToRead * nPerPat))
+
+    f.seek(int(np.int64(nPerPat) * np.int64(patStart) * typebyte),1)
+    readpats = np.fromfile(f,dtype=typeread,count=np.int64(np.int64(nPatToRead) * np.int64(nPerPat)))
+
     readpats = readpats.reshape(nPatToRead,self.patternH,self.patternW)
     f.close()
     yx = np.unravel_index(np.arange(int(patStart), int(patStart+nPatToRead), dtype = np.uint64),
diff --git a/pyebsdindex/opencl/band_detect_cl.py b/pyebsdindex/opencl/band_detect_cl.py
index 77f79a2..8830f97 100644
--- a/pyebsdindex/opencl/band_detect_cl.py
+++ b/pyebsdindex/opencl/band_detect_cl.py
@@ -87,7 +87,7 @@ def find_bands(self, patternsIn, verbose=0, clparams=None, chunksize=528, useCPU
         nchunks = 1
         chunksize = nPats
       else:
-        nchunks = (np.ceil(nPats / chunksize)).astype(np.compat.long)
+        nchunks = (np.ceil(nPats / chunksize)).astype(np.int64)
 
       chunk_start_end = [[i * chunksize,(i + 1) * chunksize] for i in range(nchunks)]
       chunk_start_end[-1][1] = nPats
@@ -270,7 +270,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b
     #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=radon.nbytes)
     #radon_gpu = cl.Buffer(ctx,mf.READ_WRITE | mf.COPY_HOST_PTR,hostbuf=radon)
     image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image)
-    imstep = np.uint64(np.product(shapeIm[-2:]))
+    imstep = np.uint64(np.prod(shapeIm[-2:], dtype=int))
     tic = timer()
 
     nImChunk = np.uint64(nImCL/clvtypesize)
diff --git a/pyebsdindex/opencl/clkernels.cl b/pyebsdindex/opencl/clkernels.cl
index b712d2f..b045f90 100644
--- a/pyebsdindex/opencl/clkernels.cl
+++ b/pyebsdindex/opencl/clkernels.cl
@@ -70,7 +70,7 @@ __kernel void loaduint16( const __global ushort *im1, __global float *im1flt, co
 
 
 // simple program to convert a float to float and transpose array
-__kernel void loaduufloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL)
+__kernel void loadfloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL)
   {
   const unsigned long int x = get_global_id(0);
   const unsigned long int y = get_global_id(1);
diff --git a/pyebsdindex/opencl/nlpar_cl.py b/pyebsdindex/opencl/nlpar_cl.py
index a3307a2..28a239c 100644
--- a/pyebsdindex/opencl/nlpar_cl.py
+++ b/pyebsdindex/opencl/nlpar_cl.py
@@ -163,7 +163,7 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa
     #print(gpu_id)
     clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
     clparams.get_queue()
-    target_mem = clparams.queue.device.max_mem_alloc_size//2
+    target_mem = min(clparams.queue.device.max_mem_alloc_size//2, np.int64(4e9))
     ctx = clparams.ctx
     prg = clparams.prg
     queue = clparams.queue
@@ -400,7 +400,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
     #print(gpu_id)
     clparams.get_context(gpu_id=gpu_id, kfile ='clnlpar.cl')
     clparams.get_queue()
-    target_mem = clparams.queue.device.max_mem_alloc_size//2
+    target_mem = min(clparams.queue.device.max_mem_alloc_size//4, int(2e9))
     ctx = clparams.ctx
     prg = clparams.prg
     queue = clparams.queue
diff --git a/pyebsdindex/opencl/openclparam.py b/pyebsdindex/opencl/openclparam.py
index 28f77f8..fa0206d 100644
--- a/pyebsdindex/opencl/openclparam.py
+++ b/pyebsdindex/opencl/openclparam.py
@@ -88,6 +88,7 @@ def get_gpu(self):
     else:
       pass
     self.gpu = gpu
+    self.ngpu = len(gpu)
     return self.gpu
 
 
diff --git a/pyebsdindex/opencl/radon_fast_cl.py b/pyebsdindex/opencl/radon_fast_cl.py
index 5fc8b75..fd8b3b9 100644
--- a/pyebsdindex/opencl/radon_fast_cl.py
+++ b/pyebsdindex/opencl/radon_fast_cl.py
@@ -98,7 +98,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False,
     image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image_align)
     rdnIndx_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=self.indexPlan)
 
-    imstep = np.uint64(np.product(shapeIm[-2:]))
+    imstep = np.uint64(np.prod(shapeIm[-2:], dtype=int))
     indxstep = np.uint64(self.indexPlan.shape[-1])
     rdnstep = np.uint64(self.nRho * self.nTheta)
 
diff --git a/pyebsdindex/pcopt.py b/pyebsdindex/pcopt.py
index 009f7b7..2320e19 100644
--- a/pyebsdindex/pcopt.py
+++ b/pyebsdindex/pcopt.py
@@ -435,9 +435,9 @@ def initializeswarm(self, start=None, bounds=None):
         self.vellimit = 4*np.mean(np.sqrt(np.sum(self.vel**2, axis=1)))
 
 
-        self.pbest = np.zeros(self.n_particles) + np.infty
+        self.pbest = np.zeros(self.n_particles) + np.inf
         self.pbest_loc = np.copy(self.pos)
-        self.gbest = np.infty
+        self.gbest = np.inf
         self.gbest_loc = start
 
 
diff --git a/pyebsdindex/radon_fast.py b/pyebsdindex/radon_fast.py
index 7ce1534..300d4fc 100644
--- a/pyebsdindex/radon_fast.py
+++ b/pyebsdindex/radon_fast.py
@@ -134,9 +134,9 @@ def radon_plan_setup(self, image=None, imageDim=None, nTheta=None, nRho=None, rh
         #else:
           #indx_x = np.ceil(a[i] * n + b1).astype(np.int64)
         indx_x = np.round(a[i] * n + b1).astype(np.int64)
-        indx_x = np.where(indx_x < 0, outofbounds, indx_x)
-        indx_x = np.where(indx_x >= self.imDim[1], outofbounds, indx_x)
-        indx1D = np.clip(indx_x+self.imDim[1]*n, 0, outofbounds)
+        indx_x = np.where(indx_x < 0, outofbounds, indx_x).astype(np.int64)
+        indx_x = np.where(indx_x >= self.imDim[1], outofbounds, indx_x).astype(np.int64)
+        indx1D = np.clip(indx_x+self.imDim[1]*n, 0, outofbounds).astype(np.int64)
         # for j in range(self.nRho):
         #   indx_good = indx1D[j,:].flatten()
         #   whgood = np.nonzero(indx_good < outofbounds)[0]
@@ -151,10 +151,10 @@ def radon_plan_setup(self, image=None, imageDim=None, nTheta=None, nRho=None, rh
         #       indx1D[j, 0:whmask.size] = newindex[whmask]
 
         self.indexPlan[:, i, 0:self.imDim[0]] = indx1D
-    tempindx = self.indexPlan.flatten()
-    mask = np.concatenate( (self.mask.flatten(), np.array([0,0])))
+    tempindx = self.indexPlan.flatten().astype(np.int64)
+    mask = np.concatenate( (self.mask.flatten().astype(np.int64), np.array([0,0], dtype=np.int64)))
     tempindx = np.where(mask[tempindx] > 0, tempindx, outofbounds)
-    maskindex = np.concatenate((self.maskindex.flatten(), np.array([-1,-1])))
+    maskindex = np.concatenate((self.maskindex.flatten(), np.array([-1,-1]))).astype(np.int64)
     tempindx = np.where(maskindex[tempindx] >= 0, maskindex[tempindx], outofbounds)
     self.indexPlan = tempindx.reshape([self.nRho,self.nTheta,self.imDim.max()])
     self.indexPlan.sort(axis = -1)
@@ -331,7 +331,7 @@ def radon2pole(self,bandData,PC=None,vendor='EDAX'):
     stheta = np.sin(theta)
     ctheta = np.cos(theta)
 
-    pctemp =  np.asfarray(PC).copy()
+    pctemp =  np.asarray(PC, dtype=np.float32).copy()
     shapet = pctemp.shape
     if ven != 'EMSOFT':
       if len(shapet) < 2:
diff --git a/pyebsdindex/tripletvote.py b/pyebsdindex/tripletvote.py
index 67fcbd1..1b39eb4 100644
--- a/pyebsdindex/tripletvote.py
+++ b/pyebsdindex/tripletvote.py
@@ -407,7 +407,7 @@ def build_trip_lib(self):
     #print(indx0FID)
     #This completely over previsions the arrays, this is essentially 
     #N Choose K with N = number of angles and K = 3
-    nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))/np.compat.long(math.factorial(3))
+    nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))//np.int64(math.factorial(3))
     nlib = nlib.astype(int)
 
     libANG = np.zeros((nlib, 3))
@@ -792,7 +792,7 @@ def _refine_orientation(self, bandnorms, whGood, polematch):
     tic = timer()
     poles = self.tripLib.completelib['polesCart']
     nGood = whGood.size
-    n2Fit = np.int64(np.product(np.arange(2)+(nGood-2+1))/np.int64(2))
+    n2Fit = np.int64(np.prod(np.arange(2)+(nGood-2+1), dtype=int)//np.int64(2))
     whGood = np.asarray(whGood,dtype=np.int64)
     #AB, ABgood = self.orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit)
     # tic = timer()