Skip to content

Commit

Permalink
Bug fixes and numpy 2.0 compatibility. (#64)
Browse files Browse the repository at this point in the history
- Further tweaking of NLPAR GPU memory limits for Apple-ARM.
- Many small type fixes for numpy 2.0 compatibility.
- Corrected GPU detection for distributed indexing.
- Fixed issue where slower machines would erroneously detect a GPU timeout.

Signed-off-by: David Rowenhorst <david.rowenhorst@nrl.navy.mil>
  • Loading branch information
drowenhorst-nrl authored Jun 28, 2024
1 parent 93ee812 commit d39e7eb
Show file tree
Hide file tree
Showing 13 changed files with 43 additions and 25 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ Changelog
All notable changes to PyEBSDIndex will be documented in this file. The format is based
on `Keep a Changelog <https://keepachangelog.com/en/1.1.0>`_.

0.3.5 (2024-06-28)
==================

Fixed
-----
- Further tweaking of NLPAR GPU memory limits for Apple-ARM.
- Many small type fixes for numpy 2.0 compatibility.
- Corrected GPU detection for distributed indexing.
- Fixed issue where slower machines would erroneously detect a GPU timeout.


0.3.4 (2024-06-07)
==================
Expand Down
2 changes: 1 addition & 1 deletion pyebsdindex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
]
__description__ = "Python based tool for Radon based EBSD indexing"
__name__ = "pyebsdindex"
__version__ = "0.3.4"
__version__ = "0.3.5"


# Try to import only once - also will perform check that at least one GPU is found.
Expand Down
13 changes: 9 additions & 4 deletions pyebsdindex/_ebsd_index_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from pyebsdindex import band_detect as band_detect

RAYIPADDRESS = '127.0.0.1'
#RAYIPADDRESS = '0.0.0.0'
OSPLATFORM = platform.system()
#if OSPLATFORM == 'Darwin':
# RAYIPADDRESS = '0.0.0.0' # the localhost address does not work on macOS when on a VPN
Expand Down Expand Up @@ -309,10 +310,13 @@ def index_pats_distributed(


if ngpu > 0:
ngpupro = max(12, ngpu*8) # number of processes that will serve data to the gpu
gpuratio = (12, ngpu*4)
if (platform.machine(), platform.system()) == ('x86_64', 'Darwin'):
gpuratio = (6, ngpu*6)
ngpupro = min(max(gpuratio), 12) # number of processes that will serve data to the gpu
#ngpupro = 8
if n_cpu_nodes < 8:
ngpupro = min(ngpupro,8)
ngpupro = min(ngpupro, n_cpu_nodes)
if n_cpu_nodes < 2:
ngpupro = 2
#if OSPLATFORM == 'Linux':
Expand Down Expand Up @@ -369,7 +373,7 @@ def index_pats_distributed(
# fall back to CPU only calculation.
clparamfunction = band_detect.getopenclparam
# Set up the jobs
njobs = (np.ceil(npats / chunksize)).astype(np.compat.long)
njobs = (np.ceil(npats / chunksize)).astype(np.int64)

p_indx_start_end = [
[i * chunksize + patstart, (i + 1) * chunksize + patstart, chunksize]
Expand Down Expand Up @@ -458,7 +462,7 @@ def index_pats_distributed(

#gpu_launched += 1

gpuwrker_cycles = 0
gpuwrker_cycles = -500
cpuwrker_cycles = 0

while ncpudone < njobs:
Expand Down Expand Up @@ -765,6 +769,7 @@ def __optimizegpuchunk__(indexer, ngpupro, gpu_id, clparam):
@ray.remote(num_cpus=1, num_gpus=1)
class GPUWorker:
def __init__(self, actorid=0, clparammodule=None, gpu_id=None, cudavis = '0'):
#del os.environ['CUDA_VISIBLE_DEVICES']
# sys.path.append(path.dirname(path.dirname(__file__))) # do this to help Ray find the program files
# import openclparam # do this to help Ray find the program files
# device, context, queue, program, mf
Expand Down
2 changes: 1 addition & 1 deletion pyebsdindex/band_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def find_bands(self, patternsIn, verbose=0, chunksize=-1, **kwargs):
chunksize = nPats
chunk_start_end = [[0,nPats]]
else:
nchunks = (np.ceil(nPats / chunksize)).astype(np.compat.long)
nchunks = (np.ceil(nPats / chunksize)).astype(np.int64)
chunk_start_end = [[i * chunksize, (i + 1) * chunksize] for i in range(nchunks)]
chunk_start_end[-1][1] = nPats

Expand Down
6 changes: 4 additions & 2 deletions pyebsdindex/ebsd_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,8 +502,10 @@ def pat_reader(self, patStart=0, nPatToRead=1):
typeread = self.filedatatype
typebyte = self.filedatatype(0).nbytes

f.seek(int(nPerPat * patStart * typebyte),1)
readpats = np.fromfile(f,dtype=typeread,count=int(nPatToRead * nPerPat))

f.seek(int(np.int64(nPerPat) * np.int64(patStart) * typebyte),1)
readpats = np.fromfile(f,dtype=typeread,count=np.int64(np.int64(nPatToRead) * np.int64(nPerPat)))

readpats = readpats.reshape(nPatToRead,self.patternH,self.patternW)
f.close()
yx = np.unravel_index(np.arange(int(patStart), int(patStart+nPatToRead), dtype = np.uint64),
Expand Down
4 changes: 2 additions & 2 deletions pyebsdindex/opencl/band_detect_cl.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def find_bands(self, patternsIn, verbose=0, clparams=None, chunksize=528, useCPU
nchunks = 1
chunksize = nPats
else:
nchunks = (np.ceil(nPats / chunksize)).astype(np.compat.long)
nchunks = (np.ceil(nPats / chunksize)).astype(np.int64)

chunk_start_end = [[i * chunksize,(i + 1) * chunksize] for i in range(nchunks)]
chunk_start_end[-1][1] = nPats
Expand Down Expand Up @@ -270,7 +270,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False, b
#radon_gpu = cl.Buffer(ctx,mf.READ_WRITE,size=radon.nbytes)
#radon_gpu = cl.Buffer(ctx,mf.READ_WRITE | mf.COPY_HOST_PTR,hostbuf=radon)
image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image)
imstep = np.uint64(np.product(shapeIm[-2:]))
imstep = np.uint64(np.prod(shapeIm[-2:], dtype=int))
tic = timer()

nImChunk = np.uint64(nImCL/clvtypesize)
Expand Down
2 changes: 1 addition & 1 deletion pyebsdindex/opencl/clkernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ __kernel void loaduint16( const __global ushort *im1, __global float *im1flt, co


// simple program to convert a float to float and transpose array
__kernel void loaduufloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL)
__kernel void loadfloat32( const __global float *im1, __global float *im1flt, const unsigned long int nImCL)
{
const unsigned long int x = get_global_id(0);
const unsigned long int y = get_global_id(1);
Expand Down
4 changes: 2 additions & 2 deletions pyebsdindex/opencl/nlpar_cl.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def calcsigma_cl(self,nn=1,saturation_protect=True,automask=True, normalize_d=Fa
#print(gpu_id)
clparams.get_context(gpu_id=gpu_id, kfile = 'clnlpar.cl')
clparams.get_queue()
target_mem = clparams.queue.device.max_mem_alloc_size//2
target_mem = min(clparams.queue.device.max_mem_alloc_size//2, np.int64(4e9))
ctx = clparams.ctx
prg = clparams.prg
queue = clparams.queue
Expand Down Expand Up @@ -400,7 +400,7 @@ def calcnlpar_cl(self, searchradius=None, lam = None, dthresh = None, saturation
#print(gpu_id)
clparams.get_context(gpu_id=gpu_id, kfile ='clnlpar.cl')
clparams.get_queue()
target_mem = clparams.queue.device.max_mem_alloc_size//2
target_mem = min(clparams.queue.device.max_mem_alloc_size//4, int(2e9))
ctx = clparams.ctx
prg = clparams.prg
queue = clparams.queue
Expand Down
1 change: 1 addition & 0 deletions pyebsdindex/opencl/openclparam.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def get_gpu(self):
else:
pass
self.gpu = gpu
self.ngpu = len(gpu)
return self.gpu


Expand Down
2 changes: 1 addition & 1 deletion pyebsdindex/opencl/radon_fast_cl.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def radon_fasterCL(self,image,padding = np.array([0,0]), fixArtifacts = False,
image_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=image_align)
rdnIndx_gpu = cl.Buffer(ctx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=self.indexPlan)

imstep = np.uint64(np.product(shapeIm[-2:]))
imstep = np.uint64(np.prod(shapeIm[-2:], dtype=int))
indxstep = np.uint64(self.indexPlan.shape[-1])
rdnstep = np.uint64(self.nRho * self.nTheta)

Expand Down
4 changes: 2 additions & 2 deletions pyebsdindex/pcopt.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,9 +435,9 @@ def initializeswarm(self, start=None, bounds=None):
self.vellimit = 4*np.mean(np.sqrt(np.sum(self.vel**2, axis=1)))


self.pbest = np.zeros(self.n_particles) + np.infty
self.pbest = np.zeros(self.n_particles) + np.inf
self.pbest_loc = np.copy(self.pos)
self.gbest = np.infty
self.gbest = np.inf
self.gbest_loc = start


Expand Down
14 changes: 7 additions & 7 deletions pyebsdindex/radon_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ def radon_plan_setup(self, image=None, imageDim=None, nTheta=None, nRho=None, rh
#else:
#indx_x = np.ceil(a[i] * n + b1).astype(np.int64)
indx_x = np.round(a[i] * n + b1).astype(np.int64)
indx_x = np.where(indx_x < 0, outofbounds, indx_x)
indx_x = np.where(indx_x >= self.imDim[1], outofbounds, indx_x)
indx1D = np.clip(indx_x+self.imDim[1]*n, 0, outofbounds)
indx_x = np.where(indx_x < 0, outofbounds, indx_x).astype(np.int64)
indx_x = np.where(indx_x >= self.imDim[1], outofbounds, indx_x).astype(np.int64)
indx1D = np.clip(indx_x+self.imDim[1]*n, 0, outofbounds).astype(np.int64)
# for j in range(self.nRho):
# indx_good = indx1D[j,:].flatten()
# whgood = np.nonzero(indx_good < outofbounds)[0]
Expand All @@ -151,10 +151,10 @@ def radon_plan_setup(self, image=None, imageDim=None, nTheta=None, nRho=None, rh
# indx1D[j, 0:whmask.size] = newindex[whmask]

self.indexPlan[:, i, 0:self.imDim[0]] = indx1D
tempindx = self.indexPlan.flatten()
mask = np.concatenate( (self.mask.flatten(), np.array([0,0])))
tempindx = self.indexPlan.flatten().astype(np.int64)
mask = np.concatenate( (self.mask.flatten().astype(np.int64), np.array([0,0], dtype=np.int64)))
tempindx = np.where(mask[tempindx] > 0, tempindx, outofbounds)
maskindex = np.concatenate((self.maskindex.flatten(), np.array([-1,-1])))
maskindex = np.concatenate((self.maskindex.flatten(), np.array([-1,-1]))).astype(np.int64)
tempindx = np.where(maskindex[tempindx] >= 0, maskindex[tempindx], outofbounds)
self.indexPlan = tempindx.reshape([self.nRho,self.nTheta,self.imDim.max()])
self.indexPlan.sort(axis = -1)
Expand Down Expand Up @@ -331,7 +331,7 @@ def radon2pole(self,bandData,PC=None,vendor='EDAX'):
stheta = np.sin(theta)
ctheta = np.cos(theta)

pctemp = np.asfarray(PC).copy()
pctemp = np.asarray(PC, dtype=np.float32).copy()
shapet = pctemp.shape
if ven != 'EMSOFT':
if len(shapet) < 2:
Expand Down
4 changes: 2 additions & 2 deletions pyebsdindex/tripletvote.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def build_trip_lib(self):
#print(indx0FID)
#This completely over-provisions the arrays; this is essentially
#N Choose K with N = number of angles and K = 3
nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))/np.compat.long(math.factorial(3))
nlib = npoles*np.prod(np.arange(3, dtype=np.int64)+(nangs-2+1))//np.int64(math.factorial(3))
nlib = nlib.astype(int)

libANG = np.zeros((nlib, 3))
Expand Down Expand Up @@ -792,7 +792,7 @@ def _refine_orientation(self, bandnorms, whGood, polematch):
tic = timer()
poles = self.tripLib.completelib['polesCart']
nGood = whGood.size
n2Fit = np.int64(np.product(np.arange(2)+(nGood-2+1))/np.int64(2))
n2Fit = np.int64(np.prod(np.arange(2)+(nGood-2+1), dtype=int)//np.int64(2))
whGood = np.asarray(whGood,dtype=np.int64)
#AB, ABgood = self.orientation_refine_loops_am(nGood,whGood,poles,bandnorms,polematch,n2Fit)
# tic = timer()
Expand Down

0 comments on commit d39e7eb

Please sign in to comment.